Example #1
0
def sortandcleansekeywords(filename, filename2):
    """Convert a flag matrix into per-company business types, then merge and clean.

    Three CSV files are produced, each starting with the same header row
    (``"Company Name"`` followed by ``"Business Type0"``, ``"Business Type1"``,
    ... -- one per column of the input header):

    1. ``filename``: for every data row of ``filename2``, the first column
       followed by the header labels picked for the cells equal to ``"1"``,
       or ``"N/A"`` when the row does not line up with the header.
    2. ``"industry_type_sorted.csv"``: the rows of ``filename`` with
       consecutive rows sharing the same first column collapsed into one.
    3. ``"industry_type_sorted_&_cleaned.csv"``: the rows of file 2 with three
       long phrases abbreviated and then passed through
       ``remove.removeduplicates``.

    Data rows are emitted through ``link.writelink(row, path)``; that helper
    is defined outside this file and is presumably an append-one-row writer
    -- confirm against the ``link`` module.

    Args:
        filename: Path of the intermediate CSV to create (truncated first).
        filename2: Path of the input CSV; its first row is the header.

    Returns:
        The path of the final file, ``"industry_type_sorted_&_cleaned.csv"``.

    Raises:
        StopIteration: If an input file has no header row.
    """
    import csv
    import link
    import remove

    filename3 = "industry_type_sorted.csv"
    filename4 = "industry_type_sorted_&_cleaned.csv"

    def write_header(path, header_row):
        # 'wb' truncates any earlier output; the file is closed again before
        # link.writelink adds the data rows.
        with open(path, 'wb') as out:
            csv.writer(out).writerow(header_row)

    # part one: translate the flag columns into business-type labels
    with open(filename2, 'rU') as geo:
        csv_geo = csv.reader(geo)
        layers = next(csv_geo)
        headers = ["Company Name"] + [
            "Business Type" + str(k) for k in range(len(layers))
        ]
        write_header(filename, headers)

        for row in csv_geo:
            if not row:
                # Blank line in the input: there is no company name to report.
                continue
            # Company name followed by the header label of every all-digit
            # cell.  This list only has the same length as the row when
            # exactly one cell of the row is not all digits.
            candidates = [row[0]] + [
                layers[i] for i, col in enumerate(row) if col.isdigit()
            ]
            if len(row) == len(candidates):
                selected = [
                    candidates[j] for j, flag in enumerate(row) if flag == "1"
                ]
            else:
                selected = ["N/A"]
            link.writelink([row[0]] + selected, filename)

    # part two: collapse consecutive rows that share a company name
    write_header(filename3, headers)
    pending = None
    with open(filename, 'rU') as geo2:
        csv_geo2 = csv.reader(geo2)
        next(csv_geo2)
        for row in csv_geo2:
            if not row:
                continue
            if pending is not None:
                if row[0] == pending[0]:
                    # NOTE(review): values are only carried forward when both
                    # rows have the same length, so a group of three or more
                    # rows can lose earlier values.  Kept as-is; confirm the
                    # intended merge rule.
                    if len(pending) == len(row):
                        for value in pending:
                            if value not in row:
                                row.append(value)
                else:
                    link.writelink(pending, filename3)
            pending = row
    # Flush the final group.  Doing it here (instead of in a separate pass
    # over the last two rows) also covers a trailing group of several rows
    # and an input with a single data row.
    if pending is not None:
        link.writelink(pending, filename3)

    # part three: abbreviate known phrases and drop duplicates
    # NOTE(review): "entreprise" is matched exactly as spelled here; confirm
    # whether the data uses this spelling before correcting it.
    abbreviations = {
        "customer relationship management": "crm",
        "entreprise resource planning": "erp",
        "business analytics": "analytics",
    }
    write_header(filename4, headers)
    with open(filename3, 'rU') as geo3:
        csv_geo3 = csv.reader(geo3)
        next(csv_geo3)
        for row in csv_geo3:
            cleaned = [abbreviations.get(cell, cell) for cell in row]
            link.writelink(remove.removeduplicates(cleaned), filename4)

    return filename4
Example #2
0
def keywords(filename, filename2, newmdata):
    """Write ``tags.csv`` with one row of keyword search results per company URL.

    The header row is ``"Company Name"`` followed by the keywords read from
    the first column of ``filename2``.  For every data row of ``newmdata``
    (first cell: company name, remaining cells: URLs) one output row is
    written per URL, holding the company name and, for each keyword, the
    value returned by ``link.searchlink(url, keyword)``.  A row with no URL
    cells produces a single output row filled with
    ``"Website not available"``.

    Once the first URL of a company returns ``1`` for a keyword, the later
    URLs of that company report ``1`` for it without searching again.  This
    cache is kept per company.

    ``link.searchlink`` and ``link.writelink`` are defined outside this file;
    ``searchlink`` is compared against ``1`` here, so it presumably returns a
    0/1 flag -- confirm against the ``link`` module.

    Args:
        filename: Not read.  Kept in the signature so existing callers keep
            working; nothing in this function depends on its contents.
        filename2: Path of the keyword CSV (header row skipped).
        newmdata: Path of the company/URL CSV (header row skipped).

    Raises:
        StopIteration: If ``filename2`` or ``newmdata`` has no header row.
    """
    import csv
    import link

    filename3 = "tags.csv"

    # read the keywords (first column, header skipped, blank lines ignored)
    with open(filename2, 'rU') as geo2:
        csv_geo2 = csv.reader(geo2)
        next(csv_geo2)
        layers = [row[0] for row in csv_geo2 if row]

    # Build the header as a separate list so the keyword list itself stays
    # free of the "Company Name" label and is never searched for it.
    headers = ["Company Name"] + layers

    # create tags csv file
    with open(filename3, 'wb') as f:
        csv.writer(f).writerow(headers)

    with open(newmdata, 'rU') as geo3:
        csv_geo3 = csv.reader(geo3)
        next(csv_geo3)
        for row in csv_geo3:
            if not row:
                # Blank line: no company name to report.
                continue
            company = row[0]
            urls = row[1:]

            if not urls:
                placeholder = ["Website not available"] * len(layers)
                link.writelink([company] + placeholder, filename3)
                continue

            # Results of this company's first URL, indexed like `layers`.
            # Rebuilt for every company so hits never leak between companies.
            first_hits = []
            for position, url in enumerate(urls):
                counts = [company]
                for k, layer in enumerate(layers):
                    if position == 0:
                        count = link.searchlink(url, layer)
                        first_hits.append(count)
                    elif first_hits[k] == 1:
                        count = 1
                    else:
                        # Any cached value other than 1 means "not yet
                        # found", so this URL is searched as well.
                        count = link.searchlink(url, layer)
                    counts.append(count)
                link.writelink(counts, filename3)
Example #3
0
def reversedata(filename):
    """Collapse consecutive same-company rows of ``filename`` into ``tags_cleaned.csv``.

    The header row of ``filename`` is copied to ``"tags_cleaned.csv"``
    (truncating any earlier file).  The data rows are then scanned in order:
    while the first column stays the same, the values of the previous row
    that are missing from the current row are appended to the current row;
    when the first column changes, the carried row is written out.  The last
    carried row is written after the scan.

    Data rows are emitted through ``link.writelink(row, path)``; that helper
    is defined outside this file and is presumably an append-one-row writer
    -- confirm against the ``link`` module.

    Args:
        filename: Path of the CSV to read; its first row is the header.

    Raises:
        StopIteration: If ``filename`` has no header row.
    """
    import csv
    import link

    filename2 = "tags_cleaned.csv"

    pending = None
    with open(filename, 'rU') as geo:
        csv_geo = csv.reader(geo)
        headers = next(csv_geo)

        # Write the header and close the output again before any data row is
        # handed to link.writelink.
        with open(filename2, 'wb') as f:
            csv.writer(f).writerow(headers)

        for row in csv_geo:
            if not row:
                # Blank line: no company name to compare.
                continue
            if pending is not None:
                if row[0] == pending[0]:
                    # NOTE(review): values are only carried forward when both
                    # rows have the same length, so a group of three or more
                    # rows can lose earlier values.  Kept as-is; confirm the
                    # intended merge rule.
                    if len(pending) == len(row):
                        for value in pending:
                            if value not in row:
                                row.append(value)
                else:
                    link.writelink(pending, filename2)
            pending = row

    # Flush the final group.  Doing it here (instead of in a separate pass
    # over the last two rows) also covers a trailing group of several rows
    # and an input with a single data row.
    if pending is not None:
        link.writelink(pending, filename2)