def sortandcleansekeywords(filename, filename2):
    # Reads the 0/1 industry matrix in filename2, expands each row into the
    # matching business-type labels, merges duplicate company rows and cleans
    # up the label spellings. Returns the name of the cleaned csv file.
    import csv
    import link
    import remove

    btypes = []
    btypes2 = []
    headers = []
    keys = []
    companies = []
    rows = []
    counter = 0
    counter2 = 0
    filename3 = "industry_type_sorted.csv"
    filename4 = "industry_type_sorted_&_cleaned.csv"

    # open csv files, get url into array
    geo = open(filename2, 'rU')
    csv_geo = csv.reader(geo)
    layers = csv_geo.next()
    for k in range(len(layers)):
        header = "Business Type" + str(k)
        headers.append(header)
    headers.insert(0, "Company Name")

    # part one
    # create tags csv file and write the header row
    f = open(filename, 'wb')
    csv_f = csv.writer(f)
    csv_f.writerow(headers)
    f.close()

    # turn each company's 0/1 indicator row into a row of business-type labels
    for row in csv_geo:
        tags = row
        for i, col in enumerate(row):
            if col.isdigit():
                btype = layers[i]
            else:
                continue
            btypes.append(btype)
        btypes.insert(0, row[0])
        if len(tags) == len(btypes):
            for j in range(len(tags)):
                if tags[j] == "1":
                    btype = btypes[j]
                else:
                    continue
                btypes2.append(btype)
        else:
            btype = "N/A"
            btypes2.append(btype)
        btypes2.insert(0, row[0])
        link.writelink(btypes2, filename)
        btypes = []
        btypes2 = []
    geo.close()

    f = open(filename3, 'wb')
    csv_f = csv.writer(f)
    csv_f.writerow(headers)
    f.close()

    # part two
    # edit csv: merge consecutive rows that share a company name
    geo2 = open(filename, 'rU')
    csv_geo2 = csv.reader(geo2)
    csv_geo2.next()
    for row in csv_geo2:
        if counter > 0:
            counter = counter + 1
            temp2 = row
            if temp2[0] == temp[0]:
                # same company as the previous row: pull its missing labels in
                counter2 = counter2 + 1
                if len(temp) == len(temp2):
                    for l in range(len(temp)):
                        if temp[l] in temp2:
                            continue
                        else:
                            temp2.append(temp[l])
                if counter2 == 0:
                    link.writelink(temp2, filename3)
                counter2 = 0
            elif temp2[0] != temp[0]:
                # new company: write out the previous (merged) row
                link.writelink(temp, filename3)
            temp = temp2
        else:
            counter = counter + 1
            temp = row
    geo2.close()

    tcounter = counter
    counter = 0

    # second editing: the loop above only writes a group when the company name
    # changes, so look at the last two rows separately
    geo2 = open(filename, 'rU')
    csv_geo2 = csv.reader(geo2)
    csv_geo2.next()
    for row in csv_geo2:
        counter = counter + 1
        if counter == (tcounter - 1):
            temp2 = row
        elif counter == tcounter:
            temp = row
            if temp2[0] == temp[0]:
                counter2 = counter2 + 1
                if len(temp) == len(temp2):
                    for l in range(len(temp)):
                        if temp[l] in temp2:
                            continue
                        else:
                            temp2.append(temp[l])
                if counter2 == 0:
                    link.writelink(temp2, filename3)
                counter2 = 0
            elif temp2[0] != temp[0]:
                link.writelink(temp, filename3)
    geo2.close()

    # part three
    # these literals must match the labels exactly as they appear in the input
    # header row, so their spelling is kept as-is
    crm = "customer relationship management"
    erp = "entreprise resource planning"
    ba = "business analytics"

    f = open(filename4, 'wb')
    csv_f = csv.writer(f)
    csv_f.writerow(headers)
    f.close()

    # third editing: shorten the long labels and drop duplicate entries
    geo2 = open(filename3, 'rU')
    csv_geo2 = csv.reader(geo2)
    csv_geo2.next()
    for row in csv_geo2:
        temp = row
        for m in range(len(temp)):
            if temp[m] == crm:
                temp[m] = "crm"
            elif temp[m] == erp:
                temp[m] = "erp"
            elif temp[m] == ba:
                temp[m] = "analytics"
        temp = remove.removeduplicates(temp)
        link.writelink(temp, filename4)
    geo2.close()
    return filename4
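# The link and remove modules imported above are not included in this file.
# Below is a minimal sketch of what they might look like, assuming
# link.writelink(row, filename) appends one row to a csv file and
# remove.removeduplicates(row) drops repeated values while keeping order.
# Both behaviours are inferred from the calls above, not confirmed; the
# _sketch_ names are illustrative only and are not used elsewhere.

def _sketch_writelink(row, filename):
    # append a single row to an existing csv file
    # (assumed behaviour of link.writelink)
    import csv
    f = open(filename, 'ab')
    csv.writer(f).writerow(row)
    f.close()

def _sketch_removeduplicates(row):
    # keep the first occurrence of each value, preserving order
    # (assumed behaviour of remove.removeduplicates)
    seen = []
    for value in row:
        if value not in seen:
            seen.append(value)
    return seen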
def keywords(filename, filename2, newmdata):
    # Builds tags.csv: for every company url in newmdata, checks each keyword
    # from filename2 against the page and writes a row of keyword hits.
    import csv
    import link

    companies = []
    counts = []
    counter = 0
    layers = []
    clayer = []
    urls2 = []
    filename3 = "tags.csv"

    # open csv files, get company into array
    geo = open(filename, 'rU')
    csv_geo = csv.reader(geo)
    csv_geo.next()
    for row in csv_geo:
        company = row[0]
        companies.append(company)
    geo.close()

    # open a csv file with keywords
    geo2 = open(filename2, 'rU')
    csv_geo2 = csv.reader(geo2)
    csv_geo2.next()
    for row in csv_geo2:
        layer = row[0]
        layers.append(layer)
    geo2.close()

    headers = layers
    headers.insert(0, "Company Name")
    #print headers

    # create tags csv file and write the header row
    f = open(filename3, 'wb')
    csv_f = csv.writer(f)
    csv_f.writerow(headers)
    f.close()

    # open the csv file with company urls and search every keyword on each url
    geo3 = open(newmdata, 'rU')
    csv_geo3 = csv.reader(geo3)
    csv_geo3.next()
    for row in csv_geo3:
        url = row
        if len(url) > 1:
            for j in range(len(url)):
                if j == 0:
                    # first column is the company name, not a url
                    continue
                for k in range(len(layers)):
                    if counter < 1:
                        # first url: search every keyword and remember the results
                        count = link.searchlink(url[j], layers[k])
                        clayer2 = count
                        clayer.append(clayer2)
                    elif counter > 0:
                        # later urls: reuse keywords already found, search the rest
                        if clayer[k] == 1:
                            count = 1
                        elif clayer[k] == 0:
                            count = link.searchlink(url[j], layers[k])
                    counts.append(count)
                counts[0] = url[0]
                link.writelink(counts, filename3)
                counts = []
                counter = counter + 1
        else:
            # no url for this company
            for l in range(len(layers)):
                count = "Website not available"
                counts.append(count)
            counts[0] = url[0]
            link.writelink(counts, filename3)
            counts = []
        counter = 0
    geo3.close()
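# link.searchlink(url, keyword) is also not defined in this file. From the way
# its result is compared with 1 and 0 above, it appears to fetch the page at
# url and report whether the keyword occurs in it. A rough sketch under that
# assumption (the _sketch_ name is illustrative only):

def _sketch_searchlink(url, keyword):
    # download the page and do a case-insensitive substring check;
    # unreachable pages count as "keyword not found"
    import urllib2
    try:
        page = urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return 0
    if keyword.lower() in page.lower():
        return 1
    return 0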
def reversedata(filename):
    # Merges consecutive rows of the tag file that share a company name and
    # writes the result to tags_cleaned.csv.
    import csv
    import link

    counter = 0
    counter2 = 0
    filename2 = "tags_cleaned.csv"

    # open csv files, get url into array
    geo = open(filename, 'rU')
    csv_geo = csv.reader(geo)
    headers = csv_geo.next()

    f = open(filename2, 'wb')
    csv_f = csv.writer(f)
    csv_f.writerow(headers)
    f.close()

    # edit csv: merge consecutive rows that share a company name
    geo2 = open(filename, 'rU')
    csv_geo2 = csv.reader(geo2)
    csv_geo2.next()
    for row in csv_geo2:
        if counter > 0:
            counter = counter + 1
            temp2 = row
            if temp2[0] == temp[0]:
                # same company as the previous row: pull its missing values in
                counter2 = counter2 + 1
                if len(temp) == len(temp2):
                    for l in range(len(temp)):
                        if temp[l] in temp2:
                            continue
                        else:
                            temp2.append(temp[l])
                if counter2 == 0:
                    link.writelink(temp2, filename2)
                counter2 = 0
            elif temp2[0] != temp[0]:
                # new company: write out the previous (merged) row
                link.writelink(temp, filename2)
            temp = temp2
        else:
            counter = counter + 1
            temp = row
    geo2.close()

    tcounter = counter
    counter = 0

    # second editing: the loop above only writes a group when the company name
    # changes, so look at the last two rows separately
    geo2 = open(filename, 'rU')
    csv_geo2 = csv.reader(geo2)
    csv_geo2.next()
    for row in csv_geo2:
        counter = counter + 1
        if counter == (tcounter - 1):
            temp2 = row
        elif counter == tcounter:
            temp = row
            if temp2[0] == temp[0]:
                counter2 = counter2 + 1
                if len(temp) == len(temp2):
                    for l in range(len(temp)):
                        if temp[l] in temp2:
                            continue
                        else:
                            temp2.append(temp[l])
                if counter2 == 0:
                    link.writelink(temp2, filename2)
                counter2 = 0
            elif temp2[0] != temp[0]:
                link.writelink(temp, filename2)
    geo2.close()
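# How the three functions appear to fit together, based on the file names they
# read and write. The input file names below (companies.csv, keyword_list.csv,
# metadata.csv, industry_type.csv) are placeholders, not names taken from this
# project:

if __name__ == "__main__":
    # build the raw company/keyword hit matrix in tags.csv
    keywords("companies.csv", "keyword_list.csv", "metadata.csv")
    # merge duplicate company rows into tags_cleaned.csv
    reversedata("tags.csv")
    # expand the hit matrix into sorted, cleaned business-type labels
    result = sortandcleansekeywords("industry_type.csv", "tags_cleaned.csv")
    print(result)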