def makeSubjectCsv(subjects_sorted):
    """Write ``subject_year.csv``: one row per year, one column per subject.

    A row is created for every year from 1800 through 1911 with each subject
    column initialised to 0. Values found in ``entry["yearDate"]`` then
    overwrite the cell for the matching year and subject.

    Args:
        subjects_sorted: iterable of dicts, each providing a ``"subject"``
            key and a ``"yearDate"`` dict that maps a year (anything
            ``int()`` accepts, or ``''``) to the value to store.

    Returns:
        None. The table is passed to ``gd.write_data_dicts``.
    """
    subjects = ["Travel", "Education", "Love", "Health", "Family",
                "Religion", "Political", "Lifestyle", "Unidentified"]

    final = []
    yearLookup = {}  # year -> index of that year's row in ``final``
    for position, year in enumerate(range(1800, 1912)):
        row = {"year": year}
        for subject in subjects:
            row[subject] = 0
        final.append(row)
        yearLookup[year] = position

    for entry in subjects_sorted:
        mySub = entry["subject"]
        for year, value in entry["yearDate"].items():
            if year == '':
                continue
            try:
                index = yearLookup[int(year)]
            except (KeyError, ValueError, TypeError):
                # Non-numeric year, or a year outside 1800-1911: report it
                # and move on. Only these expected failures are caught so
                # that unrelated errors are no longer hidden.
                print(year)
                continue
            final[index][mySub] = value

    # The header row is derived from the subject list so the two cannot
    # drift apart.
    headers = ["year"] + subjects
    filename = "subject_year.csv"
    gd.write_data_dicts(filename, headers, final)
def makeTrainingData(wordList):
    """Build per-letter word-count rows and write the training CSV files.

    Every row returned by ``gd.get_data_list_of_dicts()`` becomes a dict of
    lower-cased word counts (restricted to the words in ``wordList``) plus a
    ``"class"`` entry holding the value of ``"Age of Author"``. The class is
    ``"unkown"`` when that value is not between 4 and 8 characters long.
    All rows are written to ``fullData.csv``. ``trainingData.csv`` receives
    the first row seen for each class plus a random selection of the others.

    Args:
        wordList: list of words to count. The list is not modified; the
            ``"class"`` column is appended to a copy used as the CSV header.

    Returns:
        None. Output is written through ``gd.write_data_dicts``.
    """
    list_of_dicts = gd.get_data_list_of_dicts()
    probability = 3
    full = []
    partial = []
    # Classes that already have at least one row in ``partial``. A set is
    # used so a class that was not anticipated (e.g. "90-100") cannot raise
    # KeyError.
    seen_classes = set()
    # Build the lookup once instead of scanning the list for every word.
    wanted = set(wordList)

    for entry in list_of_dicts:
        decade = "unkown"
        age = entry["Age of Author"]
        if 3 < len(age) < 9:
            decade = age

        text = entry["Transcript"]
        text = text.replace("<br>", " ")
        text = text.replace("COMMA", " ")
        text = re.sub(r'\W', ' ', text)

        tempDict = {}
        for word in text.split(" "):
            if word == "":
                continue
            # Lower-case before the membership test so every stored key is
            # guaranteed to be one of the header words.
            word = word.lower()
            if word in wanted:
                tempDict[word] = tempDict.get(word, 0) + 1
        # Assigned last so a counted word "class" can never replace the label.
        tempDict["class"] = decade
        full.append(tempDict)

        temp = randint(0, 10)
        if temp > probability or decade not in seen_classes:
            seen_classes.add(decade)
            partial.append(tempDict)

    filename1 = "trainingData.csv"
    filename2 = "fullData.csv"
    # Copy instead of appending to the caller's list.
    headers = list(wordList)
    if "class" not in headers:
        headers.append("class")
    gd.write_data_dicts(filename1, headers, partial)
    gd.write_data_dicts(filename2, headers, full)
def makeTrainingData(wordList):
    """Build per-letter word-count rows and write the training CSV files.

    Every row returned by ``gd.get_data_list_of_dicts()`` becomes a dict of
    lower-cased word counts (restricted to the words in ``wordList``) plus a
    ``"class"`` entry holding the value of ``"Age of Author"``. The class is
    ``"unkown"`` when that value is not between 4 and 8 characters long.
    All rows are written to ``fullData.csv``. ``trainingData.csv`` receives
    the first row seen for each class plus a random selection of the others.

    Args:
        wordList: list of words to count. The list is not modified; the
            ``"class"`` column is appended to a copy used as the CSV header.

    Returns:
        None. Output is written through ``gd.write_data_dicts``.
    """
    list_of_dicts = gd.get_data_list_of_dicts()
    probability = 3
    full = []
    partial = []
    # Classes that already have at least one row in ``partial``. A set is
    # used so a class that was not anticipated (e.g. "90-100") cannot raise
    # KeyError.
    seen_classes = set()
    # Build the lookup once instead of scanning the list for every word.
    wanted = set(wordList)

    for entry in list_of_dicts:
        decade = "unkown"
        age = entry["Age of Author"]
        if 3 < len(age) < 9:
            decade = age

        text = entry["Transcript"]
        text = text.replace("<br>", " ")
        text = text.replace("COMMA", " ")
        text = re.sub(r'\W', ' ', text)

        tempDict = {}
        for word in text.split(" "):
            if word == "":
                continue
            # Lower-case before the membership test so every stored key is
            # guaranteed to be one of the header words.
            word = word.lower()
            if word in wanted:
                tempDict[word] = tempDict.get(word, 0) + 1
        # Assigned last so a counted word "class" can never replace the label.
        tempDict["class"] = decade
        full.append(tempDict)

        temp = randint(0, 10)
        if temp > probability or decade not in seen_classes:
            seen_classes.add(decade)
            partial.append(tempDict)

    filename1 = "trainingData.csv"
    filename2 = "fullData.csv"
    # Copy instead of appending to the caller's list.
    headers = list(wordList)
    if "class" not in headers:
        headers.append("class")
    gd.write_data_dicts(filename1, headers, partial)
    gd.write_data_dicts(filename2, headers, full)
import get_data as gd

# Copy the data set to "noError.csv", keeping only the columns whose header,
# once split on commas, contains neither a " Error" piece nor an empty piece.
headers_clean = gd.get_headers()
list_of_dicts = gd.get_data_list_of_dicts()

new_headers = []
for raw_header in headers_clean:
    pieces = raw_header.split(",")
    if " Error" in pieces or "" in pieces:
        continue
    new_headers.append(",".join(pieces))

# Project every row onto the surviving columns.
final = [
    dict((column, entry[column]) for column in new_headers)
    for entry in list_of_dicts
]

filename = "noError.csv"
headers = new_headers
gd.write_data_dicts(filename, headers, final)
# Expand each letter into one row per recipient and write the wanted columns
# to "Recipient_and_Creator_cleaned2.csv".
# `list_of_dicts` and `temp` are created earlier in the script, outside this
# excerpt; `temp` collects the expanded rows.
for entry in list_of_dicts:
    # Letters without a creator or a recipient, or whose creator is
    # "Unknown", are dropped.
    if entry["Creator"] == "" or entry["Recipient"] == "" or entry["Creator"] == "Unknown":
        continue
    for raw_name in entry["Recipient"].split(";"):
        if raw_name == "":
            continue
        parts = raw_name.split("COMMA")
        if len(parts) == 2:
            # "<a>COMMA<b>" becomes "<b> <a>"
            recipient = parts[1] + " " + parts[0]
        elif len(parts) == 3:
            # "<a>COMMA<b>COMMA<c>" becomes "<b> <a>: <c>"
            recipient = parts[1] + " " + parts[0] + ": " + parts[2]
        else:
            # Any other shape is kept as the original text. Previously the
            # split list itself ended up in the "Recipient" field.
            recipient = raw_name
        row = entry.copy()
        row["Recipient"] = recipient
        temp.append(row)

headers_wanted = ["Title", "Date", "Creator", "Identifier", "Recipient",
                  "Gender of Author", "Age of Author", "Identified People",
                  "Unidentified People", "Subject", "Geographic Subjects",
                  "Place Of Origin", "Destination", "Notes", "Language",
                  "Transcript", "Reference URL", "CONTENTdm number",
                  "CONTENTdm file name"]

# Keep only the wanted columns; keys missing from a row are simply absent.
final = []
for entry in temp:
    final.append(dict(
        (key, entry[key]) for key in entry if key in headers_wanted
    ))

headers = headers_wanted
file_name = "Recipient_and_Creator_cleaned2.csv"
gd.write_data_dicts(file_name, headers, final)
# NOTE(review): this excerpt starts mid-script and ends with triple-quote
# delimiters whose partners are not all visible; the statements below may sit
# inside a string literal opened earlier -- confirm against the full file.
# Last row of a per-subject table: size of lifestyle_bin, then element 7 of
# min_age and of max_age.
print "lifestyle:\t" + str(len(lifestyle_bin))+"\t\t"+str(min_age[7])+"\t\t"+str(max_age[7])
print "---------------------------------------------"
print "Total:\t\t" + str(total)
print "---------------------------------------------"
print "unidentified: " + str(len(unidentified))
print "max_age: " + str(max_age)
print "min_age: " + str(min_age)
print
print "Distribution:"
print "\t\t0-10\t10-20\t20-30\t30-40\t40-50\t50-60\t60-70\t70-80\t80-90\t90-100\tunkown"
# Turn every entry of age_count into one tab-separated string.
age_strings = []
for entry in age_count:
    temp = ""
    for item in entry:
        temp += str(item) + "\t"
    age_strings.append(temp)
# Entries 0-7 of age_count are printed under these fixed labels, in this order.
print "travel:\t\t"+age_strings[0]
print "education:\t"+age_strings[1]
print "love:\t\t"+age_strings[2]
print "health:\t\t" +age_strings[3]
print "family:\t\t" +age_strings[4]
print "religion:\t" +age_strings[5]
print "political:\t"+age_strings[6]
print "lifestyle:\t"+age_strings[7]
"""
"""
filename = "organized.csv"
headers = gd.get_headers()
gd.write_data_dicts(filename, headers, has_subject)
"""
# NOTE(review): the first part of this excerpt is the end of a function whose
# header and enclosing loop are not visible (presumably get_page_data, which
# is called below). The nesting depth shown is a best guess -- confirm
# against the full file.
        # Does the first element of raw_list[item] parse as an integer?
        # Any failure is recorded as -1.
        is_int = 0
        try:
            is_int = int(raw_list[item][0])
        except:
            is_int = -1
        if is_int >= 0:
            # Store raw_list[item] as the "date" and advance both cursors.
            final_dicts[counter // 3]["date"] = raw_list[item]
            counter += 1
            item += 1
        else:
            # Store "unknown" instead; `item` is not advanced in this branch.
            final_dicts[counter // 3]["date"] = "unknown"
            counter += 1
    return final_dicts

# Collect the rows of pages 1 through 16 into one list.
list_of_dicts = []
for x in range(1, 17):
    print x
    temp = get_page_data(x)
    # A page whose result equals everything accumulated so far is reported
    # with "what" and not added.
    if temp == list_of_dicts:
        print "what"
    else:
        list_of_dicts = list_of_dicts + temp
filename = "letters_list.csv"
headers = ["date", "recipient", "creator"]
gd.write_data_dicts(filename, headers, list_of_dicts)
# NOTE(review): this excerpt starts mid-script and ends with triple-quote
# delimiters whose partners are not all visible; the statements below may sit
# inside a string literal opened earlier -- confirm against the full file.
# Totals and the min_age / max_age values.
print "Total:\t\t" + str(total)
print "---------------------------------------------"
print "unidentified: " + str(len(unidentified))
print "max_age: " + str(max_age)
print "min_age: " + str(min_age)
print
print "Distribution:"
print "\t\t0-10\t10-20\t20-30\t30-40\t40-50\t50-60\t60-70\t70-80\t80-90\t90-100\tunkown"
# Turn every entry of age_count into one tab-separated string.
age_strings = []
for entry in age_count:
    temp = ""
    for item in entry:
        temp += str(item) + "\t"
    age_strings.append(temp)
# Entries 0-7 of age_count are printed under these fixed labels, in this order.
print "travel:\t\t"+age_strings[0]
print "education:\t"+age_strings[1]
print "love:\t\t"+age_strings[2]
print "health:\t\t" +age_strings[3]
print "family:\t\t" +age_strings[4]
print "religion:\t" +age_strings[5]
print "political:\t"+age_strings[6]
print "lifestyle:\t"+age_strings[7]
"""
"""
filename = "organized.csv"
headers = gd.get_headers()
gd.write_data_dicts(filename, headers, has_subject)
"""
# Rename coded columns to readable names and write "partialNameClean2.csv".
# `headers_income`, `codes`, `full`, `full_headers`, `final_headers` and
# `full_clean` are created earlier in the script, outside this excerpt.

# Pass 1: every "<code> - <name>" header whose name does not contain "Error"
# is recorded in `codes`.
for h2 in headers_income:
    parts = h2.split(" - ")
    if len(parts) < 2:
        # No " - " separator: report the header, as before, and move on.
        print(parts)
        continue
    if "Error" not in parts[1]:
        codes[parts[0]] = parts[1]

# Pass 2: rebuild each row under the readable names.
for entry in full:
    temp = {}
    for h1 in full_headers:
        if h1 in codes and h1 in entry:
            column = codes[h1]
        elif h1.endswith("e"):
            # Columns without a usable mapping whose name ends in "e" are
            # dropped. NOTE(review): the meaning of that suffix is not
            # visible here -- confirm.
            continue
        else:
            column = h1
        temp[column] = entry[h1]
        if column not in final_headers:
            final_headers.append(column)
    full_clean.append(temp)

filename = "partialNameClean2.csv"
headers = final_headers
gd.write_data_dicts(filename, headers, full_clean)
# NOTE(review): the first part of this excerpt is the end of a loop body
# whose header is not visible (`item`, `Poo`, `places` and `has_full` come
# from above). One level of nesting is assumed -- confirm against the full
# file.
    # Collapse the list of place pieces into a single string: the only piece,
    # or the first two joined by ", ".
    if len(Poo) == 1:
        Poo = Poo[0]
    else:
        Poo = Poo[0] + ", " + Poo[1]
    # Remove spaces from the destination, turn parentheses into the "COMMA"
    # separator, split on it and drop empty pieces.
    Dest = item["Destination"].replace(" ", "")
    Dest = Dest.replace("(", "COMMA")
    Dest = Dest.replace(")", "COMMA")
    Dest = Dest.split("COMMA")
    Dest2 = []
    for word in Dest:
        if not word == "":
            Dest2.append(word)
    Dest = Dest2[:]
    # An empty Dest reaches the else branch, where Dest[0] raises IndexError.
    if len(Dest) == 1:
        Dest = Dest[0]
    else:
        Dest = Dest[0] + ", " + Dest[1]
    # Only letters whose two places both occur in `places` are kept; the
    # places are stored as their indexes in that list and the letter as JSON.
    if Poo in places and Dest in places:
        my_json = json.dumps(item)
        has_full.append({
            "Poo": places.index(Poo),
            "Dest": places.index(Dest),
            "Letter": my_json
        })
filename = "letterTravels.csv"
headers = ["Poo", "Dest", "Letter"]
gd.write_data_dicts(filename, headers, has_full)
# Rename coded columns to readable names and write "partialNameClean2.csv".
# `headers_income`, `codes`, `full`, `full_headers`, `final_headers` and
# `full_clean` are created earlier in the script, outside this excerpt.

# Pass 1: every "<code> - <name>" header whose name does not contain "Error"
# is recorded in `codes`.
for h2 in headers_income:
    parts = h2.split(" - ")
    if len(parts) < 2:
        # No " - " separator: report the header, as before, and move on.
        print(parts)
        continue
    if "Error" not in parts[1]:
        codes[parts[0]] = parts[1]

# Pass 2: rebuild each row under the readable names.
for entry in full:
    temp = {}
    for h1 in full_headers:
        if h1 in codes and h1 in entry:
            column = codes[h1]
        elif h1.endswith("e"):
            # Columns without a usable mapping whose name ends in "e" are
            # dropped. NOTE(review): the meaning of that suffix is not
            # visible here -- confirm.
            continue
        else:
            column = h1
        temp[column] = entry[h1]
        if column not in final_headers:
            final_headers.append(column)
    full_clean.append(temp)

filename = "partialNameClean2.csv"
headers = final_headers
gd.write_data_dicts(filename, headers, full_clean)
# NOTE(review): the first part of this excerpt is the end of a function whose
# header and enclosing loop are not visible (presumably get_page_data, which
# is called below). The nesting depth shown is a best guess -- confirm
# against the full file.
        # Does the first element of raw_list[item] parse as an integer?
        # Any failure is recorded as -1.
        is_int = 0
        try:
            is_int = int(raw_list[item][0])
        except:
            is_int = -1
        if is_int >= 0:
            # Store raw_list[item] as the "date" and advance both cursors.
            final_dicts[counter//3]["date"] = raw_list[item]
            counter += 1
            item += 1
        else:
            # Store "unknown" instead; `item` is not advanced in this branch.
            final_dicts[counter//3]["date"] = "unknown"
            counter +=1
    return final_dicts

# Collect the rows of pages 1 through 16 into one list.
list_of_dicts = []
for x in range(1,17):
    print x
    temp = get_page_data(x)
    # A page whose result equals everything accumulated so far is reported
    # with "what" and not added.
    if temp == list_of_dicts:
        print "what"
    else:
        list_of_dicts = list_of_dicts + temp
filename = "letters_list.csv"
headers = ["date","recipient","creator"]
gd.write_data_dicts(filename, headers, list_of_dicts)
# NOTE(review): the first part of this excerpt is the end of a loop body
# whose header is not visible (`item`, `Poo`, `Poo2`, `places` and `has_full`
# come from above). One level of nesting is assumed -- confirm against the
# full file.
    # Drop empty pieces from the place list.
    for word in Poo:
        if not word == "":
            Poo2.append(word)
    Poo = Poo2[:]
    # Collapse the list into a single string: the only piece, or the first
    # two joined by ", ". An empty list reaches the else branch, where
    # Poo[0] raises IndexError.
    if len(Poo) == 1:
        Poo = Poo[0]
    else:
        Poo = Poo[0] +", "+Poo[1]
    # Same treatment for the destination: remove spaces, turn parentheses
    # into the "COMMA" separator, split on it and drop empty pieces.
    Dest = item["Destination"].replace(" ","")
    Dest = Dest.replace("(","COMMA")
    Dest = Dest.replace(")","COMMA")
    Dest = Dest.split("COMMA")
    Dest2 = []
    for word in Dest:
        if not word == "":
            Dest2.append(word)
    Dest = Dest2[:]
    if len(Dest) == 1:
        Dest = Dest[0]
    else:
        Dest = Dest[0] +", "+Dest[1]
    # Only letters whose two places both occur in `places` are kept; the
    # places are stored as their indexes in that list and the letter as JSON.
    if Poo in places and Dest in places:
        my_json = json.dumps(item)
        has_full.append({"Poo":places.index(Poo),"Dest":places.index(Dest), "Letter":my_json})
filename="letterTravels.csv"
headers = ["Poo","Dest","Letter"]
gd.write_data_dicts(filename,headers,has_full)
"2ndBeach, Newport", "Mossgiel, R.I.", "Mossgiel" ] print len(not_right) not_right_clean = [] for entry in not_right: not_right_clean.append(entry.replace(" ?","")) right = [] counter = 0 left_out = [] for entry in list_of_dicts: if entry["Name"] in not_right_clean: counter += 1 left_out.append(entry["Name"]) else: right.append(entry) print counter for entry in not_right_clean: if entry in left_out: continue else: print entry filename = "partialCurrectLocations.csv" headers = gd.get_headers() gd.write_data_dicts(filename,headers,right)