def get_minimum_year():
    """
    Gets the minimum year in the address/constituents CSVs using only BeginDate
    Saves it in minyear.txt
    """
    basepath = os.environ['BASEPATH']

    response = open(basepath + 'address.csv')
    reader = csv.DictReader(response)
    years = [int(Converter.str_to_float(item['BeginDate'])) for item in reader if item['BeginDate'] != '']
    nozeros = [number for number in years if (number != 0 and number != -1)]
    minyeara = np.amin(nozeros)
    print "\n\nMinimum year in address is: " + str(minyeara)

    response = open(basepath + 'constituents.csv')
    reader = csv.DictReader(response)
    years = [int(Converter.str_to_float(item['BeginDate'])) for item in reader if item['BeginDate'] != '']
    nozeros = [number for number in years if (number != 0 and number != -1)]
    minyearb = np.amin(nozeros)
    print "\n\nMinimum year in constituents is: " + str(minyearb)

    minyear = min(minyeara, minyearb)
    output = basepath + 'minyear.txt'
    print "\nwrote " + output
    text_file = open(output, "w")
    text_file.write(str(minyear))
    text_file.close()
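# Usage sketch (assumes BASEPATH is set and get_minimum_year() has already run);
# downstream scripts can read the year back as a plain integer:
#
#   minyear = int(open(os.environ['BASEPATH'] + 'minyear.txt').read())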
def create_base_constituents():
    """
    Builds the raw skeleton of a constituent from constituents.csv
    """
    reader = Converter.process_csv(os.environ['BASEPATH'] + 'constituents.csv')
    constituents = {}
    for row in reader:
        row = Converter.remove_bom(row)
        if row['AlphaSort'] is None:
            print "No AlphaSort in: " + row['ConstituentID']
            row['AlphaSort'] = ''
        row['nameSort'] = row['AlphaSort'].split(" ")[0]
        row['id'] = row['ConstituentID']
        constituents[row['ConstituentID']] = row
    return constituents
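# Illustrative shape of the returned dictionary (values are hypothetical):
# each CSV row is keyed by its ConstituentID, with 'id' and 'nameSort' added, e.g.
#
#   {'123': {'ConstituentID': '123', 'AlphaSort': 'Abbott Berenice',
#            'nameSort': 'Abbott', 'id': '123', ...}}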
def get_join_data(filepath):
    """
    Builds an ID-to-label lookup used to denormalize term IDs in the CSV data
    """
    reader = Converter.process_csv(filepath)
    data = {}
    for row in reader:
        row = Converter.remove_bom(row)
        tmp = []
        for k in row:
            tmp.append(row[k])
        # the lookup CSV may list (label, id) or (id, label); key on the numeric id either way
        if tmp[1].isdigit():
            data[tmp[1]] = tmp[0]
        else:
            data[tmp[0]] = tmp[1]
    return data
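# Example (hypothetical lookup CSV): whichever of the two columns holds the numeric
# ID becomes the key, so rows of ('Birth', '5') or ('5', 'Birth') both yield
# {'5': 'Birth'}, and an ID can then be resolved to its label:
#
#   countries = get_join_data(os.environ['BASEPATH'] + 'countries.csv')
#   countries.get('123')  # -> the country name for ID '123', if present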
def build_action(row, index, doc_type):
    """
    Creates an ES action for indexing
    """
    cleaned = Converter.remove_bom(row)
    if "id" not in cleaned:
        print "No id in row:"
        print cleaned
    action = {
        "_index": index,
        "_type": doc_type,
        "_source": cleaned,
        "_id": cleaned["id"]
    }
    # addresses are child documents routed to their parent constituent
    if doc_type == 'address':
        action["_parent"] = cleaned['ConstituentID']
        action["_routing"] = cleaned['ConstituentID']
    return action
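# Usage sketch: actions built here feed straight into the bulk helper, as in
# process_constituents() below (the 'pic' index name and 'address' doc type are
# taken from that call):
#
#   helpers.bulk(es, [build_action(row, 'pic', 'address') for row in rows])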
def process_constituents(endpoint):
    """
    Consolidates all constituent data into a single dictionary.
    Pushes each dictionary as a document to Elastic.
    """
    start = timeit.default_timer()
    constituents = create_base_constituents()
    counter = 0
    tables = ["format", "biography", "address", "gender", "process", "role", "collection"]
    joins = ["formats", "biographies", ["addresstypes", "countries"], "genders", "processes", "roles", "collections"]
    location_pattern = re.compile(r"(\-?\d+(\.\d+)?)\s*,\s*(\-?\d+(\.\d+)?)")
    for table in tables:
        reader = Converter.process_csv(os.environ['BASEPATH'] + table + ".csv")
        # load the lookup table(s) used to denormalize IDs in this table
        if type(joins[counter]) is str:
            joindata = get_join_data(os.environ['BASEPATH'] + joins[counter] + ".csv")
        else:
            j0 = get_join_data(os.environ['BASEPATH'] + joins[counter][0] + ".csv")
            j1 = get_join_data(os.environ['BASEPATH'] + joins[counter][1] + ".csv")
            joindata = [j0, j1]
        for row in reader:
            row = Converter.remove_bom(row)
            if row['ConstituentID'] not in constituents:
                print "No constituent:" + row['ConstituentID']
                continue
            if table not in constituents[row['ConstituentID']]:
                constituents[row['ConstituentID']][table] = []
            cid = row['ConstituentID']
            if 'ConAddressID' not in row:
                del row['ConstituentID']
            else:
                row['id'] = row['ConAddressID']
            # add the value of the term id
            if 'TermID' in row:
                if row['TermID'] not in joindata:
                    print "No " + table + ":" + row['TermID']
                else:
                    row['Term'] = joindata[row['TermID']]
            if 'AddressTypeID' in row:
                if row['AddressTypeID'] not in joindata[0]:
                    print "No " + joins[counter][0] + ":" + row['AddressTypeID']
                    print joindata[0]
                else:
                    row['AddressType'] = joindata[0][row['AddressTypeID']]
                if row['CountryID'] not in joindata[1]:
                    print "No " + joins[counter][1] + ":" + row['CountryID']
                else:
                    row['Country'] = joindata[1][row['CountryID']]
            if 'Remarks' in row:
                row['Remarks'] = Converter.convert_whitespace(row['Remarks'])
                # a "lat,lon" remark becomes a geo point
                if location_pattern.match(row['Remarks']):
                    latlon = Converter.compress_address(row['Remarks'])
                    row['Remarks'] = ",".join(latlon)
                    row['Location'] = {
                        "lat": float(latlon[0]),
                        "lon": float(latlon[1])
                    }
            constituents[cid][table].append(row)
        counter = counter + 1
    end = timeit.default_timer()
    print "\n\nProcessed CSVs in " + str(end - start) + " seconds\n"
    print "\n\nPreparing indexing actions...\n"
    start = timeit.default_timer()
    # now on to elastic (index already created)
    actions = []
    addresslist = []
    for index, cid in enumerate(constituents):
        addresses = []
        if 'address' in constituents[cid]:
            # sort addresses
            addresses = Converter.sort_addresses(constituents[cid]['address'])
            constituents[cid]['addressTotal'] = len(constituents[cid]['address'])
            del constituents[cid]['address']
        # put the addresses
        addresslist = addresslist + addresses
        # put the constituent in there
        actions.append(create_constituent(constituents[cid]))
    end = timeit.default_timer()
    print "\n\nActions prepared in " + str((end - start) / 60) + " minutes\n"
    print "\n\nIndexing...\n"
    create_indices(endpoint)
    es = Elasticsearch([endpoint], timeout=360, max_retries=10, retry_on_timeout=True)
    # split the actions into batches of 10k
    print "  Constituents..."
    index = 0
    n = 10000
    splitactions = split_list(actions, n)
    for actionchunk in splitactions:
        print "    actions " + str(index * n) + " to " + str((index + 1) * n)
        index = index + 1
        helpers.bulk(es, actionchunk)
    print "  Addresses..."
    index = 0
    splitaddresses = split_list(addresslist, n)
    for addresschunk in splitaddresses:
        print "    actions " + str(index * n) + " to " + str((index + 1) * n)
        index = index + 1
        helpers.bulk(es, [build_action(value, 'pic', 'address') for value in addresschunk])
    return constituents
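# Example invocation (hypothetical endpoint): indexes everything into the given
# cluster and returns the consolidated constituent dictionary.
#
#   constituents = process_constituents('http://localhost:9200')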
def generate_base_locations():
    """
    Gets all locations/heights for photographers
    Saves them in latlons.txt and heights.txt
    """
    basepath = os.environ['BASEPATH']
    response = open(basepath + 'address.csv')
    print "\n\nloaded " + basepath + "address.csv"
    reader = csv.DictReader(response)
    location_pattern = re.compile(r"(\-?\d+(\.\d+)?)\s*,\s*(\-?\d+(\.\d+)?).*")
    places = []
    heights = []
    every_row = []
    for row in reader:
        row = Converter.remove_bom(row)
        remarks = row['Remarks']
        remarks = Converter.convert_whitespace(remarks)
        if remarks is None:
            print "No remarks:" + row['ConstituentID']
            continue
        # keep only rows whose remarks look like a "lat,lon" pair
        if location_pattern.match(remarks) is None:
            if remarks != "NULL":
                print "Bad remarks:" + row['ConstituentID'] + ":" + remarks
            continue
        row['Remarks'] = remarks
        every_row.append(row)
    every_row = sorted(every_row, key=lambda d: d['BeginDate'])
    # put born first and died last always
    other_addresses = []
    born_addresses = []
    died_addresses = []
    for add in every_row:
        # put the active/biz ones
        if add['AddressTypeID'] == '7' or add['AddressTypeID'] == '2':
            other_addresses.append(add)
        # find born if any
        if add['AddressTypeID'] == '5':
            born_addresses.append(add)
        # find died if any
        if add['AddressTypeID'] == '6':
            died_addresses.append(add)
    for_real_sorted_every_row = []
    for_real_sorted_every_row.extend(born_addresses)
    for_real_sorted_every_row.extend(other_addresses)
    for_real_sorted_every_row.extend(died_addresses)
    for row in for_real_sorted_every_row:
        address = Converter.compress_address(row['Remarks'])
        height = 0
        if len(address) > 2:
            height = address.pop()
            heights.extend([row['ConAddressID'], height])
        address.append(row['ConstituentID'])
        address.append(row['ConAddressID'])
        if row['AddressTypeID'] != "NULL":
            address.append(row['AddressTypeID'])
        else:
            print "no type:" + row['ConstituentID']
            address.append("1")
        address.append(row['CountryID'])
        places.extend(address)
    locations = "[\"constituents\", [" + ",".join(places) + "]]"
    output = basepath + 'latlons.txt'
    print "\nwrote " + output
    text_file = open(output, "w")
    text_file.write(locations)
    text_file.close()
    locations = "[\"heights\", [" + ",".join(heights) + "]]"
    output = basepath + 'heights.txt'
    print "\nwrote " + output
    text_file = open(output, "w")
    text_file.write(locations)
    text_file.close()
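# Both output files are JSON-style arrays built by string concatenation; the
# values repeat in fixed-size groups (field names shown for illustration):
#
#   latlons.txt:  ["constituents", [lat, lon, ConstituentID, ConAddressID, AddressTypeID, CountryID, ...]]
#   heights.txt:  ["heights", [ConAddressID, height, ...]]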