import os
import re
import timeit

from elasticsearch import Elasticsearch, helpers

import Converter  # project-local CSV helpers (process_csv, remove_bom, etc.)


def create_base_constituents():
    """ Builds the raw skeleton of a constituent from constituents.csv """
    reader = Converter.process_csv(os.environ['BASEPATH'] + 'constituents.csv')
    constituents = {}
    for row in reader:
        row = Converter.remove_bom(row)
        if row['AlphaSort'] is None:
            print "No AlphaSort in: " + row['ConstituentID']
            row['AlphaSort'] = ''
        # the first token of AlphaSort becomes the name sort key
        row['nameSort'] = row['AlphaSort'].split(" ")[0]
        row['id'] = row['ConstituentID']
        constituents[row['ConstituentID']] = row
    return constituents
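# A sketch of the shape this returns, keyed by ConstituentID (field names
# come from constituents.csv; the values below are hypothetical):
#
#   {'123': {'id': '123', 'ConstituentID': '123',
#            'AlphaSort': 'Abbott Berenice', 'nameSort': 'Abbott', ...}}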
def get_join_data(filepath):
    """ Builds an ID-to-value lookup table used to denormalize term IDs in the CSV data """
    reader = Converter.process_csv(filepath)
    data = {}
    for row in reader:
        row = Converter.remove_bom(row)
        tmp = []
        for k in row:
            tmp.append(row[k])
        # column order isn't guaranteed, so key on whichever cell holds the numeric ID
        if tmp[1].isdigit():
            data[tmp[1]] = tmp[0]
        else:
            data[tmp[0]] = tmp[1]
    return data
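# A minimal usage sketch, assuming a hypothetical two-column lookup CSV
# ("GenderID,Gender" with a row like "1,Male"): the numeric cell becomes
# the key whichever column it arrives in, so a term ID resolves in one
# dict lookup:
#
#   genders = get_join_data(os.environ['BASEPATH'] + 'genders.csv')
#   genders['1']  # -> 'Male' (hypothetical row)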
def process_constituents(endpoint):
    """ Consolidates all constituent data into a single dictionary.
    Pushes each dictionary as a document to Elastic. """
    start = timeit.default_timer()
    constituents = create_base_constituents()
    tables = ["format", "biography", "address", "gender", "process", "role", "collection"]
    joins = ["formats", "biographies", ["addresstypes", "countries"], "genders", "processes", "roles", "collections"]
    location_pattern = re.compile(r"(\-?\d+(\.\d+)?)\s*,\s*(\-?\d+(\.\d+)?)")
    for table, join in zip(tables, joins):
        reader = Converter.process_csv(os.environ['BASEPATH'] + table + ".csv")
        if isinstance(join, str):
            joindata = get_join_data(os.environ['BASEPATH'] + join + ".csv")
        else:
            # address rows join against two lookup tables at once
            j0 = get_join_data(os.environ['BASEPATH'] + join[0] + ".csv")
            j1 = get_join_data(os.environ['BASEPATH'] + join[1] + ".csv")
            joindata = [j0, j1]
        for row in reader:
            row = Converter.remove_bom(row)
            if row['ConstituentID'] not in constituents:
                print "No constituent: " + row['ConstituentID']
                continue
            if table not in constituents[row['ConstituentID']]:
                constituents[row['ConstituentID']][table] = []
            cid = row['ConstituentID']
            # addresses become standalone documents, so they keep their
            # ConstituentID and take ConAddressID as their own id; other
            # rows drop the now-redundant key
            if 'ConAddressID' not in row:
                del row['ConstituentID']
            else:
                row['id'] = row['ConAddressID']
            # add the value of the term id
            if 'TermID' in row:
                if row['TermID'] not in joindata:
                    print "No " + table + ": " + row['TermID']
                else:
                    row['Term'] = joindata[row['TermID']]
            if 'AddressTypeID' in row:
                if row['AddressTypeID'] not in joindata[0]:
                    print "No " + join[0] + ": " + row['AddressTypeID']
                    print joindata[0]
                else:
                    row['AddressType'] = joindata[0][row['AddressTypeID']]
                if row['CountryID'] not in joindata[1]:
                    print "No " + join[1] + ": " + row['CountryID']
                else:
                    row['Country'] = joindata[1][row['CountryID']]
            if 'Remarks' in row:
                row['Remarks'] = Converter.convert_whitespace(row['Remarks'])
                # "lat,lon" remarks become a geo_point for Elastic
                if location_pattern.match(row['Remarks']):
                    latlon = Converter.compress_address(row['Remarks'])
                    row['Remarks'] = ",".join(latlon)
                    row['Location'] = {"lat": float(latlon[0]), "lon": float(latlon[1])}
            constituents[cid][table].append(row)
    end = timeit.default_timer()
    print "\n\nProcessed CSVs in " + str(end - start) + " seconds\n"
    print "\n\nPreparing indexing actions...\n"
    start = timeit.default_timer()
    # now on to elastic (index already created)
    actions = []
    addresslist = []
    for cid in constituents:
        addresses = []
        if 'address' in constituents[cid]:
            # sort the addresses, index them separately, and keep only the
            # count on the constituent document itself
            addresses = Converter.sort_addresses(constituents[cid]['address'])
            constituents[cid]['addressTotal'] = len(constituents[cid]['address'])
            del constituents[cid]['address']
        # put the addresses
        addresslist.extend(addresses)
        # put the constituent in there
        actions.append(create_constituent(constituents[cid]))
    end = timeit.default_timer()
    print "\n\nActions prepared in " + str((end - start) / 60) + " minutes\n"
    print "\n\nIndexing...\n"
    create_indices(endpoint)
    es = Elasticsearch([endpoint], timeout=360, max_retries=10, retry_on_timeout=True)
    # split the actions into batches of 10k
    print "  Constituents..."
    index = 0
    n = 10000
    splitactions = split_list(actions, n)
    for actionchunk in splitactions:
        print "    actions " + str(index * n) + " to " + str((index + 1) * n)
        index += 1
        helpers.bulk(es, actionchunk)
    print "  Addresses..."
    index = 0
    splitaddresses = split_list(addresslist, n)
    for addresschunk in splitaddresses:
        print "    actions " + str(index * n) + " to " + str((index + 1) * n)
        index += 1
        helpers.bulk(es, [build_action(value, 'pic', 'address') for value in addresschunk])
    return constituents
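# split_list and build_action are defined elsewhere in this module; the
# versions below are a minimal sketch of the contract implied by the call
# sites above (assumed, not the project's actual implementations), kept
# commented out to avoid shadowing the real helpers.
#
# def split_list(items, n):
#     """ Split items into successive chunks of at most n elements. """
#     return [items[i:i + n] for i in range(0, len(items), n)]
#
# def build_action(value, index, doc_type):
#     """ Wrap a document in a bulk-index action for elasticsearch.helpers. """
#     return {"_index": index, "_type": doc_type,
#             "_id": value['id'], "_source": value}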