Example #1
def get_minimum_year():
    """
    Gets the minimum year in the address/constituents CSVs using only BeginDate

    Saves in minyear.txt
    """
    basepath = os.environ['BASEPATH']

    # minimum usable BeginDate in address.csv (empty, 0 and -1 values are skipped)
    with open(basepath + 'address.csv') as response:
        reader = csv.DictReader(response)
        years = [int(Converter.str_to_float(item['BeginDate'])) for item in reader if item['BeginDate'] != '']
    nozeros = [number for number in years if number != 0 and number != -1]
    minyeara = np.amin(nozeros)
    print("\n\nMinimum year in address is: " + str(minyeara))

    with open(basepath + 'constituents.csv') as response:
        reader = csv.DictReader(response)
        years = [int(Converter.str_to_float(item['BeginDate'])) for item in reader if item['BeginDate'] != '']
    nozeros = [number for number in years if number != 0 and number != -1]
    minyearb = np.amin(nozeros)
    print("\n\nMinimum year in constituents is: " + str(minyearb))

    minyear = min(minyeara, minyearb)

    output = basepath + 'minyear.txt'
    with open(output, "w") as text_file:
        text_file.write(str(minyear))
    print("\nwrote " + output)
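A minimal usage sketch; the BASEPATH value below is an assumption, the function only needs that environment variable to point at the directory holding the CSV exports:

import os

os.environ['BASEPATH'] = '/data/csv/'  # hypothetical path to the CSV exports
get_minimum_year()
print(open(os.environ['BASEPATH'] + 'minyear.txt').read())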
Example #2
def create_base_constituents():
    """
    Builds the raw skeleton of a constituent from constituents.csv
    """
    reader = Converter.process_csv(os.environ['BASEPATH'] + 'constituents.csv')
    constituents = {}
    for row in reader:
        row = Converter.remove_bom(row)
        if row['AlphaSort'] is None:
            print("No AlphaSort in: " + row['ConstituentID'])
            row['AlphaSort'] = ''
        # the first token of AlphaSort doubles as the sort name
        row['nameSort'] = row['AlphaSort'].split(" ")[0]
        row['id'] = row['ConstituentID']
        constituents[row['ConstituentID']] = row
    return constituents
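Converter.remove_bom belongs to the surrounding project and is not shown in this listing; a plausible minimal stand-in, assuming the CSVs are UTF-8 files whose byte-order mark ends up glued to the first column name read by csv.DictReader:

# hypothetical stand-in for the project's Converter.remove_bom
def remove_bom(row):
    """Strip a UTF-8 byte-order mark from the first fieldname, if present."""
    return {key.replace('\ufeff', ''): value for key, value in row.items()}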
Example #3
def get_join_data(filepath):
    """
    To denormalize CSV data term IDs
    """
    reader = Converter.process_csv(filepath)
    data = {}
    for row in reader:
        row = Converter.remove_bom(row)
        tmp = list(row.values())
        # each join CSV pairs an ID with its label; key the lookup by whichever
        # of the two values is the numeric ID
        if tmp[1].isdigit():
            data[tmp[1]] = tmp[0]
        else:
            data[tmp[0]] = tmp[1]
    return data
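For illustration, assuming a two-column countries.csv whose CountryID column is followed by the country name (the column layout is an assumption, not part of the example):

import os

# countries.csv (hypothetical layout):
#   CountryID,Country
#   57,France
#   230,United States
lookup = get_join_data(os.environ['BASEPATH'] + 'countries.csv')
print(lookup['57'])   # -> 'France'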
Example #4
def build_action(row, index, doc_type):
    """
    Creates an ES action for indexing
    """
    cleaned = Converter.remove_bom(row)
    if not "id" in cleaned:
        print "NO ID!!!1!"
        print cleaned
    action = {
        "_index": index,
        "_type": doc_type,
        "_source": cleaned,
        "_id": cleaned["id"]
    }
    if doc_type == 'address':
        # address documents are children of their constituent, so route them together
        action["_parent"] = cleaned['ConstituentID']
        action["_routing"] = cleaned['ConstituentID']
    return action
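A minimal sketch of how such an action might be pushed with the bulk helper; the 'pic' index name and the 'address' type come from process_constituents below, while the endpoint and the sample row are made up. The _type and _parent meta-fields imply an older (pre-6.x) Elasticsearch with a parent/child mapping already in place.

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['http://localhost:9200'])  # hypothetical endpoint
row = {'id': '421', 'ConstituentID': '42', 'Remarks': '48.85,2.35'}  # made-up address row
helpers.bulk(es, [build_action(row, 'pic', 'address')])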
Example #5
def process_constituents(endpoint):
    """
    Consolidates all constituent data into a single dictionary. Pushes each dictionary as a document to Elastic.
    """
    start = timeit.default_timer()
    constituents = create_base_constituents()
    counter = 0
    tables = ["format","biography","address","gender","process","role","collection"]
    joins = ["formats","biographies",["addresstypes","countries"],"genders","processes","roles","collections"]
    location_pattern = re.compile(r"(\-?\d+(\.\d+)?)\s*,\s*(\-?\d+(\.\d+)?)")
    for table in tables:
        reader = Converter.process_csv(os.environ['BASEPATH'] + table + ".csv")
        if isinstance(joins[counter], str):
            joindata = get_join_data(os.environ['BASEPATH'] + joins[counter] + ".csv")
        else:
            # address rows join against two lookups: address types and countries
            j0 = get_join_data(os.environ['BASEPATH'] + joins[counter][0] + ".csv")
            j1 = get_join_data(os.environ['BASEPATH'] + joins[counter][1] + ".csv")
            joindata = [j0, j1]
        for row in reader:
            row = Converter.remove_bom(row)
            if row['ConstituentID'] not in constituents:
                print("No constituent:" + row['ConstituentID'])
                continue
            if table not in constituents[row['ConstituentID']]:
                constituents[row['ConstituentID']][table] = []
            cid = row['ConstituentID']
            if 'ConAddressID' not in row:
                del row['ConstituentID']
            else:
                row['id'] = row['ConAddressID']
            # add the value of the term id
            if 'TermID' in row:
                if row['TermID'] not in joindata:
                    print("No " + table + ":" + row['TermID'])
                else:
                    row['Term'] = joindata[row['TermID']]
            if 'AddressTypeID' in row:
                if row['AddressTypeID'] not in joindata[0]:
                    print("No " + joins[counter][0] + ":" + row['AddressTypeID'])
                    print(joindata[0])
                else:
                    row['AddressType'] = joindata[0][row['AddressTypeID']]
                if row['CountryID'] not in joindata[1]:
                    print("No " + joins[counter][1] + ":" + row['CountryID'])
                else:
                    row['Country'] = joindata[1][row['CountryID']]
            if 'Remarks' in row:
                row['Remarks'] = Converter.convert_whitespace(row['Remarks'])
                # remarks of the form "lat,lon" become a geo point on the document
                if location_pattern.match(row['Remarks']):
                    latlon = Converter.compress_address(row['Remarks'])
                    row['Remarks'] = ",".join(latlon)
                    row['Location'] = { "lat" : float(latlon[0]), "lon" : float(latlon[1]) }
            constituents[cid][table].append(row)
        counter += 1
    end = timeit.default_timer()
    print "\n\nProcessed CSVs in " + str(end - start) + " seconds\n"
    print "\n\nPreparing indexing actions...\n"
    start = timeit.default_timer()
    # now on to elastic (index already created)
    actions = []
    addresslist = []
    for cid in constituents:
        addresses = []
        if 'address' in constituents[cid]:
            # sort addresses
            addresses = Converter.sort_addresses(constituents[cid]['address'])
            constituents[cid]['addressTotal'] = len(constituents[cid]['address'])
            del constituents[cid]['address']
            # put the addresses
            addresslist = addresslist + addresses
        # put the constituent in there
        actions.append(create_constituent(constituents[cid]))
    end = timeit.default_timer()
    print "\n\nActions prepared in " + str((end - start)/60) + " minutes\n"
    print "\n\nIndexing...\n"
    create_indices(endpoint)
    es = Elasticsearch([endpoint], timeout=360, max_retries=10, retry_on_timeout=True)
    # split the actions into batches of 10k
    print "  Constituents..."
    index = 0
    n = 10000
    splitactions = split_list(actions, n)
    for actionchunk in splitactions:
        print "    actions " + str(index*n) + " to " + str((index+1)*n)
        index = index + 1
        helpers.bulk(es, actionchunk)
    print "  Addesses..."
    index = 0
    splitaddresses = split_list(addresslist, n)
    for addresschunk in splitaddresses:
        print "    actions " + str(index*n) + " to " + str((index+1)*n)
        index = index + 1
        helpers.bulk(es, [build_action(value, 'pic', 'address') for value in addresschunk])
    return constituents
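split_list is defined elsewhere in the project; a plausible minimal stand-in, assuming it simply breaks a list into consecutive chunks of at most n items:

# hypothetical stand-in for the project's split_list helper
def split_list(items, n):
    return [items[i:i + n] for i in range(0, len(items), n)]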
Example #6
def generate_base_locations():
    """
    Gets all locations/heights for photographers
    Saves in latlons.txt and heights.txt
    """
    basepath = os.environ['BASEPATH']
    response = open(basepath + 'address.csv')
    print "\n\nloaded " + basepath + "address.csv"
    reader = csv.DictReader(response)
    location_pattern = re.compile("(\-?\d+(\.\d+)?)\s*,\s*(\-?\d+(\.\d+)?).*")
    places = []
    heights = []
    every_row = []
    for row in reader:
        row = Converter.remove_bom(row)
        remarks = row['Remarks']
        remarks = Converter.convert_whitespace(remarks)
        if remarks is None:
            print("No remarks:" + row['ConstituentID'])
            continue
        if location_pattern.match(remarks) is None:
            if remarks != "NULL":
                print("Unparsed remarks:" + row['ConstituentID'] + ":" + remarks)
            continue
        row['Remarks'] = remarks
        every_row.append(row)
    every_row = sorted(every_row, key=lambda d: d['BeginDate'])
    # put born first and died last always
    other_addresses = []
    born_addresses = []
    died_addresses = []
    for add in every_row:
        # put the active/biz ones
        if (add['AddressTypeID'] == '7' or add['AddressTypeID'] == '2'):
            other_addresses.append(add)
        # find born if any
        if (add['AddressTypeID'] == '5'):
            born_addresses.append(add)
        # find died if any
        if (add['AddressTypeID'] == '6'):
            died_addresses.append(add)
    for_real_sorted_every_row = []
    for_real_sorted_every_row.extend(born_addresses)
    for_real_sorted_every_row.extend(other_addresses)
    for_real_sorted_every_row.extend(died_addresses)
    for row in for_real_sorted_every_row:
        address = Converter.compress_address(row['Remarks'])
        height = 0
        # a third component in the compressed address is a height value
        if len(address) > 2:
            height = address.pop()
            heights.extend([row['ConAddressID'], height])
        address.append(row['ConstituentID'])
        address.append(row['ConAddressID'])
        if row['AddressTypeID'] != "NULL":
            address.append(row['AddressTypeID'])
        else:
            print "no type:" + row['ConstituentID']
            address.append("1")
        address.append(row['CountryID'])
        places.extend(address)
    locations = "[\"constituents\", [" + ",".join(places) + "]]"
    output = basepath + 'latlons.txt'
    print "\nwrote " + output
    text_file = open(output, "w")
    text_file.write(locations)
    text_file.close()
    locations = "[\"heights\", [" + ",".join(heights) + "]]"
    output = basepath + 'heights.txt'
    print "\nwrote " + output
    text_file = open(output, "w")
    text_file.write(locations)
    text_file.close()
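A minimal usage sketch; the BASEPATH value is an assumption, and the six-values-per-address layout follows from the appends above:

import os

os.environ['BASEPATH'] = '/data/csv/'  # hypothetical path to the CSV exports
generate_base_locations()
# latlons.txt now holds ["constituents", [...]] with six values per address:
# lat, lon, ConstituentID, ConAddressID, AddressTypeID, CountryID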