예제 #1
0
def merger(dollars_divided, geo):
    '''
    Left-merges geo onto dollars_divided via u.merge_coalesce (matching on the
    address fields, with '_R' as the suffix), removes the columns that are no
    longer wanted, and de-duplicates on vendor ID plus address. Returns a
    dataframe with a fresh 0..n-1 index.
    '''

    # The four address fields serve both as merge keys and as part of the
    # de-duplication subset
    address_fields = ('Address', 'City', 'State', 'ZipCode')

    # Columns to discard once the merge is done
    unwanted = [
        'ClusterID', 'VendorName_LINK1', 'VendorName_LINK2', 'Name',
        'CSDS_Vendor_ID_LINK2'
    ]

    # Only these fields decide whether two rows count as duplicates
    dedupe_on = ['CSDS_Vendor_ID', *address_fields]

    # Merge the two frames together, filling in coordinates
    combined = u.merge_coalesce(dollars_divided, geo, list(address_fields),
                                '_R', 'left')

    trimmed = combined.drop(unwanted, axis=1)
    deduped = trimmed.drop_duplicates(subset=dedupe_on)

    return deduped.reset_index(drop=True)
예제 #2
0
def try_fill(df):
    '''
    Best-effort fill of missing zip codes and coordinates. Runs filler() once
    for zip codes and once for longitude/latitude, then coalesces the result
    with the geocoded HQ addresses returned by read_geo(). Returns a
    dataframe.
    '''

    # Progress report
    print('\nFilling in missing zip codes and coordinates as best as possible')

    # Pass 1: zip codes (handled as str), using the street address fields as
    # the first key set and name + coordinates as the second
    df = filler(
        df,
        ['ZipCode'],
        str,
        ['Address', 'City', 'State'],
        ['Name', 'Longitude', 'Latitude'],
    )

    # Pass 2: coordinates (handled as float), using the full address including
    # the zip code as the first key set and the name as the second
    df = filler(
        df,
        ['Longitude', 'Latitude'],
        float,
        ['Address', 'City', 'State', 'ZipCode'],
        ['Name'],
    )

    # Pass 3: bring in the geocoded HQ addresses and coalesce on the street
    # address fields, after giving df a clean 0..n-1 index
    hq_geo = read_geo()
    reindexed = df.reset_index(drop=True)

    return u.merge_coalesce(reindexed, hq_geo, ['Address', 'City', 'State'])
예제 #3
0
def preprocess_contracts():
    '''
    Builds the preprocessed contracts dataframe. Reads the contract records
    and cleans their amounts (the MIN_DOLLARS cutoff is presumably applied
    inside clean_amounts() -- it is not visible here), then fills in blank
    addresses from three sources in turn: the hand-collected COOK addresses,
    the IRS990 addresses (via Jaro-Winkler name matches), and the
    hand-collected IL addresses. Returns a dataframe.
    '''

    # Step 1: load the contracts with cleaned dollar amounts
    cleaned = clean_amounts(read_contracts())

    # Step 2: COOK addresses, matched on the raw VendorName; the name is only
    # standardized after this merge
    cook_addresses = import_addresses('cook')
    print('Coalescing COOK address matches')
    with_cook = u.merge_coalesce(cleaned, cook_addresses, 'VendorName')
    with_cook['VendorName'] = with_cook['VendorName'].apply(stdname)

    # Step 3: IRS addresses; first get the JW similarity matches
    # (>= JWSIM_THRESH) between the contracts and the IRS records, then
    # coalesce them in
    irs_suffix = '_IRS'
    irs_addresses = import_addresses('irs')
    irs_matches = jwsim_contracts_irs(with_cook, irs_addresses, irs_suffix)
    print('Coalescing IRS matches')
    with_irs = coalesce_matches(with_cook, irs_matches, irs_suffix)

    # Step 4: IL addresses, matched on VendorName
    il_addresses = import_addresses('il')
    print('Coalescing IL matches')

    return u.merge_coalesce(with_irs, il_addresses, 'VendorName')
예제 #4
0
def coalesce_matches(contracts, jwsim, suffix):
    '''
    Fills in missing values in contracts from the previously identified
    matches in jwsim. The matches are first reduced with trim_jwsim() and then
    coalesced into contracts on 'CSDS_Contract_ID'. Returns a dataframe.
    '''

    # Reduce the match table before merging
    trimmed_matches = trim_jwsim(jwsim, suffix)

    # Coalesce on the contract ID, using the given suffix
    return u.merge_coalesce(contracts, trimmed_matches, ['CSDS_Contract_ID'],
                            suffix)
def fix_duplicate_addresses(df, key='ClusterID', target='Address_SVC'):
    '''
    Takes in a dataframe. Attempts to fix duplicate addresses (by default, in
    the 'Address_SVC' field) if they have the same key (by default, the
    'ClusterID' field). Returns a dataframe.

    If no key value is associated with more than one distinct address there is
    nothing to fix; in that case the dataframe is returned sorted by address
    length (longest first) but otherwise unchanged.
    '''

    print('\nFixing duplicate addresses')

    # Sort the target field by length, longest to shortest
    sorter = df[target].str.len().sort_values(ascending=False).index
    df = df.reindex(sorter)

    # Make a mini version of the dataframe with two fields, the key & the
    # target, plus a copy of the target marked as the original value
    minimized_df = df[[key, target]].drop_duplicates().dropna()
    minimized_df[target + '_Original'] = minimized_df[target]

    # OVERVIEW: Call the iter_df() function on subsets of the dataframe (one
    # subset per key) to compare and fix the address strings assigned to that
    # key. Only keys with more than one distinct address need fixing. The
    # results are collected in a list and concatenated once at the end, which
    # avoids re-copying the accumulated frame on every iteration.
    fixed_frames = []
    for uKey in minimized_df[key].unique():
        local_df = minimized_df[minimized_df[key] == uKey]
        if len(local_df) > 1:
            fixed_frames.append(
                iter_df(
                    local_df.copy().drop_duplicates().reset_index(drop=True),
                    target))

    # Nothing to fix: without this guard the code below would reference a
    # frame that was never created
    if not fixed_frames:
        return df

    new_df = pd.concat(fixed_frames)

    print('Coalescing fixed addresses')

    # Rename the columns in preparation of calling merge_coalesce(): in the
    # fixed frame, the (possibly corrected) target becomes target_COAL and the
    # original value takes the target's name so it can serve as a merge key
    new_cols = {target: target + '_COAL', target + '_Original': target}
    new_df = new_df.rename(columns=new_cols, index=str)

    # Same preparation for the mini dataframe: its copy of the original value
    # becomes the fallback target_COAL
    min_cols = {target + '_Original': target + '_COAL'}
    minimized_df = minimized_df.rename(columns=min_cols, index=str)

    # Coalesce with the dfs in this order so that we keep the new values
    merged = u.merge_coalesce(new_df, minimized_df, [key, target], how='right')

    # Merge the new address strings in, drop the original field, and rename the
    # new one
    df = df.merge(merged, how='left').drop(target, axis=1)
    df = df.rename(columns={target + '_COAL': target}, index=str)

    return df