def read_svc():
    '''
    Loads the service agency file named by SVC and prepares it for linking.

    Every column except CSDS_Svc_ID gets the '_SVC' suffix (via
    u.rename_cols), and CSDS_Svc_ID itself becomes CSDS_Vendor_ID_LINK2. The
    frame is then passed through ca.fix_duplicate_addresses and de-duplicated
    on the (CSDS_Vendor_ID_LINK2, Address_SVC) pair. Returns the resulting
    dataframe.
    '''

    # Let the user know which stage is running
    print('\nReading in service agencies')

    # Column names used both for address cleanup and for de-duplication.
    # NOTE(review): 'Address_SVC' presumes the file has an 'Address' column
    # that u.rename_cols suffixes with '_SVC' -- confirm against the input.
    link_key = 'CSDS_Vendor_ID_LINK2'
    address_col = 'Address_SVC'

    # Load the file; ZipCode is parsed with str so it is never read as a number
    svc = pd.read_csv(SVC, converters={'ZipCode': str})

    # Suffix everything but the service ID with '_SVC'
    to_suffix = [col for col in svc.columns if col != 'CSDS_Svc_ID']
    svc = u.rename_cols(svc, to_suffix, '_SVC')

    # Give the service ID its linking name (index=str also maps the index
    # labels through str)
    svc = svc.rename(columns={'CSDS_Svc_ID': link_key}, index=str)

    # Let the COMPARE_ADDRESSES module reconcile the address strings, then
    # keep a single row per key/address combination
    deduped = ca.fix_duplicate_addresses(svc, link_key, address_col)
    return deduped.drop_duplicates(subset=[link_key, address_col])
# Example #2
# 0
def jwsim_contracts_irs(contracts, irs, suffix):
    '''
    Takes the contracts and IRS dataframes and returns a dataframe of records
    with matching names where the JW similarity is >= JWSIM_THRESH.

    contracts: dataframe with (at least) the CSDS_Contract_ID and VendorName
               columns; only rows whose CSDS_Contract_ID starts with 'IL'
               are used.
    irs:       dataframe whose columns are all renamed with `suffix` via
               u.rename_cols before the comparison.
    suffix:    string handed to u.rename_cols for the IRS columns; the IRS
               name column is then looked up as 'VendorName' + suffix.

    Returns the rows of the mn.jwsim result whose JWSimilarity column is
    >= JWSIM_THRESH.
    '''

    # Rename the columns in IRS:
    irs = u.rename_cols(irs, irs.columns, suffix)

    # Restrict the contracts df to just those from IL. na=False treats a
    # missing contract ID as "not IL"; without it the mask contains NaN and
    # the boolean indexing raises a ValueError.
    is_il = contracts.CSDS_Contract_ID.str.startswith('IL', na=False)
    contracts = contracts[is_il]

    # Take the cartesian product between the two; replace NaN with ''.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    prod = mn.cart_prod(contracts, irs)
    prod = prod.replace(np.nan, '')

    # Print progress report
    print('Calculating Jaro-Winkler similarity on vendor names')

    # Compute the Jaro-Winkler similarity on the VendorName cols
    col1 = 'VendorName'
    arg = (prod, col1, col1 + suffix)
    jwsim = mn.parallelize(mn.jwsim, arg)

    # Return only the rows where JW similarity >= JWSIM_THRESH
    return jwsim[jwsim.JWSimilarity >= JWSIM_THRESH]
def linker():
    '''
    Builds pairs of linked vendors from the linker file (obtained through
    read_linker). Two renamed views of that file are left-merged on whatever
    column names they still share (no explicit key is passed), and any row
    where CSDS_Vendor_ID equals CSDS_Vendor_ID_LINK2 is discarded so that
    only pairs with different vendor IDs remain. Returns a dataframe with a
    fresh 0..n-1 index.
    '''

    # Load the linker records
    base = read_linker()

    # Left side: only the vendor name column is renamed (index=str also maps
    # the index labels through str). Right side: vendor name and vendor ID
    # are both renamed by u.rename_cols with '_LINK2'.
    left = base.rename(columns={'VendorName': 'VendorName_LINK1'}, index=str)
    right = u.rename_cols(base, ['VendorName', 'CSDS_Vendor_ID'], '_LINK2')

    # Join the two views on their remaining common columns
    # NOTE(review): presumably that is just the cluster ID -- confirm against
    # the linker file's columns.
    merged = left.merge(right, how='left')

    # Keep only the pairs whose two vendor IDs differ, then renumber the rows
    differs = merged['CSDS_Vendor_ID'] != merged['CSDS_Vendor_ID_LINK2']
    return merged[differs].reset_index(drop=True)