import pandas as pd

# The project-local modules below are assumed to be importable under the
# aliases used throughout this file (their exact paths are not shown here):
# ad (address readers), mc (MERGE_CONTRACTS), addclean (address cleaners),
# u (string-case utilities), and stdname (name standardizer), along with the
# local helpers read_pb, split_rows, split_cols, read_dfss, read_wc,
# split_addr, get_year, read_2009, and read_2016.


def import_addresses(dataset):
    '''
    Reads in one of three address datasets (specified with a string).
    Returns a dataframe.
    '''
    print('Reading in {} addresses'.format(dataset.upper()))
    # Read in the COOK address dataset; rename a column
    if dataset == 'cook':
        df = ad.read_cook_addr()
        df = df.rename(columns={'ID': 'VendorName'}, index=str)
    # Read in the IRS dataset; rename a column and standardize names
    elif dataset == 'irs':
        df = ad.read_irs()
        df = df.rename(columns={'OrganizationName': 'VendorName'}, index=str)
        df['VendorName'] = df['VendorName'].apply(stdname)
    # Read in the IL address dataset; standardize names
    elif dataset == 'il':
        df = ad.read_il_addr()
        df['VendorName'] = df['VendorName'].apply(stdname)
    # Convert text fields to uppercase
    df = u.upper(df)
    return df
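# Example usage; the three accepted dataset strings come from the branches in
# import_addresses() above:
#   cook = import_addresses('cook')
#   irs = import_addresses('irs')
#   il = import_addresses('il')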
def read_contracts():
    '''
    Reads in the contracts dataset via the MERGE_CONTRACTS module.
    Returns a dataframe.
    '''
    # Initialize an empty list to hold the dataframes
    dfs = []
    # For every (filename, label) tuple:
    for fname_tuple in mc.FNAMES:
        # Read in and process the dataset
        df = mc.process_dataset(fname_tuple)
        # If the label == 'CHI':
        if fname_tuple[-1] == 'CHI':
            # Send the dataframe through the round2 address cleaner
            df = addclean.round2(df)
            # Send the Address1 field through the address cleaner
            df['Address1'] = df['Address1'].apply(addclean.address_cleaner)
        # Add the newly processed dataframe to the list
        dfs.append(df)
    # Concatenate all the dataframes
    merged = pd.concat(dfs)
    # Convert the text columns (except for the URLs) to uppercase
    merged = u.upper(merged)
    # There should be 6,591 records in the dataframe
    return merged
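# A minimal sanity check for the 6,591-record count noted above; this helper
# is an illustrative addition, not part of the original pipeline.
def check_contracts_count(expected=6591):
    merged = read_contracts()
    assert len(merged) == expected, \
        'Expected {} contract records, got {}'.format(expected, len(merged))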
def import_pb(fname):
    '''
    Reads in the PurpleBinder dataset. Splits each record into multiple
    records based on the number of locations contained in the locations
    field. Splits the location column into its component parts (Address1,
    Address2, City, State, & ZipCode) and then converts all the strings to
    uppercase. Returns a dataframe.
    '''
    # Read in the json file
    df = read_pb(fname)
    # Split the locations into multiple rows (one row per location)
    splitR = split_rows(df)
    # Split the location column into its component parts
    splitC = split_cols(splitR)
    # Convert string columns to uppercase
    df_upper = u.upper(splitC)
    # There are serious problems with some of the geocoding in the PB data,
    # so drop the coordinates
    df_upper = df_upper.drop(['Latitude', 'Longitude'], axis=1)
    return df_upper
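# A minimal sketch of the row/column split described above, assuming each
# record's locations field is a list of pipe-delimited strings; the real
# split_rows/split_cols helpers live elsewhere in this project and may differ.
def _split_locations_demo():
    demo = pd.DataFrame({'Name': ['ORG A'],
                         'Locations': [['123 MAIN ST||CHICAGO|IL|60601',
                                        '456 OAK AVE|STE 2|CHICAGO|IL|60602']]})
    # One row per location
    demo = demo.explode('Locations').reset_index(drop=True)
    # One column per address component
    parts = demo['Locations'].str.split('|', expand=True)
    parts.columns = ['Address1', 'Address2', 'City', 'State', 'ZipCode']
    return pd.concat([demo.drop(columns='Locations'), parts], axis=1)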
def import_dfss(fname):
    '''
    Reads in the DFSS dataset, converting strings to uppercase.
    Assigns an ID. Returns a dataframe.
    '''
    df = read_dfss(fname)
    df_upper = u.upper(df)
    return df_upper
def import_wchi(fname):
    '''
    Reads in the West Chi dataset. Splits the address field into its
    component parts. Converts strings to uppercase. Returns a dataframe.
    '''
    # Read in the WESTCHI file
    df = read_wc(fname)
    # Split addresses into their component parts
    split = split_addr(df)
    # Convert strings to uppercase
    df_upper = u.upper(split)
    return df_upper
def import_mc(fname, sheetname):
    '''
    Reads in one MapsCorps dataset. Replaces str(np.NaN) with the empty
    string. Converts string values to uppercase. Drops duplicates.
    Returns a dataframe.
    '''
    # Extract the year from the sheetname
    year = get_year(sheetname)
    # Use a different function to read in the file based on the year
    if year == 2009:
        df = read_2009(fname, sheetname)
    elif year == 2016:
        df = read_2016(fname, sheetname)
    # Replace the string 'nan' (str(np.NaN)) with the empty string and
    # convert strings to uppercase
    df = df.replace('nan', '')
    df_upper = u.upper(df)
    return df_upper.drop_duplicates().reset_index(drop=True)
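# A minimal usage sketch; the filenames and sheet name below are hypothetical
# placeholders, since the real paths are supplied elsewhere in the project.
if __name__ == '__main__':
    contracts = read_contracts()
    cook = import_addresses('cook')
    pb = import_pb('purplebinder.json')                   # hypothetical path
    dfss = import_dfss('dfss.csv')                        # hypothetical path
    wchi = import_wchi('westchi.csv')                     # hypothetical path
    mapscorps = import_mc('mapscorps.xlsx', '2016 Data')  # hypothetical path/sheet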