def summarize(df):
    """
    Summarize per-service water meter readings into one row per service.

    For each util.SERVICE_ID group: keeps only rows for the most recent
    meter number, checks that each row's current reading reappears as the
    next row's prior reading, counts meter wraps (current < prior), and
    folds the usage hidden by wraps into the summary row's prior reading.
    Sets util.METER_STATUS to METER_WRAP / METER_NORMAL / METER_ANOMALY.

    Fix: the original built the result with DataFrame.append(), which was
    deprecated in pandas 1.4 and removed in 2.0; rows are now collected in
    a list and assembled once at the end.

    :param df: dataframe of meter readings containing the util.* columns used below
    :return: summary dataframe with one row per service ID
    """
    t = time.time()
    print('')
    print('summarize() starting at', time.strftime('%H:%M:%S', time.localtime(t)))

    summary_rows = []

    for idx, df_group in df.groupby(by=[util.SERVICE_ID]):

        # Copy dataframe and find last row
        df_meter = df_group.copy()
        row_n = df_meter.iloc[-1]

        # Remove data pertaining to old meters, if any
        df_meter = df_meter[df_group[util.METER_NUMBER] == row_n[util.METER_NUMBER]]

        # Determine whether meter readings look reasonable: each row's
        # current reading should equal the next row's prior reading
        sr_cur = df_meter[util.CURRENT_READING].iloc[:-1]
        sr_pri = df_meter[util.PRIOR_READING].shift(-1).iloc[:-1]
        readings_look_good = bool((sr_cur == sr_pri).all())

        # Look for wraps (meter rolled over: current reading < prior reading)
        if readings_look_good:
            n_wraps = int((df_meter[util.CURRENT_READING] < df_meter[util.PRIOR_READING]).sum())
        else:
            n_wraps = 0

        # Calculate water usage hidden by wraps.  One wrap hides a full
        # revolution of the meter: 10^(digit count of the largest reading).
        max_reading = df_meter[util.CURRENT_READING].max()
        wrap_extent = 10 ** (1 + int(math.log10(max_reading))) if (max_reading > 0) else 0
        usage_hidden_by_wraps = wrap_extent * n_wraps

        # Summarize readings in one row: the last row, carrying the earliest
        # prior date and a prior reading adjusted downward for hidden usage
        row_0 = df_meter.iloc[0]
        row_n[util.PRIOR_DATE] = row_0[util.PRIOR_DATE]
        row_n[util.PRIOR_READING] = row_0[util.PRIOR_READING] - usage_hidden_by_wraps

        # Report meter status
        if readings_look_good:
            row_n[util.METER_STATUS] = util.METER_WRAP if n_wraps > 0 else util.METER_NORMAL
        else:
            row_n[util.METER_STATUS] = util.METER_ANOMALY

        # Save summary row
        summary_rows.append(row_n)

    # Assemble the summary; preserve the input columns when no rows were produced
    if summary_rows:
        df_summary = pd.DataFrame(summary_rows)
    else:
        df_summary = pd.DataFrame(columns=df.columns)

    util.report_elapsed_time(prefix='summarize() done: ', start_time=t)
    return df_summary
def flag_ownership(df):
    """
    Flag each resident row as homeowner and/or family of the homeowner.

    Fills null name columns with empty strings, then applies the
    is_homeowner() and is_family() helpers row by row to populate the
    util.IS_HOMEOWNER and util.IS_FAMILY columns, printing progress and
    counts along the way.

    Fix: the found-count reporting used value_counts()[True], which raises
    KeyError when no row is flagged; value_counts().get(True, 0) prints 0
    instead of crashing.

    :param df: merged resident/assessment dataframe (modified in place)
    :return: the same dataframe with ownership flags added
    """
    # Fill null cells in every name column the matching helpers read
    name_columns = (
        util.LAST_NAME,
        util.FIRST_NAME,
        util.MIDDLE_NAME,
        util.OWNER_1_NAME,
        util.OWNER_2_NAME,
        util.OWNER_3_NAME,
    )
    for col in name_columns:
        df[col] = df[col].fillna('')

    # Flag homeowners
    t = time.time()
    print('')
    print('util.IS_HOMEOWNER starting at', time.strftime('%H:%M:%S', time.localtime(t)))
    df[util.IS_HOMEOWNER] = df.apply(is_homeowner, axis=1)
    print('Found {0} homeowners'.format(
        df[util.IS_HOMEOWNER].value_counts().get(True, 0)))
    util.report_elapsed_time(prefix='util.IS_HOMEOWNER done: ', start_time=t)

    # Flag family members (is_family needs the whole frame for its lookup)
    t = time.time()
    print('')
    print('util.IS_FAMILY starting at', time.strftime('%H:%M:%S', time.localtime(t)))
    df[util.IS_FAMILY] = df.apply(lambda row: is_family(row, df), axis=1)
    print('Found {0} family members'.format(
        df[util.IS_FAMILY].value_counts().get(True, 0)))
    util.report_elapsed_time(prefix='util.IS_FAMILY done: ', start_time=t)

    print('')
    return df
# Ambiguous duplicate employees get neutral residency/voter values, then the
# duplicates are collapsed to one row per (last, first) name.
# NOTE(review): sr_dups is built before this fragment -- assumed to be a
# boolean Series aligned with df_merge; confirm upstream.
dup_mask = sr_dups == True
df_merge.loc[dup_mask, [util.RESIDENT_ID, util.VOTER_STATUS]] = 'unknown'
df_merge = df_merge.drop_duplicates(subset=[util.LAST_NAME, util.FIRST_NAME])

# Report findings
n_v = len(df_census[df_census[util.VOTER_STATUS] == 'A'])
n_e = len(df_merge)
n_re = int(df_merge[util.RESIDENT_ID].notnull().sum())
n_ve = int((df_merge[util.VOTER_STATUS] == 'A').sum())

print('')
print('Number of active voters: {0}'.format(n_v))
print('Number of employees: {0}'.format(n_e))
print('Number of resident employees: {0}'.format(n_re))
print('Number of active voter employees: {0}'.format(n_ve))
print('')
print('{0:.0f}% of town employees live in Andover.'.format(100 * n_re / n_e))
print('{0:.0f}% of town employees are active voters in Andover.'.format(100 * n_ve / n_e))
print('Active voter employees represent {0:.0f}% of all Andover active voters.'.format(100 * n_ve / n_v))
print('')

filename = '../analysis/employees.xlsx'
print('Writing {0} rows to {1}'.format(len(df_merge), filename))
print('Columns: {0}'.format(list(df_merge.columns)))

# Write to spreadsheet
df_merge.to_excel(filename, index=False)

# Report elapsed time
util.report_elapsed_time()
# --- Gender, age, zoning, and primary-election preparation (script fragment) ---
# Operates on the module-level df_residents dataframe and SQLAlchemy engine.

# Fill in genders the source data left blank, reporting counts before and after
report_gender_findings()
print( 'Guessing missing genders...' )
df_residents[util.GENDER] = df_residents.apply( lambda row: guess_gender( row ), axis=1 )
report_gender_findings()
print( '' )

# Calculate age
df_residents[util.AGE] = df_residents.apply( lambda row: calculate_age( row ), axis=1 )

# Insert zoning codes
t = time.time()
print( 'Inserting zoning codes, starting at {0}...'.format( time.strftime( '%H:%M:%S', time.localtime( t ) ) ) )

# Left-join the zoning lookup table on the normalized street-address columns,
# then let get_zoning_code() resolve the final code per row
df_zones = pd.read_sql_table( 'ZoneLookup', engine, columns=[util.NORMALIZED_STREET_NUMBER, util.LADDR_ALT_STREET_NUMBER, util.NORMALIZED_STREET_NAME, util.ZONING_CODE_1] )
df_residents = pd.merge( df_residents, df_zones, how='left', on=[util.NORMALIZED_STREET_NUMBER, util.LADDR_ALT_STREET_NUMBER, util.NORMALIZED_STREET_NAME] )
df_residents[util.ZONING_CODE_1] = df_residents.apply( lambda row: get_zoning_code( row ), axis=1 )
util.report_elapsed_time( 'Inserted zoning codes -- ', t )
print( '' )

# Load primary elections dataframe, oldest election first
df_primary = pd.read_sql_table( 'ElectionModel_01', engine, parse_dates=True )
df_primary = df_primary.sort_values( by=[util.ELECTION_DATE] )

# Strip stray whitespace from the party fields
df_primary[util.PARTY_AFFILIATION] = df_primary[util.PARTY_AFFILIATION].str.strip()
df_primary[util.PARTY_VOTED] = df_primary[util.PARTY_VOTED].str.strip()

# Build dictionaries of unique primary election dates and corresponding column names
# NOTE(review): date.split()[0] assumes the election date is a string like
# 'YYYY-MM-DD HH:MM:SS'; verify parse_dates did not yield Timestamp objects
dc_primary_dates = OrderedDict()
dc_ballot_cols = {}
for date in df_primary[util.ELECTION_DATE].unique():
    date_name = date.split()[0]
    # '-' placeholder value per election date -- presumably "no ballot
    # recorded"; confirm against downstream use of dc_primary_dates
    dc_primary_dates[date_name] = '-'
    dc_ballot_cols[date_name] = 'primary_ballot_' + date_name
def drop_water_noise(df_merge):
    """
    Remove noise introduced by the +Water merge.

    When a Resident/Parcel pair maps to multiple meter rows and we cannot
    tell which meter is correct -- missing Parcel ID, a duplicated service
    type, or service types other than the expected
    {Residential, Irrigation} pair -- drop the excess rows and blank out
    the water-only columns of the single row that is kept.

    Fix: the temporary 'pair' helper column used to group rows was left
    behind in the returned dataframe; it is now dropped before return.

    :param df_merge: merged dataframe (reads module-level df_census,
        df_assessment_res, and df_water for column sets)
    :return: df_merge with ambiguous water rows removed
    """
    t = time.time()
    print('')
    print('drop_water_noise() starting at', time.strftime('%H:%M:%S', time.localtime(t)))

    # Fill null cells
    df_merge[util.METER_NUMBER] = df_merge[util.METER_NUMBER].fillna('').astype(str)

    bf_len = len(df_merge)

    # Get list of unique Resident/Parcel ID pairs from merged table.
    # NOTE(review): rows with a null PARCEL_ID yield a null pair and are
    # skipped by the loop below -- confirm that is intended.
    df_merge['pair'] = df_merge[util.RESIDENT_ID].str.cat(',' + df_merge[util.PARCEL_ID])
    unique_pairs = df_merge['pair'].unique()

    # Establish columns to be preserved (census + assessment data)
    preserve_columns = df_census.columns.union(df_assessment_res.columns)

    # Create set of service types for comparison
    res_irr = set(['Residential', 'Irrigation'])

    # Iterate over Resident/Parcel pairs
    for pair in unique_pairs:

        # Extract all rows containing this pair
        df_pair = df_merge[df_merge['pair'] == pair]

        # If there are multiple rows pertaining to current resident...
        if len(df_pair) > 1:

            # The +Water merge has correlated this resident/parcel pair with multiple Meter Numbers.

            # Get the Parcel ID
            parcel_id = df_pair.iloc[0][util.PARCEL_ID]

            # Determine whether any service types are duplicated among the rows
            vc = df_pair[util.SERVICE_TYPE].value_counts()
            dup_service_types = vc[vc > 1]

            # Determine how listed services compare to expected { Residential, Irrigation }
            service_type_diff = set(vc.index).symmetric_difference(res_irr)

            # If we don't have a Parcel ID, or we have a duplicated service type, or service types are not as expected:
            if (parcel_id == '') or (len(dup_service_types) > 0) or (len(service_type_diff) > 0):

                # We don't have enough information to determine which Meter Number is the correct one.
                # Rather than retain excess and misleading information in the database, we clear it out.

                # Drop excess rows
                drop_index = df_pair.index.drop([df_pair.index[0]])
                df_merge = df_merge.drop(labels=drop_index)

                # Clear unreliable water-only values from the surviving row
                for col_name in df_water.columns:
                    if col_name not in preserve_columns:
                        df_merge.at[df_pair.index[0], col_name] = ''

    # Drop the temporary helper column so it does not leak into the result
    df_merge = df_merge.drop(columns=['pair'])

    # Report outcome
    print('drop_water_noise() removed {0} rows'.format(bf_len - len(df_merge)))
    util.report_elapsed_time(prefix='drop_water_noise() done: ', start_time=t)

    return df_merge
def drop_assessment_noise(df_merge):
    """
    Remove noise introduced by the Census+Assessment merge.

    A resident correlated with several parcels keeps only the most
    plausible rows: parcels they own; otherwise parcels where they are
    family; otherwise the first row alone, with its assessment-only
    columns blanked because we cannot tell which parcel is correct.

    :param df_merge: merged dataframe (reads module-level df_census and
        df_assessment_res for column sets)
    :return: df_merge with ambiguous assessment rows removed
    """
    start = time.time()
    print('')
    print('drop_assessment_noise() starting at', time.strftime('%H:%M:%S', time.localtime(start)))

    # Normalize null parcel IDs to empty strings
    df_merge[util.PARCEL_ID] = df_merge[util.PARCEL_ID].fillna('').astype(str)

    n_before = len(df_merge)

    # Walk the unique Resident IDs present in the merged table
    for res_id in df_merge[util.RESIDENT_ID].unique():

        # All rows correlated with this resident
        df_rows = df_merge[df_merge[util.RESIDENT_ID] == res_id]

        # A single row needs no disambiguation
        if len(df_rows) <= 1:
            continue

        # Case 1: resident owns at least one parcel -> keep only owned parcels
        df_owned = df_rows[df_rows[util.IS_HOMEOWNER] == True]
        if len(df_owned) > 0:
            df_not_owned = df_rows[df_rows[util.IS_HOMEOWNER] == False]
            if len(df_not_owned) > 0:
                df_merge = df_merge.drop(labels=df_not_owned.index)
            continue

        # Case 2: resident is family on at least one parcel -> keep only those
        df_family = df_rows[df_rows[util.IS_FAMILY] == True]
        if len(df_family) > 0:
            df_not_family = df_rows[df_rows[util.IS_FAMILY] == False]
            if len(df_not_family) > 0:
                df_merge = df_merge.drop(labels=df_not_family.index)
            continue

        # Case 3: no ownership signal at all.  The merge correlated this
        # resident with multiple parcels but we cannot tell which is right;
        # keep one row and clear its assessment-only values rather than
        # retain misleading information.
        surplus = df_rows.index.drop([df_rows.index[0]])
        df_merge = df_merge.drop(labels=surplus)
        for col_name in df_assessment_res.columns:
            if col_name not in df_census.columns:
                df_merge.loc[df_merge[util.RESIDENT_ID] == res_id, col_name] = ''

    # Report outcome
    print('drop_assessment_noise() removed {0} rows'.format(n_before - len(df_merge)))
    util.report_elapsed_time(prefix='drop_assessment_noise() done: ', start_time=start)

    return df_merge