コード例 #1
0
ファイル: water_consumption.py プロジェクト: navkal/el
def summarize(df):
    """Collapse per-meter water readings into one summary row per service ID.

    For each service, keeps only readings from the newest meter, sanity-checks
    that consecutive readings chain correctly (current reading == next row's
    prior reading), counts odometer-style wraps (current < prior), and folds
    the usage hidden by wraps into the summary row's prior reading.

    Parameters:
        df: readings dataframe with util.SERVICE_ID, util.METER_NUMBER,
            util.CURRENT_READING, util.PRIOR_READING, util.PRIOR_DATE columns.

    Returns:
        Dataframe with the same columns as df, one row per service ID, with
        util.METER_STATUS set to METER_NORMAL, METER_WRAP, or METER_ANOMALY.
    """

    t = time.time()
    print('')
    print('summarize() starting at',
          time.strftime('%H:%M:%S', time.localtime(t)))

    # Accumulate summary rows in a list; DataFrame.append() was removed in
    # pandas 2.0 and was quadratic in the number of groups anyway
    summary_rows = []

    for idx, df_group in df.groupby(by=[util.SERVICE_ID]):

        # Copy dataframe and find last row
        df_meter = df_group.copy()
        row_n = df_meter.iloc[-1]

        # Remove data pertaining to old meters, if any
        df_meter = df_meter[df_group[util.METER_NUMBER] == row_n[
            util.METER_NUMBER]]

        # Determine whether meter readings look reasonable: each current
        # reading should equal the following row's prior reading
        sr_cur = df_meter[util.CURRENT_READING].iloc[:-1]
        sr_pri = df_meter[util.PRIOR_READING].shift(-1).iloc[:-1]
        sr_eq = sr_cur == sr_pri
        readings_look_good = False not in sr_eq.value_counts()

        # Look for wraps (meter rolled over: current reading < prior reading)
        if readings_look_good:
            sr_wrap = df_meter[util.CURRENT_READING] < df_meter[
                util.PRIOR_READING]
            vc_wrap = sr_wrap.value_counts()
            n_wraps = vc_wrap[True] if True in vc_wrap else 0
        else:
            n_wraps = 0

        # Calculate water usage hidden by wraps: each wrap spans one full
        # decade above the largest observed reading
        max_reading = df_meter[util.CURRENT_READING].max()
        wrap_extent = 10**(1 + int(math.log10(max_reading))) if (
            max_reading > 0) else 0
        usage_hidden_by_wraps = wrap_extent * n_wraps

        # Summarize readings in one row: last row, with the earliest prior
        # date/reading, adjusted for usage hidden by wraps
        row_0 = df_meter.iloc[0]
        row_n[util.PRIOR_DATE] = row_0[util.PRIOR_DATE]
        row_n[util.PRIOR_READING] = row_0[
            util.PRIOR_READING] - usage_hidden_by_wraps

        # Report meter status
        if readings_look_good:
            row_n[
                util.
                METER_STATUS] = util.METER_WRAP if n_wraps > 0 else util.METER_NORMAL
        else:
            row_n[util.METER_STATUS] = util.METER_ANOMALY

        # Save summary row
        summary_rows.append(row_n)

    # Build the summary dataframe in one shot, preserving input column order;
    # handle the empty-input edge case explicitly
    if summary_rows:
        df_summary = pd.DataFrame(summary_rows, columns=df.columns)
    else:
        df_summary = pd.DataFrame(columns=df.columns)

    util.report_elapsed_time(prefix='summarize() done: ', start_time=t)

    return df_summary
コード例 #2
0
ファイル: lookup.py プロジェクト: navkal/el
def flag_ownership(df):
    """Flag each resident row as homeowner and/or family of the owner.

    Fills null name columns with empty strings, then computes two boolean
    columns row by row: util.IS_HOMEOWNER (via is_homeowner) and
    util.IS_FAMILY (via is_family, which also consults the whole dataframe).

    Parameters:
        df: merged census/assessment dataframe; mutated in place.

    Returns:
        The same dataframe with util.IS_HOMEOWNER and util.IS_FAMILY added.
    """

    df[util.LAST_NAME] = df[util.LAST_NAME].fillna('')
    df[util.FIRST_NAME] = df[util.FIRST_NAME].fillna('')
    df[util.MIDDLE_NAME] = df[util.MIDDLE_NAME].fillna('')
    df[util.OWNER_1_NAME] = df[util.OWNER_1_NAME].fillna('')
    df[util.OWNER_2_NAME] = df[util.OWNER_2_NAME].fillna('')
    df[util.OWNER_3_NAME] = df[util.OWNER_3_NAME].fillna('')

    t = time.time()
    print('')
    print('util.IS_HOMEOWNER starting at',
          time.strftime('%H:%M:%S', time.localtime(t)))

    df[util.IS_HOMEOWNER] = df.apply(lambda row: is_homeowner(row), axis=1)

    # Use .get() so a dataframe with zero homeowners reports 0 instead of
    # raising KeyError on the missing True index label
    print('Found {0} homeowners'.format(
        df[util.IS_HOMEOWNER].value_counts().get(True, 0)))
    util.report_elapsed_time(prefix='util.IS_HOMEOWNER done: ', start_time=t)

    t = time.time()
    print('')
    print('util.IS_FAMILY starting at',
          time.strftime('%H:%M:%S', time.localtime(t)))

    df[util.IS_FAMILY] = df.apply(lambda row: is_family(row, df), axis=1)

    # Same guard: zero family members must not crash the report
    print('Found {0} family members'.format(
        df[util.IS_FAMILY].value_counts().get(True, 0)))
    util.report_elapsed_time(prefix='util.IS_FAMILY done: ', start_time=t)
    print('')

    return df
コード例 #3
0
ファイル: employees.py プロジェクト: navkal/el
    # NOTE(review): this excerpt starts mid-function -- the enclosing def and
    # the construction of df_merge, sr_dups, and df_census are above this view.

    # Ambiguous duplicate name matches: mark their census-derived fields as
    # unknown, then keep a single row per (last name, first name) pair
    df_merge.loc[ sr_dups == True, util.RESIDENT_ID ] = 'unknown'
    df_merge.loc[ sr_dups == True, util.VOTER_STATUS ] = 'unknown'
    df_merge = df_merge.drop_duplicates( subset=[util.LAST_NAME, util.FIRST_NAME] )

    # Report findings
    # n_v: active voters, n_e: employees, n_re: resident employees,
    # n_ve: employees who are active voters
    n_v = len( df_census[ df_census[util.VOTER_STATUS] == 'A' ] )
    n_e = len( df_merge )
    n_re = len( df_merge[ df_merge[util.RESIDENT_ID].notnull() ] )
    n_ve = len( df_merge[ df_merge[util.VOTER_STATUS] == 'A' ] )

    print( '' )
    print( 'Number of active voters: {0}'.format( n_v ) )
    print( 'Number of employees: {0}'.format( n_e ) )
    print( 'Number of resident employees: {0}'.format( n_re ) )
    print( 'Number of active voter employees: {0}'.format( n_ve ) )
    print( '' )
    # NOTE(review): the percentages below divide by n_e and n_v -- raises
    # ZeroDivisionError if either table is empty; presumably acceptable here
    print( '{0:.0f}% of town employees live in Andover.'.format( 100 * n_re / n_e ) )
    print( '{0:.0f}% of town employees are active voters in Andover.'.format( 100 * n_ve / n_e ) )
    print( 'Active voter employees represent {0:.0f}% of all Andover active voters.'.format( 100 * n_ve / n_v ) )

    print( '' )
    filename = '../analysis/employees.xlsx'
    print( 'Writing {0} rows to {1}'.format( len( df_merge ), filename ) )
    print( 'Columns: {0}'.format( list( df_merge.columns ) ) )

    # Write to spreadsheet
    df_merge.to_excel( filename, index=False )

    # Report elapsed time
    util.report_elapsed_time()
コード例 #4
0
ファイル: residents.py プロジェクト: navkal/el
    # NOTE(review): this excerpt starts mid-function -- df_residents, engine,
    # and the helper functions are defined above this view.

    # Fill in missing genders by heuristic, reporting counts before and after
    report_gender_findings()
    print( 'Guessing missing genders...' )
    df_residents[util.GENDER] = df_residents.apply( lambda row: guess_gender( row ), axis=1 )
    report_gender_findings()
    print( '' )

    # Calculate age
    df_residents[util.AGE] = df_residents.apply( lambda row: calculate_age( row ), axis=1 )

    # Insert zoning codes: left-join the zone lookup table on normalized
    # address, then let get_zoning_code() resolve each row's final value
    t = time.time()
    print( 'Inserting zoning codes, starting at {0}...'.format( time.strftime( '%H:%M:%S', time.localtime( t ) ) ) )
    df_zones = pd.read_sql_table( 'ZoneLookup', engine, columns=[util.NORMALIZED_STREET_NUMBER, util.LADDR_ALT_STREET_NUMBER, util.NORMALIZED_STREET_NAME, util.ZONING_CODE_1] )
    df_residents = pd.merge( df_residents, df_zones, how='left', on=[util.NORMALIZED_STREET_NUMBER, util.LADDR_ALT_STREET_NUMBER, util.NORMALIZED_STREET_NAME] )
    df_residents[util.ZONING_CODE_1] = df_residents.apply( lambda row: get_zoning_code( row ), axis=1 )
    util.report_elapsed_time( 'Inserted zoning codes -- ', t )
    print( '' )

    # Load primary elections dataframe, ordered by election date, with
    # party fields stripped of surrounding whitespace
    df_primary = pd.read_sql_table( 'ElectionModel_01', engine, parse_dates=True )
    df_primary = df_primary.sort_values( by=[util.ELECTION_DATE] )
    df_primary[util.PARTY_AFFILIATION] = df_primary[util.PARTY_AFFILIATION].str.strip()
    df_primary[util.PARTY_VOTED] = df_primary[util.PARTY_VOTED].str.strip()

    # Build dictionaries of unique primary election dates and corresponding column names
    # (date_name is the date portion of the timestamp string, e.g. '2020-03-03')
    dc_primary_dates = OrderedDict()
    dc_ballot_cols = {}
    for date in df_primary[util.ELECTION_DATE].unique():
        date_name = date.split()[0]
        dc_primary_dates[date_name] = '-'
        dc_ballot_cols[date_name] = 'primary_ballot_' + date_name
コード例 #5
0
ファイル: lookup.py プロジェクト: navkal/el
def drop_water_noise(df_merge):
    """Remove unreliable rows produced by the +Water merge.

    A (Resident ID, Parcel ID) pair correlated with multiple meter numbers is
    acceptable only when the parcel is known, no service type is duplicated,
    and the service types are exactly {Residential, Irrigation}. Otherwise one
    row is kept and its water-only columns are blanked, since we cannot tell
    which meter is correct.

    Parameters:
        df_merge: merged dataframe; reads module-level df_census,
            df_assessment_res, and df_water for column sets.

    Returns:
        The pruned dataframe.
    """

    t = time.time()
    print('')
    print('drop_water_noise() starting at',
          time.strftime('%H:%M:%S', time.localtime(t)))

    # Fill null cells
    df_merge[util.METER_NUMBER] = df_merge[util.METER_NUMBER].fillna(
        '').astype(str)

    bf_len = len(df_merge)

    # Get list of unique Resident/Parcel ID pairs from merged table, keyed by
    # a temporary 'pair' column of the form '<resident_id>,<parcel_id>'
    df_merge['pair'] = df_merge[util.RESIDENT_ID].str.cat(
        ',' + df_merge[util.PARCEL_ID])
    unique_pairs = df_merge['pair'].unique()

    # Establish columns to be preserved (census and assessment data is
    # trustworthy; only water-sourced columns may be cleared)
    preserve_columns = df_census.columns.union(df_assessment_res.columns)

    # Create set of service types for comparison
    res_irr = set(['Residential', 'Irrigation'])

    # Iterate over Resident IDs
    for pair in unique_pairs:

        # Extract all rows containing this Resident ID
        df_pair = df_merge[df_merge['pair'] == pair]

        # If there are multiple rows pertaining to current resident...
        if len(df_pair) > 1:

            # The +Water merge has correlated this resident/parcel pair with multiple Meter Numbers.

            # Get the Parcel ID
            parcel_id = df_pair.iloc[0][util.PARCEL_ID]

            # Determine whether any service types are duplicated among the rows
            vc = df_pair[util.SERVICE_TYPE].value_counts()
            dup_service_types = vc[vc > 1]

            # Determine how listed services compare to expected { Residential, Irrigation }
            vc_idx = set(vc.index)
            service_type_diff = vc_idx.symmetric_difference(res_irr)

            # If we don't have a Parcel ID, or we have a duplicated service type, or service types are not as expected:
            if (parcel_id == '') or (len(dup_service_types) >
                                     0) or (len(service_type_diff) > 0):
                #
                # We don't have enough information to determine which Meter Number is the correct one.
                # Rather than retain excess and misleading information in the database, we clear it out.
                #

                # Drop excess rows
                drop_index = df_pair.index.drop([df_pair.index[0]])
                df_merge = df_merge.drop(labels=drop_index)

                # Clear unreliable values
                for col_name in df_water.columns:
                    if col_name not in preserve_columns:
                        df_merge.at[df_pair.index[0], col_name] = ''

    # Remove the temporary helper column so it does not leak into the result
    df_merge = df_merge.drop(columns='pair')

    # Report outcome
    print('drop_water_noise() removed {0} rows'.format(bf_len - len(df_merge)))
    util.report_elapsed_time(prefix='drop_water_noise() done: ', start_time=t)

    return df_merge
コード例 #6
0
ファイル: lookup.py プロジェクト: navkal/el
def drop_assessment_noise(df_merge):
    """Resolve residents that the Census+Assessment merge tied to several parcels.

    For a resident appearing in multiple rows: keep only owned parcels if the
    resident owns any; otherwise keep only parcels where the resident is
    family; otherwise keep a single row and blank its assessment-only columns,
    since we cannot tell which parcel is correct.

    Parameters:
        df_merge: merged dataframe; reads module-level df_census and
            df_assessment_res for column sets.

    Returns:
        The pruned dataframe.
    """

    start = time.time()
    print('')
    print('drop_assessment_noise() starting at',
          time.strftime('%H:%M:%S', time.localtime(start)))

    # Replace nulls so Parcel ID comparisons behave as string comparisons
    df_merge[util.PARCEL_ID] = df_merge[util.PARCEL_ID].fillna('').astype(str)

    n_before = len(df_merge)

    # Process each distinct resident found in the merged table
    for res_id in df_merge[util.RESIDENT_ID].unique():

        # All rows mentioning this resident
        df_rows = df_merge[df_merge[util.RESIDENT_ID] == res_id]

        # A single row needs no disambiguation
        if len(df_rows) <= 1:
            continue

        # Rows representing parcels owned by this resident
        df_owned = df_rows[df_rows[util.IS_HOMEOWNER] == True]

        if len(df_owned) > 0:
            # Resident owns at least one parcel: discard the rows for
            # parcels the resident does not own, if there are any
            df_not_owned = df_rows[df_rows[util.IS_HOMEOWNER] == False]
            if len(df_not_owned) > 0:
                df_merge = df_merge.drop(labels=df_not_owned.index)
            continue

        # Resident owns nothing; fall back on family relationship
        df_family = df_rows[df_rows[util.IS_FAMILY] == True]

        if len(df_family) > 0:
            # Keep only the parcels where the resident is family
            df_not_family = df_rows[df_rows[util.IS_FAMILY] == False]
            if len(df_not_family) > 0:
                df_merge = df_merge.drop(labels=df_not_family.index)
            continue

        #
        # The Census+Assessment merge had correlated this resident with multiple parcels.
        # However, we don't have enough information to determine which is the correct one.
        # Rather than retain excess and misleading information in the database, we clear it out.
        #

        # Keep only the first row for this resident
        surplus_index = df_rows.index.drop([df_rows.index[0]])
        df_merge = df_merge.drop(labels=surplus_index)

        # Blank the assessment-only values on the surviving row
        for col_name in df_assessment_res.columns:
            if col_name not in df_census.columns:
                df_merge.loc[df_merge[util.RESIDENT_ID] == res_id,
                             col_name] = ''

    # Report outcome
    print('drop_assessment_noise() removed {0} rows'.format(n_before -
                                                            len(df_merge)))
    util.report_elapsed_time(prefix='drop_assessment_noise() done: ',
                             start_time=start)

    return df_merge