Ejemplo n.º 1
0
def preprocess(sfdf, repdf, keys):
    """Normalise both frames in place and derive composite key columns.

    Each frame has its name, e-mail, state, city and CRD columns passed
    through the ``clean`` helper and its postal-code and phone columns
    passed through the ``phonenumbers`` helper (both defined outside this
    block); the results are written back with ``DataFrame.update``.

    For every entry of ``keys`` the last element is ignored here
    (presumably it is not a column name -- confirm against the caller).
    When two or more fields remain, a new column is added whose name is
    the upper-case letters of those field names joined together and whose
    values are the row-wise concatenation of the fields as strings.
    Otherwise only the first element of the entry is recorded.

    Args:
        sfdf: first DataFrame; modified in place.
        repdf: second DataFrame; modified in place.
        keys: iterable of sequences of column names plus one trailing
            element.

    Returns:
        Tuple ``(sfdf, repdf, key_list)`` where ``key_list`` holds the
        distinct key column names in first-seen order.
    """
    # Column -> normaliser, in the order the columns are processed.
    normalisers = (
        ('FirstName', clean),
        ('LastName', clean),
        ('Email', clean),
        ('MailingState', clean),
        ('MailingPostalCode', phonenumbers),
        ('MailingCity', clean),
        ('Phone', phonenumbers),
    )
    for frame in (sfdf, repdf):
        for column, normalise in normalisers:
            # Attribute-style access is kept so a missing column fails the
            # same way as before.
            frame.update(normalise(getattr(frame, column)))
        # CRD is cast to str before cleaning.
        frame.update(clean(frame.CRD__c.astype(str)))

    key_list = []
    for frame in (sfdf, repdf):
        for key in keys:
            fields = key[:-1]
            if len(fields) <= 1:
                # Single-field key: just remember the field name.
                single = key[0]
                if single not in key_list:
                    key_list.append(single)
                continue

            # Composite column name = capitals of every field name.
            key_col = ''.join(
                ch for name in fields for ch in name if ch.isupper())
            if key_col not in key_list:
                key_list.append(key_col)
            # Adding the string columns together concatenates them per row.
            joined = np.add.reduce(frame[fields].astype(str), axis=1)
            frame[key_col] = pd.Series(joined)
    return sfdf, repdf, key_list
Ejemplo n.º 2
0
    def test_clean_phonenumbers(self):
        """``phonenumbers`` output must match the expected series exactly."""
        # (raw value, expected cleaned value) pairs; NaN must stay NaN.
        cases = [
            (np.nan, np.nan),
            ('0033612345678', '0033612345678'),
            ('+1 201 123 4567', '+12011234567'),
            ('+336-123 45678', '+33612345678'),
        ]
        raw, wanted = (pd.Series(list(column)) for column in zip(*cases))

        # Series must be identical (values, dtype and index).
        pdt.assert_series_equal(phonenumbers(raw), wanted)
Ejemplo n.º 3
0
def clean_data(df):
    """Return a filtered copy of ``df`` with extra ``*_clean`` columns.

    Steps:
      1. Cells that are empty or whitespace-only become NaN.
      2. Rows whose ``customer_id`` is NaN are dropped.
      3. ``phone`` is passed through the ``phonenumbers`` helper and the
         name/address columns through the ``clean`` helper (both defined
         outside this block); for ``zip`` only the part before the first
         ``-`` is cleaned.

    Args:
        df: DataFrame with at least the columns ``customer_id``, ``phone``,
            ``first_name``, ``last_name``, ``address``, ``city``, ``state``
            and ``zip``. It is not modified.

    Returns:
        A new DataFrame containing the surviving rows plus the columns
        ``phone_clean``, ``first_name_clean``, ``last_name_clean``,
        ``address_clean``, ``city_clean``, ``state_clean`` and
        ``zip_clean``.
    """
    df = df.replace(r'^\s*$', np.nan, regex=True)
    # Take an explicit copy of the filtered rows: assigning new columns to
    # the result of a boolean-mask selection otherwise triggers pandas'
    # SettingWithCopyWarning.
    df = df[df['customer_id'].notna()].copy()

    df['phone_clean'] = phonenumbers(df['phone'])
    for column in ('first_name', 'last_name', 'address', 'city', 'state'):
        df[column + '_clean'] = clean(df[column])
    # Keep only the part of the zip before the first '-'.
    df['zip_clean'] = clean(df['zip'].str.split('-').str[0])
    return df
def _preprocess_frame(df):
    """Apply the shared preprocessing pipeline to one frame; return a new one."""
    # Index by the id column and turn empty cells into NaN.
    df = df.set_index('id')
    df = df.replace("", np.nan)

    # Country, locality and region are not used downstream of this step.
    df = df.drop(['country', 'locality', 'region'], axis=1)

    # Assign whole columns with df[col] = ... rather than df.loc[:, col] = ...
    # Since pandas 2.0 the .loc form writes into the existing column in
    # place, so an object-dtype column would stay object and the numeric
    # conversion would not change the dtype.
    df['phone'] = pd.to_numeric(phonenumbers(df['phone']))
    df['postal_code'] = pd.to_numeric(df['postal_code'])

    for column in ('street_address', 'website'):
        df[column] = clean(df[column])

    # Replace NaN with 0 in the numeric columns.
    numeric_columns = ['latitude', 'longitude', 'phone', 'postal_code']
    df[numeric_columns] = df[numeric_columns].replace(np.nan, 0)
    return df


def preprocess(df1, df2):
    """Preprocess two frames with identical steps.

    For each frame: set ``id`` as the index, replace empty strings with
    NaN, drop ``country``/``locality``/``region``, pass ``phone`` through
    the ``phonenumbers`` helper and convert it and ``postal_code`` with
    ``pd.to_numeric``, pass ``street_address`` and ``website`` through the
    ``clean`` helper, and finally replace NaN with 0 in ``latitude``,
    ``longitude``, ``phone`` and ``postal_code``. ``clean`` and
    ``phonenumbers`` are defined outside this block.

    Args:
        df1: first DataFrame; not modified.
        df2: second DataFrame; not modified.

    Returns:
        Tuple ``(df1, df2)`` of the preprocessed frames.

    Raises:
        ValueError: propagated from ``pd.to_numeric`` when a value cannot
            be parsed as a number.
    """
    return _preprocess_frame(df1), _preprocess_frame(df2)
Ejemplo n.º 5
0
def preprocess(sfdf, repdf):
    """Normalise both frames in place and register key columns globally.

    Each frame has its name, e-mail, state, city and CRD columns passed
    through the ``clean`` helper and its zip and phone columns passed
    through the ``phonenumbers`` helper (both defined outside this block);
    results are written back with ``DataFrame.update``.

    The module-level ``keys`` is then walked once per frame. An entry with
    more than one field adds a column named after the upper-case letters
    of its field names, holding the row-wise string concatenation of those
    fields. An entry with a single field only has that field name
    recorded. Names are appended to the module-level ``key_list`` if not
    already present.

    Args:
        sfdf: first DataFrame; modified in place.
        repdf: second DataFrame; modified in place.

    Returns:
        None. Results are visible through the mutated frames and the
        global ``key_list``.
    """
    print('enter PREPROCESS')

    global key_list, keys

    # Column -> normaliser, in the order the columns are processed.
    normalisers = (
        ('FirstName', clean),
        ('LastName', clean),
        ('Email', clean),
        ('State', clean),
        ('Zip', phonenumbers),
        ('City', clean),
        ('Phone', phonenumbers),
    )
    for frame in (sfdf, repdf):
        for column, normalise in normalisers:
            # Attribute-style access is kept so a missing column fails the
            # same way as before.
            frame.update(normalise(getattr(frame, column)))
        # CRD is cast to str before cleaning.
        frame.update(clean(frame.CRD.astype(str)))

    # Key generation.
    for frame in (sfdf, repdf):
        for key in keys:
            if len(key) <= 1:
                # Single-field key: just remember the field name.
                single = key[0]
                if single not in key_list:
                    key_list.append(single)
                continue

            # Composite column name = capitals of every field name.
            key_col = ''.join(
                ch for name in key for ch in name if ch.isupper())
            if key_col not in key_list:
                key_list.append(key_col)
            # Adding the string columns together concatenates them per row.
            joined = np.add.reduce(frame[key].astype(str), axis=1)
            frame[key_col] = pd.Series(joined)
    print('exit PREPROCESS')