Beispiel #1
0
def crp_db_merge(file12, file14):
    """Merge the two CRP source exports and write them to ``sql_db/crp.csv``.

    Each semicolon-delimited file is read and cleaned the same way:

    * column names are lower-cased;
    * dashes are removed from ``registration_number``;
    * rows whose ``slr_document_account`` equals 4 or 6 are dropped;
    * the ``(no column name)`` column is renamed to ``total``;
    * the frame is passed through ``chassis_strip`` and
      ``registration_number_flag``.

    The cleaned frames are concatenated, and missing ``customer`` and
    ``registration_date_pse`` values are forward- then backward-filled within
    each ``registration_number`` group. ``registration_date_vhe`` is not
    filled.

    Args:
        file12: Path of the first CRP export.
        file14: Path of the second CRP export.

    Returns:
        None. The merged frame is handed to ``save_csv``.
    """
    print('Merging CRP DBs...')

    read_options = {
        'delimiter': ';',
        'dtype': {'Chassis_Number': str, 'Registration_Number': str, 'Customer': str},
        'parse_dates': ['Vehicle_In_Date', 'Registration_Date_PSE', 'Registration_Date_VHE'],
        'infer_datetime_format': True,
    }

    cleaned = []
    for path in (file12, file14):
        df = pd.read_csv(path, **read_options)
        df.columns = [col.lower() for col in df.columns]
        df['registration_number'] = df['registration_number'].str.replace('-', '', regex=False)

        account = df['slr_document_account']
        df.drop(df.index[(account == 4) | (account == 6)], axis=0, inplace=True)
        df.rename(columns={'(no column name)': 'total'}, inplace=True)

        df = chassis_strip(df)
        df = registration_number_flag(df)
        # Collect the *returned* frames: rebinding the loop variable alone
        # would silently discard the helpers' output if they return a copy.
        cleaned.append(df)

    df_crp_concat = pd.concat(cleaned, ignore_index=True)

    # transform() keeps the original row index, so the result aligns on
    # assignment regardless of the pandas version's group_keys behaviour.
    df_crp_grouped = df_crp_concat.groupby('registration_number')
    for column in ('customer', 'registration_date_pse'):
        df_crp_concat[column] = df_crp_grouped[column].transform(lambda values: values.ffill().bfill())

    save_csv(df_crp_concat, 'sql_db/' + 'crp.csv')
Beispiel #2
0
def crp_cm_merge(file9):
    """Left-join the cleaned CM data onto the CRP DB and save the result.

    Reads ``sql_db/crp.csv``, merges it with the frame returned by
    ``cm_crp_cleanup(file9)`` on ``chassis_number`` (left join), removes the
    right-hand duplicates of columns present in both frames and writes the
    outcome to ``sql_db/crp_merged.csv``.

    Args:
        file9: Input forwarded unchanged to ``cm_crp_cleanup``.

    Returns:
        None. The merged frame is handed to ``save_csv``.
    """
    print('Adding CRP CM DBs...')
    df = pd.read_csv('sql_db/' + 'crp.csv', index_col=0, parse_dates=['vehicle_in_date', 'registration_date_pse', 'registration_date_vhe'], infer_datetime_format=True)
    df_cm = cm_crp_cleanup(file9)

    df_merged = pd.merge(df, df_cm, how='left', on=['chassis_number'], suffixes=('', '_y'))

    # Overlapping right-hand columns are the ones that received the '_y'
    # suffix. Match the suffix itself rather than any '_y' substring, so
    # legitimate names such as 'cm_years' are never caught by accident.
    repeated_cols = [col for col in df_merged.columns if col.endswith('_y')]
    df_merged.drop(repeated_cols, axis=1, inplace=True)

    save_csv(df_merged, 'sql_db/' + 'crp_merged.csv')
Beispiel #3
0
def _flag_warranty_visits(visits, warranty_years):
    """Set the PSE/VHE warranty flags to 1, in place, for in-warranty visits.

    A visit counts as in-warranty when ``vehicle_in_date`` is strictly before
    the matching registration date plus ``365 * warranty_years`` days. Rows
    with a missing date compare as False and keep their current flag value.
    """
    warranty_span = np.timedelta64(365 * warranty_years, 'D')
    for suffix in ('pse', 'vhe'):
        in_warranty = visits['vehicle_in_date'] < (visits['registration_date_' + suffix] + warranty_span)
        visits.loc[in_warranty, 'warranty_visit_' + suffix] = 1


def db_creation(df, warranty_years_ca, warranty_years_crp):
    """Derive the visit flags and write ``output/db_customer_segmentation_short.csv``.

    Adds ``warranty_visit_pse``, ``warranty_visit_vhe``, ``contract_visit`` and
    ``abandoned`` to the data. The flags start at 0, are set to the string
    ``'NULL'`` where the dates they depend on are missing, and to 1 where the
    corresponding condition holds, so these columns hold mixed int/str values.

    Only rows with ``nlr_code`` 101 (handled with ``warranty_years_ca``) or 701
    (handled with ``warranty_years_crp``) reach the output. ``abandoned`` is
    replaced by the per-(customer, registration_number) result of
    ``time_last_visit``, joined back with an inner merge.

    Args:
        df: Input DataFrame. It is modified in place (new columns, sorted by
            ``vehicle_in_date``).
        warranty_years_ca: Warranty length in years for ``nlr_code`` 101 rows.
        warranty_years_crp: Warranty length in years for ``nlr_code`` 701 rows.

    Returns:
        None. The result is handed to ``save_csv``.
    """
    print('Creating DB fields...')
    start = time.time()
    current_datetime = datetime.now()

    df['warranty_visit_pse'], df['warranty_visit_vhe'], df['contract_visit'], df['abandoned'] = 0, 0, 0, 0
    df.sort_values(by=['vehicle_in_date'], inplace=True)
    df_grouped = df.groupby(['customer', 'registration_number'], as_index=False)

    ### Predefined values:
    df.loc[df['registration_date_pse'].isnull(), 'warranty_visit_pse'] = 'NULL'
    df.loc[df['registration_date_vhe'].isnull(), 'warranty_visit_vhe'] = 'NULL'
    df.loc[df['vehicle_in_date'].isnull(), ['warranty_visit_pse', 'warranty_visit_vhe', 'contract_visit', 'abandoned']] = 'NULL'
    df.loc[df['cm_date_end'].isnull(), 'contract_visit'] = 'NULL'
    print('1st step done at %.2f' % (time.time() - start), 'seconds')

    ### Warranty Visit?
    # Explicit copies: the flags are written on the subsets, and assigning
    # through .loc on a filtered slice triggers SettingWithCopyWarning.
    df_ca = df[df['nlr_code'] == 101].copy()
    _flag_warranty_visits(df_ca, warranty_years_ca)

    df_crp = df[df['nlr_code'] == 701].copy()
    _flag_warranty_visits(df_crp, warranty_years_crp)

    df_all = pd.concat([df_ca, df_crp])
    print('2nd step done at %.2f' % (time.time() - start), 'seconds')

    ### Contract Visit?
    under_contract = (
        (df_all['vehicle_in_date'] <= df_all['cm_date_end'])
        & (df_all['anos_viatura'] <= df_all['cm_years'])
        & (df_all['kms'] <= df_all['cm_km'])
    )
    df_all.loc[under_contract, 'contract_visit'] = 1
    print('3rd step done at %.2f' % (time.time() - start), 'seconds')

    ### Abandoned?
    # The per-group result arrives as an unnamed column (label 0); it replaces
    # the placeholder 'abandoned' column created above.
    abandoned_by_vehicle = df_grouped.apply(time_last_visit, current_datetime=current_datetime)
    df_all = df_all.merge(abandoned_by_vehicle.to_frame(), on=['customer', 'registration_number'])
    df_all.drop(['abandoned'], axis=1, inplace=True)
    df_all = df_all.rename(columns={0: 'abandoned'})
    print('4th step done at %.2f' % (time.time() - start), 'seconds')

    # NOTE: the 'regular_percentage_pse' / 'regular_percentage_vhe' fields
    # (regular_percent_pse / regular_percent_vhe applied per group and merged
    # back like 'abandoned') are currently disabled.

    save_csv(df_all, 'output/' + 'db_customer_segmentation_short.csv')
Beispiel #4
0
def db_concat():
    """Stack the merged CA and CRP databases and save them as ``sql_db/db.csv``.

    Both ``sql_db/ca_merged.csv`` and ``sql_db/crp_merged.csv`` are loaded with
    the same dtype and date-parsing settings, concatenated with a fresh index,
    reordered so that ``customer``, ``slr_document_account`` and
    ``registration_number`` come first, and sorted by those three columns.

    Returns:
        None. The combined frame is handed to ``save_csv``.
    """
    print('Concatenating CA and CRP DBs...')

    # NOTE(review): the dtype map names 'slr_account' while the ordering below
    # uses 'slr_document_account' -- confirm which column name is intended.
    column_types = {'nlr_code': int, 'slr_account': str, 'kms': int}
    date_columns = ['vehicle_in_date', 'registration_date_pse', 'registration_date_vhe', 'cm_date_start', 'cm_date_end']

    frames = []
    for csv_name in ('ca_merged.csv', 'crp_merged.csv'):
        frame = pd.read_csv(
            'sql_db/' + csv_name,
            index_col=0,
            dtype=column_types,
            parse_dates=date_columns,
            infer_datetime_format=True,
        )
        frames.append(frame)

    combined = pd.concat(frames, ignore_index=True, sort=False)

    # Key columns lead; every other column keeps its relative position.
    leading = ['customer', 'slr_document_account', 'registration_number']
    trailing = [name for name in combined.columns if name not in leading]
    ordered = combined[leading + trailing].sort_values(leading)

    save_csv(ordered, 'sql_db/' + 'db.csv')
Beispiel #5
0
def ca_db_merge(file11, file13):
    """Merge the two CA source exports and write them to ``sql_db/ca.csv``.

    Each semicolon-delimited, UTF-8 file is read and cleaned the same way:

    * column names are lower-cased;
    * dashes are removed from ``registration_number``;
    * rows whose ``slr_document_account`` equals 4 or 6 are dropped;
    * the ``(no column name)`` column is renamed to ``total``;
    * the frame is passed through ``chassis_strip`` and
      ``registration_number_flag``.

    The cleaned frames are concatenated, and missing ``customer`` and
    ``registration_date_pse`` values are forward- then backward-filled within
    each ``registration_number`` group. ``registration_date_vhe`` is not
    filled.

    Args:
        file11: Path of the first CA export.
        file13: Path of the second CA export.

    Returns:
        None. The merged frame is handed to ``save_csv``.
    """
    print('Merging CA DBs...')

    date_columns = ['Vehicle_In_Date', 'Registration_Date_PSE', 'Registration_Date_VHE']
    # NOTE(review): only the first file forces 'Customer' to str -- confirm
    # whether the second file should do the same.
    sources = (
        (file11, {'Chassis_Number': str, 'Registration_Number': str, 'Customer': str}),
        (file13, {'Chassis_Number': str, 'Registration_Number': str}),
    )

    cleaned = []
    for path, column_types in sources:
        df = pd.read_csv(path, delimiter=';', encoding='utf-8', dtype=column_types, parse_dates=date_columns, infer_datetime_format=True)
        df.columns = [col.lower() for col in df.columns]
        df['registration_number'] = df['registration_number'].str.replace('-', '', regex=False)

        account = df['slr_document_account']
        df.drop(df.index[(account == 4) | (account == 6)], axis=0, inplace=True)
        df.rename(columns={'(no column name)': 'total'}, inplace=True)

        df = chassis_strip(df)
        df = registration_number_flag(df)
        # Collect the *returned* frames: rebinding the loop variable alone
        # would silently discard the helpers' output if they return a copy.
        cleaned.append(df)

    df_ca_concat = pd.concat(cleaned, ignore_index=True)

    df_ca_grouped = df_ca_concat.groupby(['registration_number'])
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    for column in ('customer', 'registration_date_pse'):
        df_ca_concat[column] = df_ca_grouped[column].transform(lambda values: values.ffill().bfill())

    save_csv(df_ca_concat, 'sql_db/' + 'ca.csv')
Beispiel #6
0
def main():
    """Build the stock database from ``sql_db/ENCOMENDA.csv`` and save it.

    The order file is loaded (plus an optional second file when
    ``new_data_check`` is truthy), run through ``db_creation``,
    ``db_color_replacement``, ``db_score_calculation`` and
    ``db_duplicate_removal`` in that order, and then written twice: a
    column subset for stock optimization and the full frame.
    """
    # NOTE(review): db_creation is called with a single argument here --
    # confirm this matches the db_creation definition that is in scope.
    start = time.time()
    print('Creating DB...')

    input_file = 'sql_db/' + 'ENCOMENDA.csv'
    input_file_2 = ''
    full_db = 'output/' + 'db_full_baviera.csv'
    stock_opt_db = 'output/' + 'db_baviera_stock_optimization.csv'

    new_data_check = 0

    def load_orders(path):
        # Both inputs share the same layout: ';' separated, ',' as decimal mark.
        return pd.read_csv(
            path,
            delimiter=';',
            parse_dates=['Data Compra', 'Data Venda'],
            infer_datetime_format=True,
            decimal=',',
        )

    df = load_orders(input_file)
    if new_data_check:
        df = pd.concat([df, load_orders(input_file_2)], axis=0)

    # Each stage consumes the frame produced by the previous one.
    df_final = df
    for stage in (db_creation, db_color_replacement, db_score_calculation, db_duplicate_removal):
        df_final = stage(df_final)

    sel_cols = [
        'Modelo', 'Local da Venda', 'Prov', 'Cor_Interior', 'Cor_Exterior',
        'Navegação', 'Sensores', 'Caixa Auto', 'Jantes', 'buy_day',
        'buy_month', 'buy_year', 'sell_day', 'sell_month', 'sell_year',
        'price_total', 'stock_days', 'Margem', 'margem_percentagem',
    ]
    save_csv(df_final[sel_cols], stock_opt_db)
    save_csv(df_final, full_db)

    elapsed = time.time() - start
    print('Runnning time: %.2f' % elapsed)