def crp_db_merge(file12, file14):
    """Merge the two CRP source extracts and save them as ``sql_db/crp.csv``.

    Each input is read as a ``;``-delimited CSV. For every frame the column
    names are lower-cased, dashes are removed from ``registration_number``,
    rows whose ``slr_document_account`` is 4 or 6 are dropped and the
    ``(no column name)`` column is renamed to ``total``. The frame is then
    passed through ``chassis_strip`` and ``registration_number_flag``.

    After concatenation, missing ``customer`` and ``registration_date_pse``
    values are filled forwards and then backwards inside each
    ``registration_number`` group.

    Args:
        file12: First CRP extract, handed to ``pd.read_csv``.
        file14: Second CRP extract, handed to ``pd.read_csv``.

    Returns:
        None. The merged frame is written through ``save_csv``.
    """
    print('Merging CRP DBs...')

    read_options = dict(
        delimiter=';',
        dtype={'Chassis_Number': str, 'Registration_Number': str, 'Customer': str},
        parse_dates=['Vehicle_In_Date', 'Registration_Date_PSE', 'Registration_Date_VHE'],
        infer_datetime_format=True,
    )

    frames = []
    for source in (file12, file14):
        df = pd.read_csv(source, **read_options)
        df.columns = [column.lower() for column in df.columns]
        df['registration_number'] = df['registration_number'].str.replace('-', '')
        # Document accounts 4 and 6 are excluded from the merged table.
        df.drop(df[df['slr_document_account'].isin([4, 6])].index, axis=0, inplace=True)
        df.rename(columns={'(no column name)': 'total'}, inplace=True)
        # Keep what the helpers return: rebinding the loop variable alone
        # would lose their result unless they happen to mutate in place.
        df = chassis_strip(df)
        df = registration_number_flag(df)
        frames.append(df)

    df_crp_concat = pd.concat(frames, ignore_index=True)
    df_crp_grouped = df_crp_concat.groupby('registration_number')

    # transform() returns a result aligned with the original row index, so
    # it can be assigned straight back (apply() may prepend the group key).
    # registration_date_vhe is deliberately left unfilled, as before.
    for column in ('customer', 'registration_date_pse'):
        df_crp_concat[column] = df_crp_grouped[column].transform(
            lambda values: values.ffill().bfill()
        )

    save_csv(df_crp_concat, 'sql_db/' + 'crp.csv')
def crp_cm_merge(file9):
    """Attach the CRP contract data to the CRP table and save the result.

    Reads ``sql_db/crp.csv``, left-joins it on ``chassis_number`` with the
    frame returned by ``cm_crp_cleanup(file9)`` and writes the outcome to
    ``sql_db/crp_merged.csv``. Columns that exist on both sides keep the
    left-hand version; the right-hand duplicates (suffixed ``_y`` by the
    merge) are discarded.

    Args:
        file9: Input forwarded unchanged to ``cm_crp_cleanup``.

    Returns:
        None. The merged frame is written through ``save_csv``.
    """
    print('Adding CRP CM DBs...')

    df = pd.read_csv(
        'sql_db/' + 'crp.csv',
        index_col=0,
        parse_dates=['vehicle_in_date', 'registration_date_pse', 'registration_date_vhe'],
        infer_datetime_format=True,
    )
    df_cm = cm_crp_cleanup(file9)

    df_merged = pd.merge(df, df_cm, how='left', on=['chassis_number'], suffixes=('', '_y'))

    # Only names that END in the merge suffix are duplicates. A substring
    # test would also hit legitimate columns such as 'cm_years'.
    duplicated_right_cols = [name for name in df_merged.columns if name.endswith('_y')]
    df_merged.drop(duplicated_right_cols, axis=1, inplace=True)

    save_csv(df_merged, 'sql_db/' + 'crp_merged.csv')
def db_creation(df, warranty_years_ca, warranty_years_crp):
    """Derive the visit flags and write the customer-segmentation table.

    Adds ``warranty_visit_pse``, ``warranty_visit_vhe``, ``contract_visit``
    and ``abandoned`` to ``df`` (initialised to 0, or the string ``'NULL'``
    where the dates needed to evaluate them are missing), then:

    * marks warranty visits: ``vehicle_in_date`` earlier than the
      registration date plus ``365 * warranty_years`` days, evaluated
      separately for ``nlr_code`` 101 and 701;
    * marks contract visits: ``vehicle_in_date``, ``anos_viatura`` and
      ``kms`` all within ``cm_date_end``, ``cm_years`` and ``cm_km``;
    * replaces ``abandoned`` with the result of ``time_last_visit`` applied
      per ``(customer, registration_number)`` group.

    The result is saved to ``output/db_customer_segmentation_short.csv``.

    Note:
        ``df`` is modified in place (new columns, sorted by
        ``vehicle_in_date``). Rows whose ``nlr_code`` is neither 101 nor 701
        are not part of the saved output.

    Args:
        df: Visit-level DataFrame with the columns referenced above.
        warranty_years_ca: Warranty length in years for ``nlr_code`` 101.
        warranty_years_crp: Warranty length in years for ``nlr_code`` 701.

    Returns:
        None. The result is written through ``save_csv``.
    """
    print('Creating DB fields...')
    start = time.time()
    current_datetime = datetime.now()

    flag_columns = ['warranty_visit_pse', 'warranty_visit_vhe', 'contract_visit', 'abandoned']
    for column in flag_columns:
        df[column] = 0

    df.sort_values(by=['vehicle_in_date'], inplace=True)
    df_grouped = df.groupby(['customer', 'registration_number'], as_index=False)

    ### Predefined values: flags that cannot be evaluated are set to 'NULL'.
    df.loc[df['registration_date_pse'].isnull(), 'warranty_visit_pse'] = 'NULL'
    df.loc[df['registration_date_vhe'].isnull(), 'warranty_visit_vhe'] = 'NULL'
    df.loc[df['vehicle_in_date'].isnull(), flag_columns] = 'NULL'
    df.loc[df['cm_date_end'].isnull(), 'contract_visit'] = 'NULL'
    print('1st step done at %.2f' % (time.time() - start), 'seconds')

    ### Warranty Visit?
    subsets = []
    for nlr_code, warranty_years in ((101, warranty_years_ca), (701, warranty_years_crp)):
        # Explicit copy: the flags below are written on this subset, and
        # assigning into a bare filtered slice raises SettingWithCopyWarning.
        subset = df[df['nlr_code'] == nlr_code].copy()
        warranty_span = np.timedelta64(365 * warranty_years, 'D')
        for suffix in ('pse', 'vhe'):
            in_warranty = subset['vehicle_in_date'] < (subset['registration_date_' + suffix] + warranty_span)
            subset.loc[in_warranty, 'warranty_visit_' + suffix] = 1
        subsets.append(subset)
    df_all = pd.concat(subsets)
    print('2nd step done at %.2f' % (time.time() - start), 'seconds')

    ### Contract Visit?
    within_contract = (
        (df_all['vehicle_in_date'] <= df_all['cm_date_end'])
        & (df_all['anos_viatura'] <= df_all['cm_years'])
        & (df_all['kms'] <= df_all['cm_km'])
    )
    df_all.loc[within_contract, 'contract_visit'] = 1
    print('3rd step done at %.2f' % (time.time() - start), 'seconds')

    ### Abandoned?
    # The per-group result arrives as column 0 after to_frame(); it replaces
    # the placeholder 'abandoned' column created at the top.
    last_visit = df_grouped.apply(time_last_visit, current_datetime=current_datetime)
    df_all = df_all.merge(last_visit.to_frame(), on=['customer', 'registration_number'])
    df_all.drop(['abandoned'], axis=1, inplace=True)
    df_all = df_all.rename(columns={0: 'abandoned'})
    print('4th step done at %.2f' % (time.time() - start), 'seconds')

    save_csv(df_all, 'output/' + 'db_customer_segmentation_short.csv')
def db_concat():
    """Stack the merged CA and CRP tables and save them as ``sql_db/db.csv``.

    Both ``sql_db/ca_merged.csv`` and ``sql_db/crp_merged.csv`` are read with
    the same dtype and date-parsing options, concatenated (CA rows first,
    fresh index, column order not re-sorted), and then arranged so that
    ``customer``, ``slr_document_account`` and ``registration_number`` are
    the leading columns. Rows are sorted by those three columns.

    Returns:
        None. The combined frame is written through ``save_csv``.
    """
    print('Concatenating CA and CRP DBs...')

    read_kwargs = {
        'index_col': 0,
        'dtype': {'nlr_code': int, 'slr_account': str, 'kms': int},
        'parse_dates': [
            'vehicle_in_date',
            'registration_date_pse',
            'registration_date_vhe',
            'cm_date_start',
            'cm_date_end',
        ],
        'infer_datetime_format': True,
    }
    merged_tables = [
        pd.read_csv(path, **read_kwargs)
        for path in ('sql_db/' + 'ca_merged.csv', 'sql_db/' + 'crp_merged.csv')
    ]
    combined = pd.concat(merged_tables, ignore_index=True, sort=False)

    # Identification columns go first; everything else keeps its order.
    leading = ['customer', 'slr_document_account', 'registration_number']
    trailing = [name for name in combined.columns if name not in leading]
    ordered = combined[leading + trailing].sort_values(leading)

    save_csv(ordered, 'sql_db/' + 'db.csv')
def ca_db_merge(file11, file13):
    """Merge the two CA source extracts and save them as ``sql_db/ca.csv``.

    Each input is read as a ``;``-delimited, UTF-8 CSV. For every frame the
    column names are lower-cased, dashes are removed from
    ``registration_number``, rows whose ``slr_document_account`` is 4 or 6
    are dropped and the ``(no column name)`` column is renamed to ``total``.
    The frame is then passed through ``chassis_strip`` and
    ``registration_number_flag``.

    After concatenation, missing ``customer`` and ``registration_date_pse``
    values are filled forwards and then backwards inside each
    ``registration_number`` group.

    Args:
        file11: First CA extract, handed to ``pd.read_csv``.
        file13: Second CA extract, handed to ``pd.read_csv``.

    Returns:
        None. The merged frame is written through ``save_csv``.
    """
    print('Merging CA DBs...')

    date_columns = ['Vehicle_In_Date', 'Registration_Date_PSE', 'Registration_Date_VHE']
    df_ca = pd.read_csv(
        file11,
        delimiter=';',
        encoding='utf-8',
        dtype={'Chassis_Number': str, 'Registration_Number': str, 'Customer': str},
        parse_dates=date_columns,
        infer_datetime_format=True,
    )
    # NOTE(review): unlike file11, 'Customer' is not forced to str here;
    # confirm whether that difference is intentional.
    df_dw_ca = pd.read_csv(
        file13,
        delimiter=';',
        encoding='utf-8',
        dtype={'Chassis_Number': str, 'Registration_Number': str},
        parse_dates=date_columns,
        infer_datetime_format=True,
    )

    frames = []
    for df in (df_ca, df_dw_ca):
        df.columns = [column.lower() for column in df.columns]
        df['registration_number'] = df['registration_number'].str.replace('-', '')
        # Document accounts 4 and 6 are excluded from the merged table.
        df.drop(df[df['slr_document_account'].isin([4, 6])].index, axis=0, inplace=True)
        df.rename(columns={'(no column name)': 'total'}, inplace=True)
        # Keep what the helpers return: rebinding the loop variable alone
        # would lose their result unless they happen to mutate in place.
        df = chassis_strip(df)
        df = registration_number_flag(df)
        frames.append(df)

    df_ca_concat = pd.concat(frames, ignore_index=True)
    df_ca_grouped = df_ca_concat.groupby(['registration_number'])

    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    # registration_date_vhe is deliberately left unfilled, as before.
    for column in ('customer', 'registration_date_pse'):
        df_ca_concat[column] = df_ca_grouped[column].transform(
            lambda values: values.ffill().bfill()
        )

    save_csv(df_ca_concat, 'sql_db/' + 'ca.csv')
def main():
    """Build the stock-optimisation tables from the order extract.

    Reads ``sql_db/ENCOMENDA.csv`` (``;``-delimited, ``,`` as decimal mark,
    purchase and sale dates parsed), optionally appends a second extract,
    then runs the frame through ``db_creation``, ``db_color_replacement``,
    ``db_score_calculation`` and ``db_duplicate_removal`` in that order.
    A column subset is saved to
    ``output/db_baviera_stock_optimization.csv`` and the full frame to
    ``output/db_full_baviera.csv``. The elapsed time is printed at the end.
    """
    start = time.time()
    print('Creating DB...')

    input_file = 'sql_db/' + 'ENCOMENDA.csv'
    input_file_2 = ''
    full_db = 'output/' + 'db_full_baviera.csv'
    stock_opt_db = 'output/' + 'db_baviera_stock_optimization.csv'
    # Switch to a truthy value to append the extract at input_file_2.
    new_data_check = 0

    csv_options = {
        'delimiter': ';',
        'parse_dates': ['Data Compra', 'Data Venda'],
        'infer_datetime_format': True,
        'decimal': ',',
    }
    df = pd.read_csv(input_file, **csv_options)
    if new_data_check:
        extra = pd.read_csv(input_file_2, **csv_options)
        df = pd.concat([df, extra], axis=0)

    # NOTE(review): db_creation is called with a single argument and its
    # return value is used; confirm it resolves to a definition with that
    # signature.
    df_initial = db_creation(df)
    df_second_step = db_color_replacement(df_initial)
    df_third_step = db_score_calculation(df_second_step)
    df_final = db_duplicate_removal(df_third_step)

    sel_cols = [
        'Modelo', 'Local da Venda', 'Prov', 'Cor_Interior', 'Cor_Exterior',
        'Navegação', 'Sensores', 'Caixa Auto', 'Jantes',
        'buy_day', 'buy_month', 'buy_year',
        'sell_day', 'sell_month', 'sell_year',
        'price_total', 'stock_days', 'Margem', 'margem_percentagem',
    ]

    save_csv(df_final[sel_cols], stock_opt_db)
    save_csv(df_final, full_db)

    elapsed = time.time() - start
    print('Runnning time: %.2f' % elapsed)