def get_data(options_file_in):
    """Fetch the final results table and the goals table from the MLG production DB.

    Args:
        options_file_in: project options module carrying the DSN and SQL metadata.

    Returns:
        Tuple ``(df, df_goals)`` of the two retrieved DataFrames.
    """
    sql_info = options_file_in.sql_info
    df = level_1_a_data_acquisition.sql_retrieve_df(
        options_file_in.DSN_MLG_PRD,
        sql_info['database_final'],
        sql_info['final_table'],
        options_file_in)
    df_goals = level_1_a_data_acquisition.sql_retrieve_df(
        options_file_in.DSN_MLG_PRD,
        sql_info['database_final'],
        sql_info['goals_table'],
        options_file_in)
    return df, df_goals
def get_data_non_cached(options_file_in, classification_flag):
    """Retrieve and clean the commercial-version matching table (no caching).

    Filters the source table by ``Classification_Flag``, lowercases the model
    descriptions, drops rows lacking an old commercial-version description,
    removes duplicate matches and fills missing new descriptions with ' '.
    """
    matches = level_1_a_data_acquisition.sql_retrieve_df(
        options_file_in.DSN_SRV3_PRD,
        options_file_in.sql_info['database_source'],
        options_file_in.sql_info['commercial_version_matching'],
        options_file_in,
        query_filters={'Classification_Flag': classification_flag})
    matches['PT_PDB_Model_Desc'] = matches['PT_PDB_Model_Desc'].str.lower()
    matches.dropna(subset=['PT_PDB_Commercial_Version_Desc_Old'], inplace=True)
    # Removes duplicate matching rows, even the ones without a corresponding Gama Viva.
    # There is however a case where the same Gama Morta has two matches: null and a
    # corresponding Gama Viva - 1.4 TGDi DCT Style MY19'5 + TA for model i30 SW
    dedup_columns = ['PT_PDB_Model_Desc', 'PT_PDB_Commercial_Version_Desc_Old', 'PT_PDB_Commercial_Version_Desc_New']
    matches.drop_duplicates(subset=dedup_columns, inplace=True)
    matches['PT_PDB_Commercial_Version_Desc_New'].fillna(' ', inplace=True)
    return matches
def data_acquisition(options_file_in):
    """Load the source table, preferring a local CSV cache over SQL.

    Tries to read the cached CSV at ``options_file_in.temp_file_loc``; on a
    cache miss, retrieves the table from SQL and writes the cache for next time.

    Args:
        options_file_in: project options module carrying DSN, SQL metadata and
            the temp-file location.

    Returns:
        The DataFrame sorted by Movement_Date and WIP_Number (NaNs first).
    """
    try:
        df = pd.read_csv(options_file_in.temp_file_loc)
    except FileNotFoundError:
        print('File not found, retrieving from SQL...')
        # Fix: previously referenced the module-level `options_file` instead of the
        # `options_file_in` parameter, so the passed-in configuration was ignored.
        df = sql_retrieve_df(options_file_in.DSN_MLG_DEV, options_file_in.sql_info['database_source'], options_file_in.sql_info['source_table'], options_file_in)
        df.to_csv(options_file_in.temp_file_loc, index=False)
    return df.sort_values(by=['Movement_Date', 'WIP_Number'], na_position='first')
def data_acquisition(input_file, query_filters, local=0):
    """Section A: load the Baviera optionals dataset from a local CSV or from SQL.

    Args:
        input_file: path to the local CSV (used when ``local`` is truthy).
        query_filters: SQL filters forwarded to ``sql_retrieve_df``.
        local: when truthy read the CSV; otherwise query the MLG production DB.

    Returns:
        The raw DataFrame with columns renamed to code names and
        Purchase_Date/Sell_Date parsed as datetimes.
    """
    performance_info_append(time.time(), 'Section_A_Start')
    log_record('Início Secção A...', project_id)
    if local:
        df = _read_local_optionals_csv(input_file)
    else:
        df = sql_retrieve_df(
            level_2_optionals_baviera_options.DSN_MLG_PRD,
            level_2_optionals_baviera_options.sql_info['database'],
            level_2_optionals_baviera_options.sql_info['initial_table'],
            level_2_optionals_baviera_options,
            list(level_2_optionals_baviera_options.sql_to_code_renaming.keys()),
            query_filters,
            column_renaming=1,
            parse_dates=['Purchase_Date', 'Sell_Date'])
    project_units_count_checkup(df, 'Nº Stock', level_2_optionals_baviera_options, sql_check=0)
    log_record('Fim Secção A.', project_id)
    performance_info_append(time.time(), 'Section_A_End')
    return df


def _read_local_optionals_csv(input_file):
    """Read the local CSV, falling back from utf-8 to latin-1, then rename SQL columns to code names.

    Extracted to remove the duplicated read/rename sequence that previously
    appeared in both the ``try`` and ``except UnicodeDecodeError`` branches.
    """
    renaming = level_2_optionals_baviera_options.sql_to_code_renaming
    try:
        df = read_csv(input_file, encoding='utf-8', parse_dates=['Purchase_Date', 'Sell_Date'],
                      usecols=renaming.keys(), infer_datetime_format=True, decimal='.')
    except UnicodeDecodeError:
        # Some exports are latin-1 encoded; retry with the same read parameters.
        df = read_csv(input_file, encoding='latin-1', parse_dates=['Purchase_Date', 'Sell_Date'],
                      usecols=renaming.keys(), infer_datetime_format=True, decimal='.')
    column_rename(df, list(renaming.keys()), list(renaming.values()))
    return df
def data_acquisition(input_files, query_filters, local=0):
    """Section A: load facts, durations, clients, PBI categories, manual
    classifications and keywords, plus the keyword->group mappings.

    When ``local`` is truthy, the six datasets are read from local CSV caches;
    otherwise they are retrieved from SQL and cached via ``save_csv``.

    Args:
        input_files: sequence of six local CSV paths (facts, facts durations,
            clients, PBI categories, manual classifications, keywords).
        query_filters: sequence of SQL filter dicts; [0] filters facts,
            [1] filters both durations and PBI categories.
        local: when truthy read the CSV caches instead of querying SQL.

    Returns:
        Tuple (df_facts, df_facts_duration, df_clients, df_pbi_categories,
        df_manual_classifications, keywords_df, keyword_dict, ranking_dict).
    """
    performance_info_append(time.time(), 'Section_A_Start')
    # Pre-initialize all six frames so every name is bound regardless of branch.
    df_facts, df_facts_duration, df_clients, df_pbi_categories, df_manual_classifications, keywords_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    log_record('Início Secção A...', options_file.project_id)
    if local:
        df_facts = read_csv(input_files[0], index_col=0, parse_dates=options_file.date_columns, infer_datetime_format=True)
        df_facts_duration = read_csv(input_files[1], index_col=0)
        df_clients = read_csv(input_files[2], index_col=0)
        df_pbi_categories = read_csv(input_files[3], index_col=0)
        df_manual_classifications = read_csv(input_files[4], index_col=0)
        keywords_df = read_csv(input_files[5], index_col=0)
    elif not local:  # equivalent to a plain `else`
        df_facts = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['initial_table_facts'], options_file, options_file.sql_fact_columns, query_filters=query_filters[0], parse_dates=options_file.date_columns)
        df_facts_duration = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['initial_table_facts_durations'], options_file, options_file.sql_facts_durations_columns, query_filters=query_filters[1])
        df_clients = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['initial_table_clients'], options_file, options_file.sql_dim_contacts_columns)
        df_pbi_categories = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['initial_table_pbi_categories'], options_file, options_file.sql_pbi_categories_columns, query_filters=query_filters[1])
        df_manual_classifications = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['aux_table'], options_file)
        # Keywords live in the MLG (final) database, unlike the SRV3 sources above.
        keywords_df = sql_retrieve_df(options_file.DSN_MLG_PRD, options_file.sql_info['database_final'], options_file.sql_info['keywords_table_str'], options_file, columns=['Keywords_PT', 'Keywords_ES']).dropna()
        # Cache the freshly retrieved datasets for subsequent local runs.
        # NOTE(review): placement inside the SQL branch inferred from the collapsed
        # source layout — confirm against version history.
        save_csv([df_facts, df_facts_duration, df_clients, df_pbi_categories, df_manual_classifications, keywords_df], ['dbs/db_facts_initial', 'dbs/db_facts_duration', 'dbs/db_clients_initial', 'dbs/db_pbi_categories_initial', 'dbs/db_manual_classification', 'dbs/db_keywords_df'])
    # Keyword->group mapping is needed by both branches (it is always returned).
    keyword_dict, ranking_dict = sql_mapping_retrieval(options_file.DSN_MLG_PRD, options_file.sql_info['database_final'], options_file.sql_info['keywords_table'], 'Keyword_Group', options_file, multiple_columns=1)
    keyword_dict = keyword_dict[0]  # sql_mapping_retrieval returns a list; only the first mapping is used
    log_record('Fim Secção A...', options_file.project_id)
    performance_info_append(time.time(), 'Section_A_End')
    return df_facts, df_facts_duration, df_clients, df_pbi_categories, df_manual_classifications, keywords_df, keyword_dict, ranking_dict
def main():
    """Model-training flow: fetch facts and labels, run labelling steps 1-3,
    split into classified/non-classified sets and score the non-classified one.
    """
    # get data
    df_facts = sql_retrieve_df(
        options_file.DSN_SRV3_PRD,
        options_file.sql_info['database_source'],
        options_file.sql_info['initial_table_facts_durations'],
        options_file,
        columns=options_file.model_training_fact_cols,
        query_filters={'Cost_Centre': '6825'},
        parse_dates=options_file.date_columns)
    df_labels = sql_retrieve_df(
        options_file.DSN_SRV3_PRD,
        options_file.sql_info['database_source'],
        options_file.sql_info['final_table'],
        options_file,
        parse_dates=options_file.date_columns)
    # Labelling pipeline: steps 1-3 applied in sequence.
    labeled = third_step(second_step(first_step(df_facts, df_labels)))
    classified, non_classified = fourth_step(labeled)
    return fifth_step(classified, non_classified)
def get_data(options_file_in):
    """Fetch the final table, rename columns to display names, and filter out
    rows with an empty model code, 'undefined' exterior colour or '0' interior
    colour. Boolean columns are normalized via ``boolean_replacement``.
    """
    raw = level_1_a_data_acquisition.sql_retrieve_df(
        options_file_in.DSN_MLG_PRD,
        options_file_in.sql_info['database_final'],
        options_file_in.sql_info['final_table'],
        options_file_in)
    renamed = level_1_b_data_processing.column_rename(
        raw,
        configuration_parameters_full + extra_parameters,
        configuration_parameters_full_rename + extra_parameters_rename)
    # Keep only rows with a non-empty model code.
    filtered = renamed.loc[renamed[column_translate['Model_Code']] != '', :]
    filtered = level_1_b_data_processing.boolean_replacement(filtered, boolean_columns)
    filtered = filtered[filtered[column_translate['Colour_Ext']] != 'undefined']
    filtered = filtered[filtered[column_translate['Colour_Int']] != '0']
    return filtered[configuration_parameters_full_rename + extra_parameters_rename]
def get_data_v2(options_file_in, dsn, db, table, query_filter=None, model_flag=0):
    """Generic SQL fetch for an arbitrary (dsn, db, table); when ``model_flag``
    is truthy, model descriptions are lowercased via ``model_lowercase``.
    """
    result = level_1_a_data_acquisition.sql_retrieve_df(dsn, db, table, options_file_in, query_filters=query_filter)
    return model_lowercase(result) if model_flag else result
def data_acquisition(query_filters):
    """Section A for CDSU: retrieve the initial table from the MLG production
    DB, renaming SQL columns to code names and parsing purchase/sell dates.
    """
    opts = level_2_optionals_cdsu_options
    performance_info_append(time.time(), 'Section_A_Start')
    log_record('Início Secção A...', project_id)
    df = sql_retrieve_df(
        opts.DSN_MLG_PRD,
        opts.sql_info['database'],
        opts.sql_info['initial_table'],
        opts,
        list(opts.sql_to_code_renaming.keys()),
        query_filters,
        column_renaming=1,
        parse_dates=['Purchase_Date', 'Sell_Date'])
    # project_units_count_checkup(df, 'Nº Stock', level_2_optionals_cdsu_options, sql_check=0)
    log_record('Fim Secção A.', project_id)
    performance_info_append(time.time(), 'Section_A_End')
    return df
def main():
    """Parts-classification pipeline: step 1 loads the master file from SQL;
    steps 2-10 transform, classify and filter it; results are then deployed and
    the per-family metric dictionaries returned.
    """
    family_10_path = 'dbs/dgo_familia_10_prepared.csv'
    family_13_path = 'dbs/dgo_familia_13_prepared.csv'
    manual_parts_path = 'dbs/dgo_classified_parts.csv'

    log_record('Step 1 started.', options_file.project_id)
    master_file = sql_retrieve_df(options_file.DSN_MLG_PRD, options_file.sql_info['database_final'], options_file.sql_info['final_table'], options_file, column_renaming=1)
    # Downstream steps expect the product group as string.
    master_file['Product_Group_DW'] = master_file['Product_Group_DW'].astype(str)
    log_record('Step 1 ended.', options_file.project_id)

    master_file = flow_step_2(master_file)
    control_prints(master_file, '2')
    master_file, keywords_df = flow_step_3(master_file)
    control_prints(master_file, '3')
    master_file = flow_step_4(master_file)
    control_prints(master_file, '4')
    master_file = flow_step_4_5(master_file, keywords_df)
    control_prints(master_file, '4.5')

    master_file, manual_classifications = flow_step_5(master_file, [family_10_path, family_13_path, manual_parts_path])
    control_prints(master_file, '5')

    # Step 6 splits the master file into three disjoint subsets.
    non_classified, other_families, classified_families = flow_step_6(master_file)
    control_prints(non_classified, '6a')
    control_prints(classified_families, '6b')
    control_prints(other_families, '6c')

    classified_filtered = flow_step_7(classified_families)
    control_prints(classified_filtered, '7 - master_file_classified_families_filtered')
    other_filtered = flow_step_7(other_families)
    control_prints(other_filtered, '7 - master_file_other_families_filtered')

    final_df, main_families_cm, main_families_metrics_dict, other_families_cm, other_families_metrics_dict = flow_step_8(classified_filtered, other_filtered, non_classified)
    control_prints(final_df, '8')
    final_df = flow_step_9(final_df)
    control_prints(final_df, '9')
    final_df = flow_step_10(final_df, manual_classifications)
    control_prints(final_df, '10')

    final_df.to_csv('dbs/master_file_final.csv', index=False)
    deployment(final_df, main_families_cm, other_families_cm)
    return main_families_metrics_dict, other_families_metrics_dict
def get_fact_pa_classifications():
    """Fetch part-reference reclassifications (old vs. new product group)
    from the MLG production database.
    """
    wanted_columns = ['Part_Ref', 'Old_Product_Group_DW', 'New_Product_Group_DW']
    return sql_retrieve_df(
        options_file.DSN_MLG_PRD,
        options_file.sql_info['database_final'],
        options_file.sql_info['parts_classification_refs'],
        options_file,
        columns=wanted_columns)
def get_data_non_cached(options_file_in, dsn, db, view, columns, query_filters=0):
    """Thin, non-cached wrapper around ``sql_retrieve_df`` for an arbitrary view.

    Note: ``query_filters`` defaults to 0 (not None) and is forwarded
    positionally, preserving the original call convention.
    """
    return level_1_a_data_acquisition.sql_retrieve_df(dsn, db, view, options_file_in, columns, query_filters)
def data_processing(df, target_variable, oversample_check, number_of_features):
    """Section B: clean, enrich and map the Baviera optionals dataset.

    If the SQL checkpoint is missing or older than the configured update
    frequency, the raw df is fully processed (string cleanup, option scraping,
    row/column pruning, margin/score computation and sale-place mapping);
    otherwise the already-processed checkpoint table is retrieved from SQL.

    Args:
        df: raw vehicle-configuration DataFrame (one row per optional line).
        target_variable: not referenced in this block (interface parity).
        oversample_check: not referenced in this block (interface parity).
        number_of_features: not referenced in this block (interface parity).

    Returns:
        The processed DataFrame (one row per vehicle configuration).

    NOTE(review): this body was reconstructed from collapsed source — the exact
    nesting of the mapping/feature steps should be confirmed against history.
    """
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', project_id)
    model_mapping = {}  # NOTE(review): never populated or read within this block
    # Reprocess only when the checkpoint is absent or older than update_frequency_days.
    if sql_date_comparison(level_2_optionals_baviera_options.DSN_MLG_PRD, level_2_optionals_baviera_options, level_2_optionals_baviera_options.sql_info['database'], level_2_optionals_baviera_options.sql_info['checkpoint_b_table'], 'Date', level_2_optionals_baviera_options.update_frequency_days):
        log_record('Checkpoint não encontrado ou demasiado antigo. A processar dados...', project_id)
        df = lowercase_column_conversion(df, ['Opcional', 'Cor', 'Interior', 'Versão'])  # Lowercases the strings of these columns
        control_prints(df, '1', head=1)
        # (column, pattern) -> replacement; fixes typos and strips useless tokens.
        dict_strings_to_replace = {
            ('Modelo', ' - não utilizar'): '',
            ('Interior', '\\|'): '/',
            ('Cor', '|'): '',
            ('Interior', 'ind.'): '',
            ('Interior', ']'): '/',
            ('Interior', '\\.'): ' ',
            ('Interior', '\'merino\''): 'merino',
            ('Interior', '\' merino\''): 'merino',
            ('Interior', '\'vernasca\''): 'vernasca',
            ('Interior', 'leder'): 'leather',
            ('Interior', 'p '): 'pele',
            ('Interior', 'pelenevada'): 'pele nevada',
            ('Opcional', 'bi-xénon'): 'bixénon',
            ('Opcional', 'vidro'): 'vidros',
            ('Opcional', 'dacota'): 'dakota',
            ('Opcional', 'whites'): 'white',
            ('Opcional', 'beige'): 'bege',
            ('Interior', '\'dakota\''): 'dakota',
            ('Interior', 'dacota'): 'dakota',
            ('Interior', 'mokka'): 'mocha',
            ('Interior', 'beige'): 'bege',
            ('Interior', 'dakota\''): 'dakota',
            ('Interior', 'antracite/cinza/p'): 'antracite/cinza/preto',
            ('Interior', 'antracite/cinza/pretoreto'): 'antracite/cinza/preto',
            ('Interior', 'nevada\''): 'nevada',
            ('Interior', '"nappa"'): 'nappa',
            ('Interior', 'anthrazit'): 'antracite',
            ('Interior', 'antracito'): 'antracite',
            ('Interior', 'preto/laranja/preto/lara'): 'preto/laranja',
            ('Interior', 'anthtacite'): 'antracite',
            ('Interior', 'champag'): 'champagne',
            ('Interior', 'cri'): 'crimson',
            ('Modelo', 'Enter Model Details'): '',
            ('Registration_Number', '\.'): '',
            ('Interior', 'preto/m '): 'preto ',
            ('Interior', 'congnac/preto'): 'cognac/preto',
            ('Local da Venda', 'DCN'): 'DCP',
            ('Cor', 'blue\\|'): 'azul'
        }
        df = string_replacer(df, dict_strings_to_replace)  # Replaces the strings mentioned in dict_strings_to_replace which are typos, useless information, etc
        control_prints(df, '2', head=1)
        df.dropna(axis=0, inplace=True)  # Removes all remaining NA's
        control_prints(df, '3')
        # Creates new columns filled with zeros, which will be filled in the future
        df = new_column_creation(df, [x for x in level_2_optionals_baviera_options.configuration_parameters_full if x != 'Modelo'], 0)
        dict_cols_to_take_date_info = {'buy_': 'Data Compra'}
        # Creates columns for the datetime columns of dict_cols_to_take_date_info, with just the day, month and year
        df = date_cols(df, dict_cols_to_take_date_info)
        df = total_price(df)  # Creates a new column with the total cost for each configuration
        df = remove_zero_price_total_vhe(df, project_id)  # Removes VHE with a price total of 0; ToDo: keep checking up if this is still necessary
        control_prints(df, '4')
        # No need for Prov filtering, as it is already filtered in the data source
        df = remove_rows(df, [df[df.Modelo.str.contains('MINI')].index], project_id)
        control_prints(df, '5')
        # This removes Toyota Vehicles that aren't supposed to be in this model
        df = remove_rows(df, [df[df.Franchise_Code.str.contains('T|Y|R|G|C|175')].index], project_id)
        control_prints(df, '6')
        df = remove_rows(df, [df[(df.Colour_Ext_Code == ' ') & (df.Cor == ' ')].index], project_id, warning=1)
        control_prints(df, '7')
        # Scrapes the optionals columns for information regarding the GPS, Auto Transmission,
        # Posterior Parking Sensors, External and Internal colours, Model and Rim's Size
        df = options_scraping(df, level_2_optionals_baviera_options, model_training_check=model_training_check)
        control_prints(df, '8', head=0)
        df = remove_rows(df, [df[df.Modelo.isnull()].index], project_id, warning=1)
        control_prints(df, '7')
        # This column was only needed for some very specific cases where no Colour_Ext_Code was available
        df = remove_columns(df, ['Colour_Ext_Code'], project_id)
        control_prints(df, '9')
        project_units_count_checkup(df, 'Nº Stock', level_2_optionals_baviera_options, sql_check=1)
        # Translates all english colors to portuguese
        df = color_replacement(df, level_2_optionals_baviera_options.colors_to_replace_dict, project_id)
        control_prints(df, '10')
        # Removes duplicate rows, based on the Stock number. This leaves one line per configuration
        df = duplicate_removal(df, subset_col='Nº Stock')
        control_prints(df, '11')
        # Remove columns not needed atm;
        # Will probably need to also remove: stock_days, stock_days_norm, and one of the scores
        df = remove_columns(df, ['Cor', 'Interior', 'Opcional', 'Custo', 'Versão', 'Franchise_Code'], project_id)
        control_prints(df, '12')
        # Removes the vehicles sold here, as they are from another brand (Toyota)
        df = remove_rows(df, [df.loc[df['Local da Venda'] == 'DCV - Viat.Toy Viseu', :].index], project_id)
        control_prints(df, '13')
        df = margin_calculation(df)  # Calculates the margin in percentage of the total price
        # Classifies the stockdays and margin based in their respective thresholds in tow classes (0 or 1)
        # and then creates a new_score metric, where only configurations with 1 in both dimension, have 1 as new_score
        df = score_calculation(df, [level_2_optionals_baviera_options.stock_days_threshold], level_2_optionals_baviera_options.margin_threshold, level_2_optionals_baviera_options.project_id)
        # df = new_column_creation(df, ['Local da Venda_v2'], df['Local da Venda'])
        control_prints(df, '14', head=1)
        if model_training_check:
            # Training path: only the mapping dictionaries are fetched here.
            cols_to_group_layer_2 = ['Jantes', 'Local da Venda', 'Local da Venda_v2', 'Modelo', 'Versao', 'Tipo_Interior', 'Cor_Exterior', 'Cor_Interior', 'Motor']
            mapping_dictionaries, _ = sql_mapping_retrieval(level_2_optionals_baviera_options.DSN_MLG_PRD, level_2_optionals_baviera_options.sql_info['database'], level_2_optionals_baviera_options.sql_info['mappings'], 'Mapped_Value', level_2_optionals_baviera_options)
        else:
            # Non-training path: map the sale place through three SQL lookup tables.
            mapping_sell_place = sql_retrieve_df(level_2_optionals_baviera_options.DSN_MLG_PRD, level_2_optionals_baviera_options.sql_info['database'], level_2_optionals_baviera_options.sql_info['mappings_sale_place'], level_2_optionals_baviera_options)
            mapping_sell_place_v2 = sql_retrieve_df(level_2_optionals_baviera_options.DSN_MLG_PRD, level_2_optionals_baviera_options.sql_info['database'], level_2_optionals_baviera_options.sql_info['mappings_sale_place_v2'], level_2_optionals_baviera_options)
            mapping_sell_place_fase2 = sql_retrieve_df(level_2_optionals_baviera_options.DSN_MLG_PRD, level_2_optionals_baviera_options.sql_info['database'], level_2_optionals_baviera_options.sql_info['mappings_sale_place_fase2'], level_2_optionals_baviera_options)
            # control_prints(df, 'mapping_testing_before', head=1)
            # First mapping: 'Local da Venda' -> 'Local da Venda_v1'.
            df = pd.merge(df, mapping_sell_place, left_on='Local da Venda', right_on='Original_Value', how='left')
            df = column_rename(df, ['Mapped_Value'], ['Local da Venda_v1'])
            df = df.drop(['Original_Value'], axis=1)
            nan_detection(df, 'Local da Venda', 'Local da Venda_v1')
            control_prints(df, 'mapping_testing_v1', head=1)
            # Second mapping: 'Local da Venda' -> 'Local da Venda_v2'.
            df = pd.merge(df, mapping_sell_place_v2, left_on='Local da Venda', right_on='Original_Value', how='left')
            df = column_rename(df, ['Mapped_Value'], ['Local da Venda_v2'])
            df = df.drop(['Original_Value'], axis=1)
            nan_detection(df, 'Local da Venda', 'Local da Venda_v2')
            control_prints(df, 'mapping_testing_v2', head=1)
            # Fase2 table is matched on its lowercased Mapped_Value against 'Local da Venda_v2'.
            mapping_sell_place_fase2['Mapped_Value'] = mapping_sell_place_fase2['Mapped_Value'].str.lower()
            # print(mapping_sell_place_fase2)
            df = pd.merge(df, mapping_sell_place_fase2, left_on='Local da Venda_v2', right_on='Mapped_Value', how='left')
            df = column_rename(df, ['Original_Value'], ['Local da Venda_fase2'])
            df = df.drop(['Mapped_Value'], axis=1)
            nan_detection(df, 'Local da Venda_v2', 'Local da Venda_fase2')
            control_prints(df, 'after mapping fase2', head=1)
            # Replace the original sale-place column with the mapped versions.
            df = df.drop(['Local da Venda'], axis=1)
            df = column_rename(df, ['Local da Venda_v1', 'Local da Venda_fase2'], ['Local da Venda', 'Local da Venda_Fase2_level_2'])
        # Creates a series of new features, explained in the provided pdf
        # NOTE(review): indent level of this tail (inside vs. outside the else) inferred — confirm.
        df = new_features(df, configuration_parameters, project_id)
        control_prints(df, 'after_renaming', head=1)
        # Small functions to save 2 specific global variables which will be needed later
        global_variables_saving(df, level_2_optionals_baviera_options.project_id)
        log_record('Checkpoint B.1...', project_id)
    else:
        log_record('Checkpoint Found. Retrieving data...', project_id)
        # Checkpoint hit: load the processed table and map SQL names back to code names.
        df = sql_retrieve_df(level_2_optionals_baviera_options.DSN_MLG_PRD, level_2_optionals_baviera_options.sql_info['database'], level_2_optionals_baviera_options.sql_info['checkpoint_b_table'], level_2_optionals_baviera_options, list(level_2_optionals_baviera_options.column_checkpoint_sql_renaming.values()))
        df = column_rename(df, list(level_2_optionals_baviera_options.column_checkpoint_sql_renaming.values()), list(level_2_optionals_baviera_options.column_checkpoint_sql_renaming.keys()))
    log_record('Fim Secção B.', project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df
def get_data_sql(options_file_in, db, view):
    """Fetch a view from the MLG production database and index it by 'Actual'."""
    retrieved = level_1_a_data_acquisition.sql_retrieve_df(options_file_in.DSN_MLG_PRD, db, view, options_file_in)
    return retrieved.set_index('Actual')