def flow_step_3(df):
    """Clean the part descriptions, export keywords and settle ``Product_Group_DW``.

    The two description columns (``Part_Desc_PT`` and ``Part_Desc``) are
    lowercased, trimmed, passed through ``unidecode``/``coerce_to_unicode``/
    ``remove_punctuations``, stripped of the common stop words and of digits.
    Keywords are generated for both columns and written to
    ``dbs/keywords_per_product_group_dw.csv``.  Rows whose ``Part_Ref``
    matches one of the two "AT" suffix patterns are dropped, and
    ``Product_Group_DW`` is replaced by ``Part_Desc_Copy`` wherever that
    column holds a value different from ``Part_Desc``.

    :param df: parts DataFrame; must contain ``Part_Desc_PT``, ``Part_Desc``,
        ``Part_Ref`` and ``Product_Group_DW``.  ``Part_Desc_Copy`` is read
        after ``product_group_dw_corrections_on_desc`` - presumably created
        there; TODO confirm.
    :return: tuple ``(df, keywords)`` with the processed DataFrame and the
        concatenated keyword DataFrame.
    """
    log_record('Step 3 started.', options_file.project_id)

    description_cols = ['Part_Desc_PT', 'Part_Desc']

    # Step 0 - lowercase the description columns
    df = lowercase_column_conversion(df.copy(), description_cols)

    # Step 1 - trim the description columns
    df = trim_columns(df.copy(), description_cols)

    # Step 1.5 - normalise the text and drop common stop words.
    # The stop words are put in a frozenset once so that every membership
    # test is O(1) instead of a scan of the original collection.
    stop_words = frozenset(options_file.stop_words_common)
    for col in description_cols:
        df[col] = df[col].apply(unidecode)
        df[col] = df[col].apply(coerce_to_unicode)
        df[col] = df[col].apply(remove_punctuations)
        df[col] = df[col].apply(
            lambda text: ' '.join([word for word in text.split() if word not in stop_words]))

    # Step 2
    df = feature_engineering_counts(df)
    # df = feature_engineering_part_desc(df)

    # Step 3 - removes numbers.
    # regex=True is explicit: pandas >= 2.0 defaults to literal matching, which
    # would silently leave every digit in place.
    df['Part_Desc_PT'] = df['Part_Desc_PT'].str.replace(r'\d+', ' ', regex=True)
    df['Part_Desc'] = df['Part_Desc'].str.replace(r'\d+', ' ', regex=True)

    # Step 3.5 - creates keywords data
    part_desc_keywords = keyword_generation(df, 'Part_Desc')
    part_desc_pt_keywords = keyword_generation(df, 'Part_Desc_PT')
    keywords = pd.concat([part_desc_keywords, part_desc_pt_keywords])
    keywords.to_csv('dbs/keywords_per_product_group_dw.csv', index=False)

    # Step 4
    df = product_group_dw_corrections_on_desc(df.copy())

    # Step 5 - drop references ending in "AT" at the fixed positions below
    rows_to_remove_regex_filter_1 = r'(?<=^B.{11})AT$|(?<=^B.{15})AT$'
    rows_to_remove_regex_filter_2 = r'(?<=^BM.{11})AT$|(?<=^BM.{15})AT$'
    matches_filter_1 = df['Part_Ref'].str.contains(rows_to_remove_regex_filter_1, na=False)
    matches_filter_2 = df['Part_Ref'].str.contains(rows_to_remove_regex_filter_2, na=False)
    # .copy() so the assignments and in-place operations below act on an
    # independent frame rather than on a slice of the previous one.
    df = df[~(matches_filter_1 | matches_filter_2)].copy()

    # Step 6 - a copy equal to the description carries no correction
    df.loc[df['Part_Desc'] == df['Part_Desc_Copy'], 'Part_Desc_Copy'] = np.nan
    # df.loc[df['Part_Desc'] != df['Part_Desc_Copy'], 'Part_Desc_Copy'] = df['Product_Group_DW']
    # (The former follow-up statement assigned Part_Desc_Copy to itself and
    # was a no-op, so it was removed.)

    # Step 7 - keep the original group unless a correction exists
    no_correction = df['Part_Desc_Copy'].isnull()
    df.loc[no_correction, 'New_Product_Group_DW'] = df.loc[no_correction, 'Product_Group_DW']
    df.loc[~no_correction, 'New_Product_Group_DW'] = df.loc[~no_correction, 'Part_Desc_Copy']

    # Step 8
    df.drop(['Product_Group_DW', 'Part_Desc_Copy'], axis=1, inplace=True)

    # Step 9
    df.rename(columns={'New_Product_Group_DW': 'Product_Group_DW'}, inplace=True)

    # Step 10
    df = product_group_dw_complete_replaces(df.copy())

    log_record('Step 3 ended.', options_file.project_id)
    return df, keywords
def data_processing(df_facts, df_facts_duration, df_clients, df_pbi_categories, keywords_df):
    """Section B of the request pipeline: filter, enrich and clean the facts table.

    Visible steps, in order: requests in "Power BI" categories are removed;
    ``Summary``/``Description`` are lowercased; duration and client/assignee
    data are joined; missing values are imputed; duplicated ``Request_Num``
    rows are removed; ``Description`` is cleaned (literals, known typos,
    URLs, stop words); the language is detected; and finally the text is
    preprocessed and the top-words frame is built from ``StemmedDescription``.

    :param df_facts: requests DataFrame (one or more rows per ``Request_Num``).
    :param df_facts_duration: DataFrame joined on ``Request_Num``.
    :param df_clients: contacts DataFrame, indexed here by ``Contact_Id`` and
        joined once for the customer and once for the assignee.
    :param df_pbi_categories: categories DataFrame with ``Category_Name`` and
        ``Category_Id``.
    :param keywords_df: forwarded to ``similar_words_handling`` when the
        module-level ``similar_process_flag`` is truthy.
    :return: tuple ``(df_facts, df_top_words)``.
    """
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', options_file.project_id)

    def log_request_count(template, frame):
        # Logs the number of distinct requests in ``frame`` using ``template``.
        log_record(template.format(frame['Request_Num'].nunique()), options_file.project_id)

    # --- Remove the requests that belong to Power BI categories -------------
    log_request_count('Contagem inicial de pedidos: {}', df_facts)

    not_power_bi = ~df_pbi_categories['Category_Name'].str.contains('Power BI')
    non_pbi_index = df_pbi_categories[not_power_bi].index
    # Dropping every non-PBI category leaves only the PBI category ids
    pbi_only = remove_rows(df_pbi_categories.copy(), [non_pbi_index], options_file.project_id)
    pbi_categories = pbi_only['Category_Id'].values

    in_pbi_category = df_facts['Category_Id'].isin(pbi_categories)
    log_request_count('Contagem de pedidos PBI: {}', df_facts[in_pbi_category])
    df_facts = remove_rows(df_facts, [df_facts.loc[in_pbi_category].index], options_file.project_id)
    log_request_count('Após o filtro de pedidos PBI, a nova contagem é de: {}', df_facts)

    # --- Lowercase conversion of Summary and Description ---------------------
    df_facts = lowercase_column_conversion(df_facts, columns=['Summary', 'Description'])

    # --- Client/assignee information and imputation of some missing values --
    df_facts = df_join_function(
        df_facts, df_facts_duration.set_index('Request_Num'), on='Request_Num')
    df_facts = df_join_function(
        df_facts, df_clients.set_index('Contact_Id'), on='Contact_Customer_Id')
    df_facts = value_replacement(df_facts, options_file.assignee_id_replacements)
    df_facts = df_join_function(
        df_facts,
        df_clients.set_index('Contact_Id'),
        on='Contact_Assignee_Id',
        lsuffix='_Customer',
        rsuffix='_Assignee')
    df_facts = value_replacement(df_facts, options_file.sla_resolution_hours_replacements)

    # --- Every name/login of clients and assignees found in the data --------
    unique_clients_names_decoded = string_to_list(df_facts, ['Name_Customer'])
    unique_clients_login_decoded = string_to_list(df_facts, ['Login_Name_Customer'])
    unique_assignee_names_decoded = string_to_list(df_facts, ['Name_Assignee'])
    unique_assignee_login_decoded = string_to_list(df_facts, ['Login_Name_Assignee'])

    # Missing assignee names get a fixed placeholder
    df_facts = null_handling(df_facts, {'Name_Assignee': 'Fechados pelo Cliente'})

    # Resolve date takes the close date when the former is null and the latter exists
    df_facts = value_substitution(df_facts, non_null_column='Close_Date', null_column='Resolve_Date')
    # df_facts = df_facts.groupby('Request_Num').apply(close_and_resolve_date_replacements)  # Currently doing nothing, hence why it's commented

    # One row per request number
    df_facts = duplicate_removal(df_facts, ['Request_Num'])

    # New lines, tabs, etc. are removed from the description
    df_facts = literal_removal(df_facts, 'Description')

    # Known string errors in the description and their replacements
    dict_strings_to_replace = {
        ('Description', 'filesibmcognoscbindatacqertmodelsfdfdeeacebedeabeeabbedrtm'): 'files ibm cognos',
        ('Description', 'cognosapv'): 'cognos apv',
        ('Description', 'caetanoautopt'): 'caetano auto pt',
        ('Description', 'autolinecognos'): 'autoline cognos',
        ('Description', 'realnao'): 'real nao',
        ('Description', 'booksytner'): 'book sytner',
    }
    # ('Description', 'http://'): 'http://www.', ('Summary', 'http://'): 'http://www.'
    df_facts = string_replacer(df_facts, dict_strings_to_replace)

    url_regex = options_file.regex_dict['url']
    df_facts = value_replacement(df_facts, {'Description': url_regex})
    df_facts = value_replacement(df_facts, {'Summary': url_regex})

    # Description takes the summary when the former is null and the latter exists
    df_facts = value_substitution(df_facts, non_null_column='Summary', null_column='Description')

    df_facts = language_detection(df_facts, 'Description', 'Language')
    df_facts = string_replacer(df_facts, {('Language', 'ca'): 'es', ('Category_Id', 'pcat:'): ''})

    # Cleans requests which have both Summary and Description null
    df_facts = summary_description_null_checkup(df_facts)

    # --- Stop words: configured words plus every client/assignee name/login -
    names_and_logins = (
        unique_clients_names_decoded
        + unique_clients_login_decoded
        + unique_assignee_names_decoded
        + unique_assignee_login_decoded
    )
    stop_words_list = options_file.words_to_remove_from_description + names_and_logins
    df_facts['Description'] = df_facts['Description'].apply(
        stop_words_removal, args=(stop_words_list,))

    if similar_process_flag:
        df_facts = similar_words_handling(df_facts, keywords_df, options_file.testing_dict)

    df_facts = text_preprocess(df_facts, names_and_logins, options_file)
    df_facts = value_replacement(df_facts, options_file.language_replacements)

    # Checkpoint B.1 - Key Words data frame creation
    df_facts, df_top_words = top_words_processing(df_facts, description_col='StemmedDescription')

    log_request_count('Após o processamento a contagem de pedidos é de: {}', df_facts)
    log_record('Fim Secção B.', options_file.project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df_facts, df_top_words
def data_processing(df, target_variable, oversample_check, number_of_features):
    """Section B for the Baviera optionals project: clean the stock data or reload a checkpoint.

    When ``sql_date_comparison`` is truthy for the checkpoint table (the log
    message reads "checkpoint not found or too old"), the raw DataFrame goes
    through the full cleaning pipeline: lowercasing, typo replacement, NA
    removal, price/date feature creation, row filters, option scraping,
    colour translation, de-duplication by ``Nº Stock``, margin/score
    calculation, sale-place mapping and ``new_features``.  Otherwise the
    processed data is read from the checkpoint table and its columns are
    renamed back.

    Reads the module-level names ``project_id``, ``model_training_check`` and
    ``configuration_parameters``.

    :param df: raw vehicle stock DataFrame.
    :param target_variable: not referenced in this function.
    :param oversample_check: not referenced in this function.
    :param number_of_features: not referenced in this function.
    :return: the processed (or retrieved) DataFrame.
    """
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', project_id)

    options = level_2_optionals_baviera_options

    if sql_date_comparison(
            options.DSN_MLG_PRD,
            options,
            options.sql_info['database'],
            options.sql_info['checkpoint_b_table'],
            'Date',
            options.update_frequency_days):
        log_record('Checkpoint não encontrado ou demasiado antigo. A processar dados...', project_id)

        # Lowercases the strings of these columns
        df = lowercase_column_conversion(df, ['Opcional', 'Cor', 'Interior', 'Versão'])
        control_prints(df, '1', head=1)

        # (column, pattern) -> replacement; insertion order is preserved.
        # NOTE(review): ('Cor', '|') and ('Interior', ']') are unescaped while
        # sibling entries escape the same characters - if string_replacer
        # treats patterns as regex, '|' matches the empty string.  Left as-is
        # because a later entry ('Cor', 'blue\\|') depends on the pipe still
        # being present; confirm intent.
        dict_strings_to_replace = {
            ('Modelo', ' - não utilizar'): '',
            ('Interior', '\\|'): '/',
            ('Cor', '|'): '',
            ('Interior', 'ind.'): '',
            ('Interior', ']'): '/',
            ('Interior', '\\.'): ' ',
            ('Interior', '\'merino\''): 'merino',
            ('Interior', '\' merino\''): 'merino',
            ('Interior', '\'vernasca\''): 'vernasca',
            ('Interior', 'leder'): 'leather',
            ('Interior', 'p '): 'pele',
            ('Interior', 'pelenevada'): 'pele nevada',
            ('Opcional', 'bi-xénon'): 'bixénon',
            ('Opcional', 'vidro'): 'vidros',
            ('Opcional', 'dacota'): 'dakota',
            ('Opcional', 'whites'): 'white',
            ('Opcional', 'beige'): 'bege',
            ('Interior', '\'dakota\''): 'dakota',
            ('Interior', 'dacota'): 'dakota',
            ('Interior', 'mokka'): 'mocha',
            ('Interior', 'beige'): 'bege',
            ('Interior', 'dakota\''): 'dakota',
            ('Interior', 'antracite/cinza/p'): 'antracite/cinza/preto',
            ('Interior', 'antracite/cinza/pretoreto'): 'antracite/cinza/preto',
            ('Interior', 'nevada\''): 'nevada',
            ('Interior', '"nappa"'): 'nappa',
            ('Interior', 'anthrazit'): 'antracite',
            ('Interior', 'antracito'): 'antracite',
            ('Interior', 'preto/laranja/preto/lara'): 'preto/laranja',
            ('Interior', 'anthtacite'): 'antracite',
            ('Interior', 'champag'): 'champagne',
            ('Interior', 'cri'): 'crimson',
            ('Modelo', 'Enter Model Details'): '',
            # '\\.' has the same value as the former '\.', minus the
            # invalid-escape-sequence warning.
            ('Registration_Number', '\\.'): '',
            ('Interior', 'preto/m '): 'preto ',
            ('Interior', 'congnac/preto'): 'cognac/preto',
            ('Local da Venda', 'DCN'): 'DCP',
            ('Cor', 'blue\\|'): 'azul',
        }

        # Replaces the strings mentioned in dict_strings_to_replace which are
        # typos, useless information, etc
        df = string_replacer(df, dict_strings_to_replace)
        control_prints(df, '2', head=1)

        df.dropna(axis=0, inplace=True)  # Removes all remaining NA's
        control_prints(df, '3')

        # Creates new columns filled with zeros, which will be filled in the future
        df = new_column_creation(
            df,
            [x for x in options.configuration_parameters_full if x != 'Modelo'],
            0)

        # Creates columns for the datetime columns of dict_cols_to_take_date_info,
        # with just the day, month and year
        dict_cols_to_take_date_info = {'buy_': 'Data Compra'}
        df = date_cols(df, dict_cols_to_take_date_info)

        # Creates a new column with the total cost for each configuration
        df = total_price(df)
        # Removes VHE with a price total of 0; ToDo: keep checking up if this is still necessary
        df = remove_zero_price_total_vhe(df, project_id)
        control_prints(df, '4')

        # No need for Prov filtering, as it is already filtered in the data source
        df = remove_rows(df, [df[df.Modelo.str.contains('MINI')].index], project_id)
        control_prints(df, '5')

        # This removes Toyota Vehicles that aren't supposed to be in this model
        df = remove_rows(
            df, [df[df.Franchise_Code.str.contains('T|Y|R|G|C|175')].index], project_id)
        control_prints(df, '6')

        # Rows where both the colour code and the colour are a single blank
        df = remove_rows(
            df, [df[(df.Colour_Ext_Code == ' ') & (df.Cor == ' ')].index], project_id, warning=1)
        control_prints(df, '7')

        # Scrapes the optionals columns for information regarding the GPS, Auto
        # Transmission, Posterior Parking Sensors, External and Internal
        # colours, Model and Rim's Size
        df = options_scraping(df, options, model_training_check=model_training_check)
        control_prints(df, '8', head=0)

        df = remove_rows(df, [df[df.Modelo.isnull()].index], project_id, warning=1)
        # NOTE(review): label '7' is reused here (already used above); kept
        # unchanged because the label is passed to control_prints at runtime.
        control_prints(df, '7')

        # This column was only needed for some very specific cases where no
        # Colour_Ext_Code was available
        df = remove_columns(df, ['Colour_Ext_Code'], project_id)
        control_prints(df, '9')

        project_units_count_checkup(df, 'Nº Stock', options, sql_check=1)

        # Translates all english colors to portuguese
        df = color_replacement(df, options.colors_to_replace_dict, project_id)
        control_prints(df, '10')

        # Removes duplicate rows, based on the Stock number. This leaves one
        # line per configuration
        df = duplicate_removal(df, subset_col='Nº Stock')
        control_prints(df, '11')

        # Remove columns not needed atm. Will probably need to also remove:
        # stock_days, stock_days_norm, and one of the scores
        df = remove_columns(
            df,
            ['Cor', 'Interior', 'Opcional', 'Custo', 'Versão', 'Franchise_Code'],
            project_id)
        control_prints(df, '12')

        # Removes the vehicles sold here, as they are from another brand (Toyota)
        df = remove_rows(
            df, [df.loc[df['Local da Venda'] == 'DCV - Viat.Toy Viseu', :].index], project_id)
        control_prints(df, '13')

        # Calculates the margin in percentage of the total price
        df = margin_calculation(df)
        # Classifies the stockdays and margin based in their respective
        # thresholds in two classes (0 or 1) and then creates a new_score
        # metric, where only configurations with 1 in both dimensions have 1
        # as new_score
        df = score_calculation(
            df,
            [options.stock_days_threshold],
            options.margin_threshold,
            options.project_id)
        # df = new_column_creation(df, ['Local da Venda_v2'], df['Local da Venda'])
        control_prints(df, '14', head=1)

        # NOTE(review): the nesting of this if/else was reconstructed from
        # data flow (the merges need the frames loaded in the else branch);
        # confirm that new_features is meant to run for both branches.
        if model_training_check:
            # The retrieved mappings are not used afterwards; the call is kept
            # so that any side effect of the retrieval is preserved.
            sql_mapping_retrieval(
                options.DSN_MLG_PRD,
                options.sql_info['database'],
                options.sql_info['mappings'],
                'Mapped_Value',
                options)
        else:
            mapping_sell_place = sql_retrieve_df(
                options.DSN_MLG_PRD,
                options.sql_info['database'],
                options.sql_info['mappings_sale_place'],
                options)
            mapping_sell_place_v2 = sql_retrieve_df(
                options.DSN_MLG_PRD,
                options.sql_info['database'],
                options.sql_info['mappings_sale_place_v2'],
                options)
            mapping_sell_place_fase2 = sql_retrieve_df(
                options.DSN_MLG_PRD,
                options.sql_info['database'],
                options.sql_info['mappings_sale_place_fase2'],
                options)

            # control_prints(df, 'mapping_testing_before', head=1)

            # First mapping: 'Local da Venda' -> 'Local da Venda_v1'
            df = pd.merge(df, mapping_sell_place, left_on='Local da Venda',
                          right_on='Original_Value', how='left')
            df = column_rename(df, ['Mapped_Value'], ['Local da Venda_v1'])
            df = df.drop(['Original_Value'], axis=1)
            nan_detection(df, 'Local da Venda', 'Local da Venda_v1')
            control_prints(df, 'mapping_testing_v1', head=1)

            # Second mapping: 'Local da Venda' -> 'Local da Venda_v2'
            df = pd.merge(df, mapping_sell_place_v2, left_on='Local da Venda',
                          right_on='Original_Value', how='left')
            df = column_rename(df, ['Mapped_Value'], ['Local da Venda_v2'])
            df = df.drop(['Original_Value'], axis=1)
            nan_detection(df, 'Local da Venda', 'Local da Venda_v2')
            control_prints(df, 'mapping_testing_v2', head=1)

            # Third mapping is joined the other way round: the lowercased
            # Mapped_Value is matched against 'Local da Venda_v2' and the
            # Original_Value becomes 'Local da Venda_fase2'
            mapping_sell_place_fase2['Mapped_Value'] = \
                mapping_sell_place_fase2['Mapped_Value'].str.lower()
            # print(mapping_sell_place_fase2)
            df = pd.merge(df, mapping_sell_place_fase2, left_on='Local da Venda_v2',
                          right_on='Mapped_Value', how='left')
            df = column_rename(df, ['Original_Value'], ['Local da Venda_fase2'])
            df = df.drop(['Mapped_Value'], axis=1)
            nan_detection(df, 'Local da Venda_v2', 'Local da Venda_fase2')
            control_prints(df, 'after mapping fase2', head=1)

            df = df.drop(['Local da Venda'], axis=1)
            df = column_rename(
                df,
                ['Local da Venda_v1', 'Local da Venda_fase2'],
                ['Local da Venda', 'Local da Venda_Fase2_level_2'])

        # Creates a series of new features, explained in the provided pdf
        df = new_features(df, configuration_parameters, project_id)
        control_prints(df, 'after_renaming', head=1)

        # Small functions to save 2 specific global variables which will be needed later
        global_variables_saving(df, options.project_id)

        log_record('Checkpoint B.1...', project_id)
    else:
        log_record('Checkpoint Found. Retrieving data...', project_id)
        df = sql_retrieve_df(
            options.DSN_MLG_PRD,
            options.sql_info['database'],
            options.sql_info['checkpoint_b_table'],
            options,
            list(options.column_checkpoint_sql_renaming.values()))
        # Rename the SQL column names (values) back to the in-memory names (keys)
        df = column_rename(
            df,
            list(options.column_checkpoint_sql_renaming.values()),
            list(options.column_checkpoint_sql_renaming.keys()))

    log_record('Fim Secção B.', project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df
def data_processing(df):
    """Section B for the CDSU optionals project: clean the raw stock DataFrame.

    The pipeline lowercases the text columns, applies the typo/replacement
    dictionary, drops rows with missing key columns, creates the
    configuration columns, filters unwanted rows, scrapes the optionals,
    translates colours, removes duplicated ``Nº Stock`` rows, computes
    margin and score, and finally builds the new features.  An intermediate
    snapshot is written to ``dbs/df_cdsu.csv``.

    Reads the module-level names ``project_id`` and
    ``configuration_parameters``.

    :param df: raw vehicle stock DataFrame.
    :return: the processed DataFrame, without the ``Date`` column.
    """
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', project_id)
    log_record('Checkpoint não encontrado ou demasiado antigo. A processar dados...', project_id)

    options = level_2_optionals_cdsu_options

    # Lowercases the strings of these columns
    df = lowercase_column_conversion(df, ['Opcional', 'Cor', 'Interior', 'Versão'])

    # (column, pattern) -> replacement; insertion order is preserved.
    # NOTE(review): several short patterns are prefixes of later ones (e.g.
    # 'high' before 'hightline', 'confortl' before 'confortlline'); whether
    # that matters depends on how string_replacer matches - confirm.
    dict_strings_to_replace = {
        ('Modelo', ' - não utilizar'): '',
        ('Interior', '\\|'): '/',
        ('Cor', '\\|'): '',
        ('Interior', 'ind.'): '',
        ('Interior', '\\]'): '/',
        ('Interior', '\\.'): ' ',
        ('Interior', '\'merino\''): 'merino',
        ('Interior', '\' merino\''): 'merino',
        ('Interior', '\'vernasca\''): 'vernasca',
        ('Interior', 'leder'): 'leather',
        ('Interior', 'p '): 'pele',
        ('Interior', 'pelenevada'): 'pele nevada',
        ('Opcional', 'bi-xénon'): 'bixénon',
        ('Opcional', 'bi-xenon'): 'bixénon',
        ('Opcional', 'vidro'): 'vidros',
        ('Opcional', 'dacota'): 'dakota',
        ('Opcional', 'whites'): 'white',
        ('Opcional', 'beige'): 'bege',
        ('Interior', '\'dakota\''): 'dakota',
        ('Interior', 'dacota'): 'dakota',
        ('Interior', 'mokka'): 'mocha',
        ('Interior', 'beige'): 'bege',
        ('Interior', 'dakota\''): 'dakota',
        ('Interior', 'antracite/cinza/p'): 'antracite/cinza/preto',
        ('Interior', 'antracite/cinza/pretoreto'): 'antracite/cinza/preto',
        ('Interior', 'nevada\''): 'nevada',
        ('Interior', '"nappa"'): 'nappa',
        ('Interior', 'anthrazit'): 'antracite',
        ('Interior', 'antracito'): 'antracite',
        ('Interior', 'preto/laranja/preto/lara'): 'preto/laranja',
        ('Interior', 'anthtacite'): 'antracite',
        ('Interior', 'champag'): 'champagne',
        ('Interior', 'cri'): 'crimson',
        ('Modelo', 'Enter Model Details'): '',
        # '\\.' has the same value as the former '\.', minus the
        # invalid-escape-sequence warning.
        ('Registration_Number', '\\.'): '',
        ('Interior', 'preto/m '): 'preto ',
        ('Interior', 'congnac/preto'): 'cognac/preto',
        ('Local da Venda', 'DCN'): 'DCP',
        ('Cor', 'oceanao'): 'oceano',
        ('Cor', 'ocenao'): 'oceano',
        ('Interior', 'reto'): 'preto',
        ('Cor', 'banco'): 'branco',
        ('Cor', 'catanho'): 'castanho',
        ('Cor', 'petrìleo'): 'petróleo',
        ('Interior', 'ecido'): 'tecido',
        ('Interior', 'ege'): 'bege',
        ('Interior', 'inza'): 'cinza',
        ('Interior', 'inzento'): 'cinzento',
        ('Interior', 'teciso'): 'tecido',
        ('Opcional', 'autmático'): 'automático',
        ('Opcional', 'esctacionamento'): 'estacionamento',
        ('Opcional', 'estacionamernto'): 'estacionamento',
        ('Opcional', 'pct'): 'pacote',
        ('Opcional', 'navegaçãp'): 'navegação',
        ('Opcional', '\\+'): '',
        ('Versão', 'bussiness'): 'business',
        ('Versão', 'r-line'): 'rline',
        ('Versão', 'confortl'): 'confortline',
        ('Versão', 'high'): 'highline',
        ('Opcional', 'p/dsg'): 'para dsg',
        ('Opcional', 'dianteirostraseiros'): 'dianteiros traseiros',
        ('Opcional', 'dianteirostras'): 'dianteiros traseiros',
        ('Opcional', 'diant'): 'dianteiros',
        ('Opcional', 'dttras'): 'dianteiros traseiros',
        ('Opcional', 'dttrpark'): 'dianteiros traseiros park',
        ('Opcional', 'dianttras'): 'dianteiros traseiros',
        ('Opcional', 'câmara'): 'camara',
        ('Opcional', 'camera'): 'camara',
        ('Opcional', 'câmera'): 'camara',
        # This key used to be listed twice with the same value; one entry
        # yields the identical dictionary.
        ('Versão', 'trendtline'): 'trendline',
        ('Versão', 'confort'): 'confortline',
        ('Versão', 'conftl'): 'confortline',
        ('Versão', 'hightline'): 'highline',
        ('Versão', 'bluem'): 'bluemotion',
        ('Versão', 'bmt'): 'bluemotion',
        ('Versão', 'up!bluemotion'): 'up! bluemotion',
        ('Versão', 'up!bluem'): 'up! bluemotion',
        ('Versão', 'trendl'): 'trendline',
        ('Versão', 'conft'): 'confortline',
        ('Versão', 'highlin'): 'highline',
        ('Versão', 'confortine'): 'confortline',
        ('Versão', 'cofrtl'): 'confortline',
        ('Versão', 'confortlline'): 'confortline',
        ('Versão', 'highl'): 'highline',
        ('Modelo', 'up!'): 'up',
    }
    control_prints(df, '1', head=1)

    # Replaces the strings mentioned in dict_strings_to_replace which are
    # typos, useless information, etc
    df = string_replacer(df, dict_strings_to_replace)
    control_prints(df, '1b', head=1)

    # Removes the rows with NA's in any of these columns
    df.dropna(subset=['Cor', 'Colour_Ext_Code', 'Modelo', 'Interior'], axis=0, inplace=True)
    control_prints(df, '2')

    # Creates new columns filled with zeros, which will be filled in the future
    df = new_column_creation(
        df,
        [x for x in options.configuration_parameters_full
         if x != 'Modelo' and x != 'Combustível'],
        0)

    # Creates a new column with the total cost for each configuration
    df = total_price(df)
    control_prints(df, '3a', head=0)

    # Removes VHE with a price total of 0; ToDo: keep checking up if this is still necessary
    df = remove_zero_price_total_vhe(df, project_id)
    control_prints(df, '3b', head=0)

    # This removes VW Commercials Vehicles that aren't supposed to be in this model
    df = remove_rows(df, [df[df.Franchise_Code.str.contains('X')].index], project_id)
    # Rows where both the colour code and the colour are a single blank
    df = remove_rows(
        df, [df[(df.Colour_Ext_Code == ' ') & (df.Cor == ' ')].index], project_id, warning=1)
    control_prints(df, '3c')

    # Scrapes the optionals columns for information regarding the GPS, Auto
    # Transmission, Posterior Parking Sensors, External and Internal colours,
    # Model and Rim's Size
    df = options_scraping_v2(df, options, 'Modelo')
    control_prints(df, '3d', head=1, null_analysis_flag=1)

    # Defaults the value of motorization for electric/hybrid cars
    df.loc[df['Combustível'].isin(['Elétrico', 'Híbrido']), 'Motor'] = 'N/A'
    control_prints(df, '4', head=0, save=1)

    # df = remove_rows(df, [df[df.Modelo.isnull()].index], project_id, warning=1)

    # This column was only needed for some very specific cases where no
    # Colour_Ext_Code was available
    df = remove_columns(df, ['Colour_Ext_Code'], project_id)
    df.to_csv('dbs/df_cdsu.csv', index=False)
    control_prints(df, '5', head=0, save=1)

    # project_units_count_checkup(df, 'Nº Stock', level_2_optionals_cdsu_options, sql_check=1)

    # Translates all english colors to portuguese
    df = color_replacement(df, options.colors_to_replace_dict, project_id)
    control_prints(df, '6', head=0, save=1)

    # Removes duplicate rows, based on the Stock number. This leaves one line
    # per configuration
    df = duplicate_removal(df, subset_col='Nº Stock')
    control_prints(df, '7')

    # Remove columns not needed atm. Will probably need to also remove:
    # stock_days, stock_days_norm, and one of the scores
    df = remove_columns(
        df,
        ['Cor', 'Interior', 'Opcional', 'Custo', 'Versão', 'Franchise_Code'],
        project_id)

    # df = remove_rows(df, [df.loc[df['Local da Venda'] == 'DCV - Viat.Toy Viseu', :].index], project_id)  # Removes the vehicles sold here, as they are from another brand (Toyota)

    # Calculates the margin in percentage of the total price
    df = margin_calculation(df)
    control_prints(df, '8')

    # Classifies the stockdays and margin based in their respective thresholds
    # in two classes (0 or 1) and then creates a new_score metric, where only
    # configurations with 1 in both dimensions have 1 as new_score
    df = score_calculation(
        df,
        [options.stock_days_threshold],
        options.margin_threshold,
        options.project_id)
    control_prints(df, '9')

    # Disabled sale-place grouping steps, kept for reference:
    # df = new_column_creation(df, ['Local da Venda_v2'], df['Local da Venda'])
    # control_prints(df, '10')
    # cols_to_group_layer_2 = ['Local da Venda']
    # mapping_dictionaries, _ = sql_mapping_retrieval(level_2_optionals_cdsu_options.DSN_MLG_PRD, level_2_optionals_cdsu_options.sql_info['database'], level_2_optionals_cdsu_options.sql_info['mappings_temp'], 'Mapped_Value', level_2_optionals_cdsu_options)
    # df = sell_place_parametrization(df, 'Local da Venda', 'Local da Venda_Fase2', mapping_dictionaries[2], level_2_optionals_cdsu_options.project_id)
    # df = col_group(df, cols_to_group_layer_2[0:2], mapping_dictionaries[0:2], project_id)  # Based on the information provided by Manuel some entries were grouped as to remove small groups. The columns grouped are mentioned in cols_to_group, and their respective groups are shown in level_2_optionals_cdsu_options

    control_prints(df, '9b, before new features', null_analysis_flag=1)
    # Creates a series of new features, explained in the provided pdf
    df = new_features(df, configuration_parameters, project_id)
    control_prints(df, '10, after new_features', null_analysis_flag=1)

    # global_variables_saving(df, level_2_optionals_cdsu_options.project_id)  # Small functions to save 2 specific global variables which will be needed later

    log_record('Checkpoint B.1...', project_id)
    # performance_info_append(time.time(), 'checkpoint_b1')

    # Columns are renamed to their SQL names and straight back; the injection
    # in between is currently disabled.
    # NOTE(review): with sql_inject commented out this round trip looks
    # redundant - confirm before removing.
    sql_renaming = options.column_checkpoint_sql_renaming
    df = column_rename(df, list(sql_renaming.keys()), list(sql_renaming.values()))
    # sql_inject(df, level_2_optionals_cdsu_options.DSN_MLG_PRD, level_2_optionals_cdsu_options.sql_info['database'], level_2_optionals_cdsu_options.sql_info['checkpoint_b_table'], level_2_optionals_cdsu_options, list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.values()), truncate=1, check_date=1)
    df = column_rename(df, list(sql_renaming.values()), list(sql_renaming.keys()))

    df = remove_columns(df, ['Date'], project_id)

    log_record('Fim Secção B.', project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df
def data_processing(df_sales, df_pdb_dim, configuration_parameters_cols, range_dates, target):
    """Section B for the Hyundai project: load today's processed dataset or build it.

    The function first tries to read the three CSV files stamped with the
    current date tag.  When all of them exist, the "all info" file becomes
    the result.  When any of them is missing (``FileNotFoundError``), the raw
    sales DataFrame is processed: date conversion, sorting, NA filling per
    chassis/registration, join with the PDB dimension, a sequence of row
    filters (each one logged), feature and measure calculation, parameter
    translation and creation of ``ML_VehicleData_Code``.

    :param df_sales: raw sales DataFrame.
    :param df_pdb_dim: PDB dimension DataFrame, joined on ``VehicleData_Code``.
    :param configuration_parameters_cols: list of configuration column names.
    :param range_dates: list of extra column names taken from ``df_pdb_dim``.
    :param target: not referenced in this function.
    :return: the processed (or cached) sales DataFrame.
    """
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', options_file.project_id)
    current_date, _ = time_tags()

    def log_chassis_count(step_label, frame):
        # Logs the unique chassis count and the row count after a filtering step.
        log_record(
            '{} - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(
                step_label, frame['Chassis_Number'].nunique(), frame.shape[0]),
            options_file.project_id)

    # Both ML-version files are read with the same categorical columns
    category_columns = [
        'NDB_VATGroup_Desc', 'VAT_Number_Display', 'NDB_Contract_Dealer_Desc',
        'NDB_VHE_PerformGroup_Desc', 'NDB_VHE_Team_Desc', 'Customer_Display',
        'Customer_Group_Desc', 'SLR_Account_Dealer_Code', 'Product_Code',
        'Sales_Type_Dealer_Code', 'Sales_Type_Code', 'Vehicle_Type_Code',
        'Fuel_Type_Code', 'PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc',
        'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Version_Desc',
        'PT_PDB_Exterior_Color_Desc', 'PT_PDB_Interior_Color_Desc',
        'NDB_Dealer_Code',
    ]
    category_dtypes = dict.fromkeys(category_columns, 'category')

    try:
        # The two ML-version files are not used further in this function;
        # they are still read so that a missing (or unreadable) file triggers
        # the processing branch exactly as before.
        read_csv(
            'dbs/df_hyundai_dataset_ml_version_ohe_{}.csv'.format(current_date),
            index_col=0,
            dtype=dict(category_dtypes))
        read_csv(
            'dbs/df_hyundai_dataset_ml_version_{}.csv'.format(current_date),
            index_col=0,
            dtype=dict(category_dtypes))
        df_sales = read_csv(
            'dbs/df_hyundai_dataset_all_info_{}.csv'.format(current_date),
            index_col=0,
            dtype={'SLR_Account_Dealer_Code': object, 'Immobilized_Number': object},
            parse_dates=options_file.date_columns)
        log_record(
            'Dados do dia atual foram encontrados. A passar para a próxima secção...',
            options_file.project_id)
    except FileNotFoundError:
        log_record('Dados do dia atual não foram encontrados. A processar...',
                   options_file.project_id)

        # Step 1 - Dataset cleaning and transforming to 1 line per sale
        date_sort_columns = [
            'Ship_Arrival_Date', 'SLR_Document_Date_CHS',
            'Registration_Request_Date', 'SLR_Document_Date_RGN',
        ]
        for column in date_sort_columns:
            df_sales[column] = pd.to_datetime(df_sales[column])

        # Filtering
        log_record(
            '1 - Contagem Inicial de Chassis únicos: {}'.format(
                df_sales['Chassis_Number'].nunique()),
            options_file.project_id)
        log_record(
            '1 - Contagem Inicial de Matrículas únicas: {}'.format(
                df_sales['Registration_Number'].nunique()),
            options_file.project_id)

        print('Removal of 49-VG-94 Registration Plate, which presents two Chassis Number')
        df_sales = df_sales[~(df_sales['Registration_Number'] == '49-VG-94')].copy()

        # Sorting - sort_values returns a new frame, so the result must be
        # assigned; the previous bare call left the rows unsorted.
        df_sales = df_sales.sort_values(date_sort_columns)

        df_sales['No_Registration_Number_Flag'] = 0
        df_sales['Registration_Number_No_SLR_Document_RGN_Flag'] = 0
        df_sales['SLR_Document_RGN_Flag'] = 0
        df_sales['Undefined_VHE_Status'] = 0

        sales_by_vehicle = df_sales.groupby(['Chassis_Number', 'Registration_Number'])
        df_sales = na_fill_hyundai(sales_by_vehicle)

        # New Column Creation (disabled, kept for reference):
        # df_sales_grouped = df_sales.groupby(['VehicleData_Code'])
        # df_sales['Quantity_Sold'] = df_sales_grouped['Quantity_CHS'].transform('sum')
        # df_sales['Quantity_Sold'] = df_sales['Quantity_Sold'].astype(np.int64, errors='ignore')
        # df_sales_unique_chassis = df_sales.drop_duplicates(subset=['VehicleData_Code', 'Chassis_Number']).copy()
        # df_sales_grouped_2 = df_sales_unique_chassis.groupby(['VehicleData_Code'])
        # df_sales['Average_DaysInStock_Global'] = df_sales_grouped_2['DaysInStock_Global'].transform('mean').round(3)
        # df_sales.to_csv('dbs/df_sales_importador_processed_{}.csv'.format(current_date))

        # Step 2: BI Processing
        # print('Number of unique Chassis: {} and number of rows: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]))
        pdb_columns = ['VehicleData_Code'] + configuration_parameters_cols + range_dates
        df_sales = df_join_function(
            df_sales,
            df_pdb_dim[pdb_columns].set_index('VehicleData_Code'),
            on='VehicleData_Code',
            how='left')
        df_sales = update_new_gamas(df_sales, df_pdb_dim)

        df_sales = lowercase_column_conversion(df_sales, configuration_parameters_cols)

        # Filtering rows with no relevant information
        # df_sales = df_sales[df_sales['NLR_Code'] == 702]  # Hyundai vehicles only

        df_sales = df_sales[df_sales['VehicleData_Code'] != 1]
        log_chassis_count('2 - Remoção de Viaturas não parametrizadas', df_sales)

        df_sales = df_sales[df_sales['Sales_Type_Dealer_Code'] != 'Demo']
        log_chassis_count('3 - Remoção de Viaturas de Demonstração', df_sales)

        # df_sales = df_sales[df_sales['Sales_Type_Code_DMS'].isin(['RAC', 'STOCK', 'VENDA'])]  # step 4, disabled

        df_sales = df_sales[~df_sales['Dispatch_Type_Code'].isin(['AMBULÂNCIA', 'TAXI', 'PSP'])]
        log_chassis_count(
            '5 - Remoção de Viaturas Especiais (Ambulâncias, Táxis, PSP)', df_sales)

        # Filters rows where, for some odd reason, the days in stock are negative
        df_sales = df_sales[df_sales['DaysInStock_Global'] >= 0]
        log_chassis_count(
            '6 - Remoção de Viaturas com Dias em Stock Global negativos', df_sales)

        # Filters rows whose registration number is the 'G.FORCE' placeholder
        df_sales = df_sales[df_sales['Registration_Number'] != 'G.FORCE']
        log_chassis_count(
            '7 - Remoção de Viaturas com Matrículas Inválidas (G.Force)', df_sales)

        # df_sales = df_sales[df_sales['Customer_Group_Code'].notnull()]  # step 8, disabled: rows with no client information

        df_sales = df_sales[df_sales['DaysInStock_Distributor'].notnull()]
        log_chassis_count(
            '9 - Remoção de Viaturas sem informação de Dias em Stock - Distribuidor', df_sales)

        df_sales = df_sales[df_sales['DaysInStock_Dealer'].notnull()]
        log_chassis_count(
            '10 - Remoção de Viaturas sem informação de Dias em Stock - Dealer', df_sales)

        df_sales = df_sales[df_sales['PT_PDB_Model_Desc'] != 'não definido']
        log_chassis_count(
            '11 - Remoção de Viaturas sem informação de Modelo na PDB', df_sales)

        df_sales = new_features(df_sales, configuration_parameters_cols, options_file.project_id)

        # Specific Measures Calculation
        df_sales = measures_calculation_hyundai(df_sales)

        # Fill values
        # Is this correct? This is caused by Total Sales = 0
        df_sales['Total_Discount_%'] = df_sales['Total_Discount_%'].replace(
            [np.inf, np.nan, -np.inf], 0)
        # Is this correct? This is caused by Total Net Sales = 0
        df_sales['Fixed_Margin_I_%'] = df_sales['Fixed_Margin_I_%'].replace(
            [np.inf, np.nan, -np.inf], 0)

        # Lowercases the strings of these columns
        df_sales = lowercase_column_conversion(df_sales, configuration_parameters_cols)

        # df_sales = parameter_processing_hyundai(df_sales, options_file, configuration_parameters_cols)

        translation_dictionaries = [
            options_file.transmission_translation,
            options_file.ext_color_translation,
            options_file.int_color_translation,
        ]
        # grouping_dictionaries = [options_file.motor_grouping, options_file.transmission_grouping, options_file.version_grouping, options_file.ext_color_grouping, options_file.int_color_grouping]

        # Parameter Translation
        # df_sales = col_group(df_sales, [x for x in configuration_parameters_cols if 'Model' not in x], translation_dictionaries, options_file.project_id)
        df_sales = col_group(
            df_sales,
            ['PT_PDB_Transmission_Type_Desc', 'PT_PDB_Exterior_Color_Desc',
             'PT_PDB_Interior_Color_Desc'],
            translation_dictionaries,
            options_file.project_id)

        # NOTE(review): if PT_PDB_Version_Desc is one of the columns
        # lowercased above, this upper-case comparison never matches - confirm.
        df_sales = df_sales[df_sales['PT_PDB_Version_Desc'] != 'NÃO_PARAMETRIZADOS']
        log_chassis_count('9 - Remoção de Viaturas sem versão parametrizada', df_sales)

        project_units_count_checkup(df_sales, 'Chassis_Number', options_file, sql_check=1)

        # Parameter Grouping
        print('### NO GROUPING ###')
        # df_sales = col_group(df_sales, [x for x in configuration_parameters_cols if 'Model' not in x], grouping_dictionaries, options_file.project_id)

        log_record(
            'Contagem de VehicleData_Code únicos: {}'.format(
                df_sales['VehicleData_Code'].nunique()),
            options_file.project_id)
        df_sales_grouped_conf_cols = df_sales.groupby(configuration_parameters_cols)
        log_record(
            'Contagem de Configurações: {}'.format(len(df_sales_grouped_conf_cols)),
            options_file.project_id)

        # New VehicleData_Code Creation: one integer id per configuration
        df_sales['ML_VehicleData_Code'] = df_sales.groupby(
            configuration_parameters_cols).ngroup()

        # df_sales.to_csv('dbs/df_hyundai_dataset_all_info_{}.csv'.format(current_date))

    log_record('Fim Secção B.', options_file.project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df_sales