Exemple #1
0
def get_data(options_file_in):
    """Fetch the final results table and the goals table from the MLG PRD database."""
    dsn = options_file_in.DSN_MLG_PRD
    database = options_file_in.sql_info['database_final']

    df = level_1_a_data_acquisition.sql_retrieve_df(
        dsn, database, options_file_in.sql_info['final_table'], options_file_in)
    df_goals = level_1_a_data_acquisition.sql_retrieve_df(
        dsn, database, options_file_in.sql_info['goals_table'], options_file_in)

    return df, df_goals
Exemple #2
0
def get_data_non_cached(options_file_in, classification_flag):
    """Retrieve and normalize the commercial-version matching table for one classification flag."""
    filters = {'Classification_Flag': classification_flag}
    gamas = level_1_a_data_acquisition.sql_retrieve_df(
        options_file_in.DSN_SRV3_PRD,
        options_file_in.sql_info['database_source'],
        options_file_in.sql_info['commercial_version_matching'],
        options_file_in,
        query_filters=filters)

    gamas['PT_PDB_Model_Desc'] = gamas['PT_PDB_Model_Desc'].str.lower()
    # Drop rows without an old commercial version, then remove duplicate matching rows,
    # even the ones without corresponding Gama Viva. There is however a case where the
    # same Gama Morta has two matches: null and a corresponding Gama Viva -
    # 1.4 TGDi DCT Style MY19'5 + TA for model i30 SW
    gamas = gamas.dropna(subset=['PT_PDB_Commercial_Version_Desc_Old'])
    gamas = gamas.drop_duplicates(subset=['PT_PDB_Model_Desc', 'PT_PDB_Commercial_Version_Desc_Old', 'PT_PDB_Commercial_Version_Desc_New'])
    gamas['PT_PDB_Commercial_Version_Desc_New'] = gamas['PT_PDB_Commercial_Version_Desc_New'].fillna(' ')

    return gamas
Exemple #3
0
def data_acquisition(options_file_in):
    """Load the source table, preferring a local CSV cache over a SQL round-trip.

    On a cache miss the data is pulled from SQL and written back to
    ``options_file_in.temp_file_loc`` for subsequent runs.

    Args:
        options_file_in: project options module/object holding DSNs, SQL info
            and the temp-file location.

    Returns:
        DataFrame sorted by ['Movement_Date', 'WIP_Number'] with NaNs first.
    """
    try:
        df = pd.read_csv(options_file_in.temp_file_loc)
    except FileNotFoundError:
        print('File not found, retrieving from SQL...')
        # Fix: previously referenced the module-level `options_file` instead of
        # the `options_file_in` parameter used everywhere else in this function.
        df = sql_retrieve_df(options_file_in.DSN_MLG_DEV, options_file_in.sql_info['database_source'], options_file_in.sql_info['source_table'], options_file_in)
        df.to_csv(options_file_in.temp_file_loc, index=False)

    return df.sort_values(by=['Movement_Date', 'WIP_Number'], na_position='first')
Exemple #4
0
def data_acquisition(input_file, query_filters, local=0):
    """Section A: acquire the initial Baviera dataset, locally or from SQL.

    Args:
        input_file: path to the local CSV (only used when ``local`` is truthy).
        query_filters: filters forwarded to the SQL retrieval.
        local: when truthy, read ``input_file``; otherwise query the database
            and run the unit-count checkup.

    Returns:
        The acquired DataFrame with columns renamed to code conventions.
    """
    performance_info_append(time.time(), 'Section_A_Start')
    log_record('Início Secção A...', project_id)

    if local:
        # Some exported files are latin-1; try utf-8 first and fall back.
        # (Previously the whole read+rename sequence was duplicated in both branches.)
        try:
            df = _read_and_rename_local(input_file, 'utf-8')
        except UnicodeDecodeError:
            df = _read_and_rename_local(input_file, 'latin-1')
    else:
        df = sql_retrieve_df(
            level_2_optionals_baviera_options.DSN_MLG_PRD,
            level_2_optionals_baviera_options.sql_info['database'],
            level_2_optionals_baviera_options.sql_info['initial_table'],
            level_2_optionals_baviera_options,
            list(
                level_2_optionals_baviera_options.sql_to_code_renaming.keys()),
            query_filters,
            column_renaming=1,
            parse_dates=['Purchase_Date', 'Sell_Date'])
        project_units_count_checkup(df,
                                    'Nº Stock',
                                    level_2_optionals_baviera_options,
                                    sql_check=0)

    log_record('Fim Secção A.', project_id)
    performance_info_append(time.time(), 'Section_A_End')

    return df


def _read_and_rename_local(input_file, encoding):
    """Read the local CSV with the given encoding and rename SQL columns to code names."""
    df = read_csv(input_file,
                  encoding=encoding,
                  parse_dates=['Purchase_Date', 'Sell_Date'],
                  usecols=level_2_optionals_baviera_options.
                  sql_to_code_renaming.keys(),
                  infer_datetime_format=True,
                  decimal='.')
    column_rename(
        df,
        list(level_2_optionals_baviera_options.sql_to_code_renaming.keys()),
        list(level_2_optionals_baviera_options.sql_to_code_renaming.values()))
    return df
Exemple #5
0
def data_acquisition(input_files, query_filters, local=0):
    """Section A: load facts, durations, clients, PBI categories, manual
    classifications and keywords — from local CSVs or from SQL.

    Args:
        input_files: sequence of six local CSV paths (used when ``local`` is truthy).
        query_filters: two-element sequence of SQL query filters
            (index 0 for facts, index 1 for durations and PBI categories).
        local: when truthy, read the local CSVs; otherwise query SQL and cache
            the results to disk via ``save_csv``.

    Returns:
        Tuple of the six DataFrames plus the keyword mapping dict and the
        ranking dict retrieved via ``sql_mapping_retrieval``.
    """
    performance_info_append(time.time(), 'Section_A_Start')
    log_record('Início Secção A...', options_file.project_id)

    # `elif not local:` was redundant (and left a dead third path in which the
    # six frames stayed empty); a plain else makes both branches exhaustive,
    # so the empty-DataFrame pre-initialization is no longer needed.
    if local:
        df_facts = read_csv(input_files[0], index_col=0, parse_dates=options_file.date_columns, infer_datetime_format=True)
        df_facts_duration = read_csv(input_files[1], index_col=0)
        df_clients = read_csv(input_files[2], index_col=0)
        df_pbi_categories = read_csv(input_files[3], index_col=0)
        df_manual_classifications = read_csv(input_files[4], index_col=0)
        keywords_df = read_csv(input_files[5], index_col=0)
    else:
        df_facts = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'],  options_file.sql_info['initial_table_facts'], options_file,  options_file.sql_fact_columns, query_filters=query_filters[0], parse_dates=options_file.date_columns)
        df_facts_duration = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['initial_table_facts_durations'], options_file, options_file.sql_facts_durations_columns, query_filters=query_filters[1])
        df_clients = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['initial_table_clients'], options_file, options_file.sql_dim_contacts_columns)
        df_pbi_categories = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['initial_table_pbi_categories'], options_file, options_file.sql_pbi_categories_columns, query_filters=query_filters[1])
        df_manual_classifications = sql_retrieve_df(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['aux_table'], options_file)
        keywords_df = sql_retrieve_df(options_file.DSN_MLG_PRD, options_file.sql_info['database_final'], options_file.sql_info['keywords_table_str'], options_file, columns=['Keywords_PT', 'Keywords_ES']).dropna()

        # Cache the SQL pulls so subsequent runs can use local=1.
        save_csv([df_facts, df_facts_duration, df_clients, df_pbi_categories, df_manual_classifications, keywords_df], ['dbs/db_facts_initial', 'dbs/db_facts_duration', 'dbs/db_clients_initial', 'dbs/db_pbi_categories_initial', 'dbs/db_manual_classification', 'dbs/db_keywords_df'])

    keyword_dict, ranking_dict = sql_mapping_retrieval(options_file.DSN_MLG_PRD, options_file.sql_info['database_final'], options_file.sql_info['keywords_table'], 'Keyword_Group', options_file, multiple_columns=1)
    keyword_dict = keyword_dict[0]

    log_record('Fim Secção A...', options_file.project_id)
    performance_info_append(time.time(), 'Section_A_End')

    return df_facts, df_facts_duration, df_clients, df_pbi_categories, df_manual_classifications, keywords_df, keyword_dict, ranking_dict
Exemple #6
0
def main():
    """Pull the facts and labels tables, then run the five classification steps."""
    df_facts = sql_retrieve_df(
        options_file.DSN_SRV3_PRD,
        options_file.sql_info['database_source'],
        options_file.sql_info['initial_table_facts_durations'],
        options_file,
        columns=options_file.model_training_fact_cols,
        query_filters={'Cost_Centre': '6825'},
        parse_dates=options_file.date_columns)
    df_labels = sql_retrieve_df(
        options_file.DSN_SRV3_PRD,
        options_file.sql_info['database_source'],
        options_file.sql_info['final_table'],
        options_file,
        parse_dates=options_file.date_columns)

    # Steps 1-3 refine the labeled requests; step 4 splits them, step 5 scores.
    labeled = first_step(df_facts, df_labels)
    labeled = second_step(labeled)
    labeled = third_step(labeled)
    classified, non_classified = fourth_step(labeled)

    return fifth_step(classified, non_classified)
Exemple #7
0
def get_data(options_file_in):
    """Fetch the final table, rename columns, filter placeholder rows and
    return only the renamed configuration/extra columns."""
    data = level_1_a_data_acquisition.sql_retrieve_df(
        options_file_in.DSN_MLG_PRD,
        options_file_in.sql_info['database_final'],
        options_file_in.sql_info['final_table'],
        options_file_in)
    data = level_1_b_data_processing.column_rename(
        data,
        configuration_parameters_full + extra_parameters,
        configuration_parameters_full_rename + extra_parameters_rename)

    # Drop rows without a model code, then normalize booleans and drop
    # placeholder colour values.
    data = data[data[column_translate['Model_Code']] != '']
    data = level_1_b_data_processing.boolean_replacement(data, boolean_columns)
    data = data[data[column_translate['Colour_Ext']] != 'undefined']
    data = data[data[column_translate['Colour_Int']] != '0']

    return data[configuration_parameters_full_rename + extra_parameters_rename]
Exemple #8
0
def get_data_v2(options_file_in,
                dsn,
                db,
                table,
                query_filter=None,
                model_flag=0):
    """Generic retrieval wrapper; optionally lowercases the model column.

    Args:
        options_file_in: project options object forwarded to the retrieval.
        dsn, db, table: connection string, database and table to query.
        query_filter: optional filters passed through as ``query_filters``.
        model_flag: when truthy, post-process the frame with ``model_lowercase``.
    """
    df = level_1_a_data_acquisition.sql_retrieve_df(
        dsn, db, table, options_file_in, query_filters=query_filter)

    return model_lowercase(df) if model_flag else df
Exemple #9
0
def data_acquisition(query_filters):
    """Section A for the CDSU optionals project: fetch the initial table from SQL."""
    performance_info_append(time.time(), 'Section_A_Start')
    log_record('Início Secção A...', project_id)

    opts = level_2_optionals_cdsu_options
    df = sql_retrieve_df(
        opts.DSN_MLG_PRD,
        opts.sql_info['database'],
        opts.sql_info['initial_table'],
        opts,
        list(opts.sql_to_code_renaming.keys()),
        query_filters,
        column_renaming=1,
        parse_dates=['Purchase_Date', 'Sell_Date'])
    # NOTE: the project_units_count_checkup step is intentionally disabled here.

    log_record('Fim Secção A.', project_id)
    performance_info_append(time.time(), 'Section_A_End')

    return df
Exemple #10
0
def main():
    """Run the full parts-classification pipeline (steps 1-10) and deploy.

    Pulls the master file from SQL, applies each flow step in strict order
    (the steps are stateful over the frame, so ordering matters), writes the
    final CSV and deploys the confusion matrices.

    Returns:
        Tuple of (main_families_metrics_dict, other_families_metrics_dict).
    """
    # Local CSVs with manually prepared/classified parts used in step 5.
    dgo_family_10_loc = 'dbs/dgo_familia_10_prepared.csv'
    dgo_family_13_loc = 'dbs/dgo_familia_13_prepared.csv'
    dgo_manual_classified_parts_loc = 'dbs/dgo_classified_parts.csv'

    log_record('Step 1 started.', options_file.project_id)
    master_file = sql_retrieve_df(options_file.DSN_MLG_PRD, options_file.sql_info['database_final'], options_file.sql_info['final_table'], options_file, column_renaming=1)
    # Product_Group_DW arrives with mixed types; normalize to str for matching.
    master_file['Product_Group_DW'] = master_file['Product_Group_DW'].astype(str)
    log_record('Step 1 ended.', options_file.project_id)
    master_file = flow_step_2(master_file)
    control_prints(master_file, '2')
    master_file, keywords_df = flow_step_3(master_file)
    control_prints(master_file, '3')
    master_file = flow_step_4(master_file)
    control_prints(master_file, '4')
    master_file = flow_step_4_5(master_file, keywords_df)
    control_prints(master_file, '4.5')
    master_file, manual_classifications = flow_step_5(master_file, [dgo_family_10_loc, dgo_family_13_loc, dgo_manual_classified_parts_loc])
    control_prints(master_file, '5')
    # Step 6 splits the master file into three disjoint populations.
    master_file_non_classified, master_file_other_families, master_file_classified_families = flow_step_6(master_file)
    control_prints(master_file_non_classified, '6a')
    control_prints(master_file_classified_families, '6b')
    control_prints(master_file_other_families, '6c')
    master_file_classified_families_filtered = flow_step_7(master_file_classified_families)
    control_prints(master_file_classified_families_filtered, '7 - master_file_classified_families_filtered')
    master_file_other_families_filtered = flow_step_7(master_file_other_families)
    control_prints(master_file_other_families_filtered, '7 - master_file_other_families_filtered')
    # Step 8 recombines the populations and produces metrics/confusion matrices.
    master_file_final, main_families_cm, main_families_metrics_dict, other_families_cm, other_families_metrics_dict = flow_step_8(master_file_classified_families_filtered, master_file_other_families_filtered, master_file_non_classified)
    control_prints(master_file_final, '8')
    master_file_final = flow_step_9(master_file_final)
    control_prints(master_file_final, '9')
    master_file_final = flow_step_10(master_file_final, manual_classifications)
    control_prints(master_file_final, '10')
    master_file_final.to_csv('dbs/master_file_final.csv', index=False)
    deployment(master_file_final, main_families_cm, other_families_cm)

    return main_families_metrics_dict, other_families_metrics_dict
Exemple #11
0
def get_fact_pa_classifications():
    """Return the part-reference reclassification table (old vs new product group)."""
    wanted_columns = ['Part_Ref', 'Old_Product_Group_DW', 'New_Product_Group_DW']

    return sql_retrieve_df(
        options_file.DSN_MLG_PRD,
        options_file.sql_info['database_final'],
        options_file.sql_info['parts_classification_refs'],
        options_file,
        columns=wanted_columns)
Exemple #12
0
def get_data_non_cached(options_file_in, dsn, db, view, columns, query_filters=0):
    """Thin pass-through to ``sql_retrieve_df`` without any local caching."""
    return level_1_a_data_acquisition.sql_retrieve_df(
        dsn, db, view, options_file_in, columns, query_filters)
Exemple #13
0
def data_processing(df, target_variable, oversample_check, number_of_features):
    """Section B: clean, normalize and enrich the Baviera configuration data.

    When the SQL checkpoint is missing or stale, runs the full processing
    chain (string cleanup, optionals scraping, deduplication, sale-place
    mapping, margin/score calculation); otherwise restores the checkpointed
    table directly from SQL.

    Args:
        df: raw DataFrame from Section A.
        target_variable, oversample_check, number_of_features: accepted for
            pipeline signature compatibility; not referenced in this section.

    Returns:
        The processed (or checkpoint-restored) DataFrame.
    """
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', project_id)
    model_mapping = {}

    # True when the checkpoint is missing or older than update_frequency_days.
    if sql_date_comparison(
            level_2_optionals_baviera_options.DSN_MLG_PRD,
            level_2_optionals_baviera_options,
            level_2_optionals_baviera_options.sql_info['database'],
            level_2_optionals_baviera_options.sql_info['checkpoint_b_table'],
            'Date', level_2_optionals_baviera_options.update_frequency_days):
        log_record(
            'Checkpoint não encontrado ou demasiado antigo. A processar dados...',
            project_id)

        df = lowercase_column_conversion(
            df, ['Opcional', 'Cor', 'Interior', 'Versão'
                 ])  # Lowercases the strings of these columns

        control_prints(df, '1', head=1)
        # Typo/synonym fixes per (column, pattern) -> replacement.
        dict_strings_to_replace = {
            ('Modelo', ' - não utilizar'): '',
            ('Interior', '\\|'): '/',
            ('Cor', '|'): '',
            ('Interior', 'ind.'): '',
            ('Interior', ']'): '/',
            ('Interior', '\\.'): ' ',
            ('Interior', '\'merino\''): 'merino',
            ('Interior', '\' merino\''): 'merino',
            ('Interior', '\'vernasca\''): 'vernasca',
            ('Interior', 'leder'): 'leather',
            ('Interior', 'p '): 'pele',
            ('Interior', 'pelenevada'): 'pele nevada',
            ('Opcional', 'bi-xénon'): 'bixénon',
            ('Opcional', 'vidro'): 'vidros',
            ('Opcional', 'dacota'): 'dakota',
            ('Opcional', 'whites'): 'white',
            ('Opcional', 'beige'): 'bege',
            ('Interior', '\'dakota\''): 'dakota',
            ('Interior', 'dacota'): 'dakota',
            ('Interior', 'mokka'): 'mocha',
            ('Interior', 'beige'): 'bege',
            ('Interior', 'dakota\''): 'dakota',
            ('Interior', 'antracite/cinza/p'): 'antracite/cinza/preto',
            ('Interior', 'antracite/cinza/pretoreto'): 'antracite/cinza/preto',
            ('Interior', 'nevada\''): 'nevada',
            ('Interior', '"nappa"'): 'nappa',
            ('Interior', 'anthrazit'): 'antracite',
            ('Interior', 'antracito'): 'antracite',
            ('Interior', 'preto/laranja/preto/lara'): 'preto/laranja',
            ('Interior', 'anthtacite'): 'antracite',
            ('Interior', 'champag'): 'champagne',
            ('Interior', 'cri'): 'crimson',
            ('Modelo', 'Enter Model Details'): '',
            ('Registration_Number', '\.'): '',
            ('Interior', 'preto/m '): 'preto ',
            ('Interior', 'congnac/preto'): 'cognac/preto',
            ('Local da Venda', 'DCN'): 'DCP',
            ('Cor', 'blue\\|'): 'azul'
        }

        df = string_replacer(
            df, dict_strings_to_replace
        )  # Replaces the strings mentioned in dict_strings_to_replace which are typos, useless information, etc
        control_prints(df, '2', head=1)

        df.dropna(axis=0, inplace=True)  # Removes all remaining NA's
        control_prints(df, '3')

        df = new_column_creation(
            df, [
                x for x in level_2_optionals_baviera_options.
                configuration_parameters_full if x != 'Modelo'
            ], 0
        )  # Creates new columns filled with zeros, which will be filled in the future

        dict_cols_to_take_date_info = {'buy_': 'Data Compra'}
        df = date_cols(
            df, dict_cols_to_take_date_info
        )  # Creates columns for the datetime columns of dict_cols_to_take_date_info, with just the day, month and year
        df = total_price(
            df
        )  # Creates a new column with the total cost for each configuration;
        df = remove_zero_price_total_vhe(
            df, project_id
        )  # Removes VHE with a price total of 0; ToDo: keep checking up if this is still necessary
        control_prints(df, '4')
        df = remove_rows(
            df, [df[df.Modelo.str.contains('MINI')].index], project_id
        )  # No need for Prov filtering, as it is already filtered in the data source;
        control_prints(df, '5')
        df = remove_rows(
            df, [df[df.Franchise_Code.str.contains('T|Y|R|G|C|175')].index],
            project_id
        )  # This removes Toyota Vehicles that aren't supposed to be in this model
        control_prints(df, '6')
        df = remove_rows(
            df, [df[(df.Colour_Ext_Code == ' ') & (df.Cor == ' ')].index],
            project_id,
            warning=1)
        control_prints(df, '7')

        df = options_scraping(
            df,
            level_2_optionals_baviera_options,
            model_training_check=model_training_check
        )  # Scrapes the optionals columns for information regarding the GPS, Auto Transmission, Posterior Parking Sensors, External and Internal colours, Model and Rim's Size
        control_prints(df, '8', head=0)
        df = remove_rows(df, [df[df.Modelo.isnull()].index],
                         project_id,
                         warning=1)
        # NOTE(review): label '7' repeats an earlier control print — possibly
        # meant to be '8b'; confirm before relying on these labels.
        control_prints(df, '7')
        df = remove_columns(
            df, ['Colour_Ext_Code'], project_id
        )  # This column was only needed for some very specific cases where no Colour_Ext_Code was available;
        control_prints(df, '9')

        project_units_count_checkup(df,
                                    'Nº Stock',
                                    level_2_optionals_baviera_options,
                                    sql_check=1)

        df = color_replacement(
            df, level_2_optionals_baviera_options.colors_to_replace_dict,
            project_id)  # Translates all english colors to portuguese
        control_prints(df, '10')

        df = duplicate_removal(
            df, subset_col='Nº Stock'
        )  # Removes duplicate rows, based on the Stock number. This leaves one line per configuration;
        control_prints(df, '11')

        df = remove_columns(df, [
            'Cor', 'Interior', 'Opcional', 'Custo', 'Versão', 'Franchise_Code'
        ], project_id)  # Remove columns not needed atm;
        # Will probably need to also remove: stock_days, stock_days_norm, and one of the scores
        control_prints(df, '12')

        df = remove_rows(
            df,
            [df.loc[df['Local da Venda'] == 'DCV - Viat.Toy Viseu', :].index],
            project_id
        )  # Removes the vehicles sold here, as they are from another brand (Toyota)
        control_prints(df, '13')

        df = margin_calculation(
            df)  # Calculates the margin in percentage of the total price
        df = score_calculation(
            df, [level_2_optionals_baviera_options.stock_days_threshold],
            level_2_optionals_baviera_options.margin_threshold,
            level_2_optionals_baviera_options.project_id
        )  # Classifies the stockdays and margin based in their respective thresholds in tow classes (0 or 1) and then creates a new_score metric,
        # where only configurations with 1 in both dimension, have 1 as new_score
        # df = new_column_creation(df, ['Local da Venda_v2'], df['Local da Venda'])
        control_prints(df, '14', head=1)

        # Training runs use the SQL-stored mapping dictionaries; otherwise the
        # sale-place mappings are merged in from their dedicated tables.
        if model_training_check:
            cols_to_group_layer_2 = [
                'Jantes', 'Local da Venda', 'Local da Venda_v2', 'Modelo',
                'Versao', 'Tipo_Interior', 'Cor_Exterior', 'Cor_Interior',
                'Motor'
            ]
            mapping_dictionaries, _ = sql_mapping_retrieval(
                level_2_optionals_baviera_options.DSN_MLG_PRD,
                level_2_optionals_baviera_options.sql_info['database'],
                level_2_optionals_baviera_options.sql_info['mappings'],
                'Mapped_Value', level_2_optionals_baviera_options)
        else:
            mapping_sell_place = sql_retrieve_df(
                level_2_optionals_baviera_options.DSN_MLG_PRD,
                level_2_optionals_baviera_options.sql_info['database'],
                level_2_optionals_baviera_options.
                sql_info['mappings_sale_place'],
                level_2_optionals_baviera_options)
            mapping_sell_place_v2 = sql_retrieve_df(
                level_2_optionals_baviera_options.DSN_MLG_PRD,
                level_2_optionals_baviera_options.sql_info['database'],
                level_2_optionals_baviera_options.
                sql_info['mappings_sale_place_v2'],
                level_2_optionals_baviera_options)
            mapping_sell_place_fase2 = sql_retrieve_df(
                level_2_optionals_baviera_options.DSN_MLG_PRD,
                level_2_optionals_baviera_options.sql_info['database'],
                level_2_optionals_baviera_options.
                sql_info['mappings_sale_place_fase2'],
                level_2_optionals_baviera_options)

            # control_prints(df, 'mapping_testing_before', head=1)

            # v1 mapping: 'Local da Venda' -> 'Local da Venda_v1'.
            df = pd.merge(df,
                          mapping_sell_place,
                          left_on='Local da Venda',
                          right_on='Original_Value',
                          how='left')
            df = column_rename(df, ['Mapped_Value'], ['Local da Venda_v1'])
            df = df.drop(['Original_Value'], axis=1)
            nan_detection(df, 'Local da Venda', 'Local da Venda_v1')
            control_prints(df, 'mapping_testing_v1', head=1)

            # v2 mapping: 'Local da Venda' -> 'Local da Venda_v2'.
            df = pd.merge(df,
                          mapping_sell_place_v2,
                          left_on='Local da Venda',
                          right_on='Original_Value',
                          how='left')
            df = column_rename(df, ['Mapped_Value'], ['Local da Venda_v2'])
            df = df.drop(['Original_Value'], axis=1)
            nan_detection(df, 'Local da Venda', 'Local da Venda_v2')
            control_prints(df, 'mapping_testing_v2', head=1)

            # fase2 maps in the opposite direction, joining on Mapped_Value,
            # so the mapping values are lowercased to match the data.
            mapping_sell_place_fase2[
                'Mapped_Value'] = mapping_sell_place_fase2[
                    'Mapped_Value'].str.lower()

            # print(mapping_sell_place_fase2)
            df = pd.merge(df,
                          mapping_sell_place_fase2,
                          left_on='Local da Venda_v2',
                          right_on='Mapped_Value',
                          how='left')
            df = column_rename(df, ['Original_Value'],
                               ['Local da Venda_fase2'])
            df = df.drop(['Mapped_Value'], axis=1)
            nan_detection(df, 'Local da Venda_v2', 'Local da Venda_fase2')
            control_prints(df, 'after mapping fase2', head=1)

        df = df.drop(['Local da Venda'], axis=1)
        df = column_rename(df, ['Local da Venda_v1', 'Local da Venda_fase2'],
                           ['Local da Venda', 'Local da Venda_Fase2_level_2'])

        df = new_features(
            df, configuration_parameters, project_id
        )  # Creates a series of new features, explained in the provided pdf
        control_prints(df, 'after_renaming', head=1)
        global_variables_saving(
            df, level_2_optionals_baviera_options.project_id
        )  # Small functions to save 2 specific global variables which will be needed later

        log_record('Checkpoint B.1...', project_id)
    else:
        # Fresh checkpoint available: reload the processed table and restore
        # the original (code-side) column names.
        log_record('Checkpoint Found. Retrieving data...', project_id)
        df = sql_retrieve_df(
            level_2_optionals_baviera_options.DSN_MLG_PRD,
            level_2_optionals_baviera_options.sql_info['database'],
            level_2_optionals_baviera_options.sql_info['checkpoint_b_table'],
            level_2_optionals_baviera_options,
            list(level_2_optionals_baviera_options.
                 column_checkpoint_sql_renaming.values()))
        df = column_rename(
            df,
            list(level_2_optionals_baviera_options.
                 column_checkpoint_sql_renaming.values()),
            list(level_2_optionals_baviera_options.
                 column_checkpoint_sql_renaming.keys()))

    log_record('Fim Secção B.', project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df
Exemple #14
0
def get_data_sql(options_file_in, db, view):
    """Retrieve a view from the MLG PRD server and index it by the 'Actual' column."""
    retrieved = level_1_a_data_acquisition.sql_retrieve_df(
        options_file_in.DSN_MLG_PRD, db, view, options_file_in)

    return retrieved.set_index('Actual')