Code example #1
def flow_step_3(df):
    log_record('Step 3 started.', options_file.project_id)

    # Step 0
    lower_case_cols = ['Part_Desc_PT', 'Part_Desc']
    df = lowercase_column_conversion(df.copy(), lower_case_cols)

    # Step 1
    df = trim_columns(df.copy(), lower_case_cols)

    # Step 1.5
    text_cols = ['Part_Desc_PT', 'Part_Desc']
    for col in text_cols:
        df[col] = df[col].apply(unidecode)
        df[col] = df[col].apply(coerce_to_unicode)
        df[col] = df[col].apply(remove_punctuations)
        df[col] = df[col].apply(lambda x: ' '.join([word for word in x.split() if word not in options_file.stop_words_common]))

    # Step 2
    df = feature_engineering_counts(df)
    # df = feature_engineering_part_desc(df)

    # Step 3
    # Removes numbers
    df['Part_Desc_PT'] = df['Part_Desc_PT'].str.replace(r'\d+', ' ', regex=True)
    df['Part_Desc'] = df['Part_Desc'].str.replace(r'\d+', ' ', regex=True)

    # Step 3.5
    # Creates keywords data
    part_desc_keywords = keyword_generation(df, 'Part_Desc')
    part_desc_pt_keywords = keyword_generation(df, 'Part_Desc_PT')
    keywords = pd.concat([part_desc_keywords, part_desc_pt_keywords])
    keywords.to_csv('dbs/keywords_per_product_group_dw.csv', index=False)

    # Step 4
    df = product_group_dw_corrections_on_desc(df.copy())

    # Step 5
    rows_to_remove_regex_filter_1 = r'(?<=^B.{11})AT$|(?<=^B.{15})AT$'
    rows_to_remove_regex_filter_2 = r'(?<=^BM.{11})AT$|(?<=^BM.{15})AT$'
    filter_3 = df['Part_Ref'].str.contains(rows_to_remove_regex_filter_1, na=False)
    filter_4 = df['Part_Ref'].str.contains(rows_to_remove_regex_filter_2, na=False)
    df = df[~filter_3 & ~filter_4]

    # Step 6
    df.loc[df['Part_Desc'] == df['Part_Desc_Copy'], 'Part_Desc_Copy'] = np.nan
    # df.loc[df['Part_Desc'] != df['Part_Desc_Copy'], 'Part_Desc_Copy'] = df['Product_Group_DW']
    df.loc[df['Part_Desc'] != df['Part_Desc_Copy'], 'Part_Desc_Copy'] = df['Part_Desc_Copy']  # Effectively a no-op (assigns the column to itself); the commented alternative above assigns Product_Group_DW instead

    # Step 7
    df.loc[df['Part_Desc_Copy'].isnull(), 'New_Product_Group_DW'] = df.loc[df['Part_Desc_Copy'].isnull(), 'Product_Group_DW']
    df.loc[~df['Part_Desc_Copy'].isnull(), 'New_Product_Group_DW'] = df.loc[~df['Part_Desc_Copy'].isnull(), 'Part_Desc_Copy']

    # Step 8
    df.drop(['Product_Group_DW', 'Part_Desc_Copy'], axis=1, inplace=True)

    # Step 9
    df.rename(columns={'New_Product_Group_DW': 'Product_Group_DW'}, inplace=True)

    # Step 10
    df = product_group_dw_complete_replaces(df.copy())

    log_record('Step 3 ended.', options_file.project_id)
    return df, keywords
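
The helpers called in Steps 0-1.5 (lowercase_column_conversion, trim_columns, remove_punctuations) are not shown here. Below is a minimal sketch of plausible implementations, assuming they act on pandas string columns; the names match the calls above, but the bodies are illustrative guesses rather than the project's actual code.

import string


# Assumed behavior of the undocumented helpers used above, for illustration only
def lowercase_column_conversion(df, columns):
    # Lowercases the string values of the given columns and returns the frame
    for col in columns:
        df[col] = df[col].str.lower()
    return df


def trim_columns(df, columns):
    # Strips leading/trailing whitespace from the given string columns
    for col in columns:
        df[col] = df[col].str.strip()
    return df


def remove_punctuations(text):
    # Removes all ASCII punctuation from a single string value
    return text.translate(str.maketrans('', '', string.punctuation))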
Code example #2
def data_processing(df_facts, df_facts_duration, df_clients, df_pbi_categories, keywords_df):
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', options_file.project_id)

    dict_strings_to_replace = {
        ('Description', 'filesibmcognoscbindatacqertmodelsfdfdeeacebedeabeeabbedrtm'): 'files ibm cognos',
        ('Description', 'cognosapv'): 'cognos apv',
        ('Description', 'caetanoautopt'): 'caetano auto pt',
        ('Description', 'autolinecognos'): 'autoline cognos',
        ('Description', 'realnao'): 'real nao',
        ('Description', 'booksytner'): 'book sytner',
    }  # ('Description', 'http://'): 'http://www.', ('Summary', 'http://'): 'http://www.'

    # Removes requests belonging to PBI categories
    log_record('Contagem inicial de pedidos: {}'.format(df_facts['Request_Num'].nunique()), options_file.project_id)
    pbi_categories = remove_rows(df_pbi_categories.copy(), [df_pbi_categories[~df_pbi_categories['Category_Name'].str.contains('Power BI')].index], options_file.project_id)['Category_Id'].values  # Selects the Category IDs which belong to PBI
    log_record('Contagem de pedidos PBI: {}'.format(df_facts[df_facts['Category_Id'].isin(pbi_categories)]['Request_Num'].nunique()), options_file.project_id)
    df_facts = remove_rows(df_facts, [df_facts.loc[df_facts['Category_Id'].isin(pbi_categories)].index], options_file.project_id)  # Removes the rows which belong to PBI;
    log_record('Após o filtro de pedidos PBI, a nova contagem é de: {}'.format(df_facts['Request_Num'].nunique()), options_file.project_id)

    # Lowercase conversion of Summary and Description
    df_facts = lowercase_column_conversion(df_facts, columns=['Summary', 'Description'])

    # Addition of Client/Assignee Information and imputation of some missing values
    df_facts = df_join_function(df_facts, df_facts_duration.set_index('Request_Num'), on='Request_Num')
    df_facts = df_join_function(df_facts, df_clients.set_index('Contact_Id'), on='Contact_Customer_Id')
    df_facts = value_replacement(df_facts, options_file.assignee_id_replacements)
    df_facts = df_join_function(df_facts, df_clients.set_index('Contact_Id'), on='Contact_Assignee_Id', lsuffix='_Customer', rsuffix='_Assignee')
    df_facts = value_replacement(df_facts, options_file.sla_resolution_hours_replacements)

    # Collection of all Client/Assignee possible names
    unique_clients_names_decoded = string_to_list(df_facts, ['Name_Customer'])
    unique_clients_login_decoded = string_to_list(df_facts, ['Login_Name_Customer'])
    unique_assignee_names_decoded = string_to_list(df_facts, ['Name_Assignee'])
    unique_assignee_login_decoded = string_to_list(df_facts, ['Login_Name_Assignee'])

    # Imputation of missing values for Name_Assignee Column
    df_facts = null_handling(df_facts, {'Name_Assignee': 'Fechados pelo Cliente'})

    # Replaces the resolve date with the close date when the former is null and the latter exists
    df_facts = value_substitution(df_facts, non_null_column='Close_Date', null_column='Resolve_Date')

    # df_facts = df_facts.groupby('Request_Num').apply(close_and_resolve_date_replacements)  # Currently doing nothing, hence why it's commented

    # Removes duplicate request numbers
    df_facts = duplicate_removal(df_facts, ['Request_Num'])

    # Removes new lines, tabs, etc;
    df_facts = literal_removal(df_facts, 'Description')

    # Replaces string errors, specified in the provided dictionary
    df_facts = string_replacer(df_facts, dict_strings_to_replace)

    df_facts = value_replacement(df_facts, {'Description': options_file.regex_dict['url']})
    df_facts = value_replacement(df_facts, {'Summary': options_file.regex_dict['url']})
    df_facts = value_substitution(df_facts, non_null_column='Summary', null_column='Description')  # Replaces the description with the summary when the former is null and the latter exists

    df_facts = language_detection(df_facts, 'Description', 'Language')
    df_facts = string_replacer(df_facts, {('Language', 'ca'): 'es', ('Category_Id', 'pcat:'): ''})

    df_facts = summary_description_null_checkup(df_facts)  # Cleans requests which have the Summary and Description null

    stop_words_list = options_file.words_to_remove_from_description + unique_clients_names_decoded + unique_clients_login_decoded + unique_assignee_names_decoded + unique_assignee_login_decoded
    df_facts['Description'] = df_facts['Description'].apply(stop_words_removal, args=(stop_words_list,))

    if similar_process_flag:
        df_facts = similar_words_handling(df_facts, keywords_df, options_file.testing_dict)

    df_facts = text_preprocess(df_facts, unique_clients_names_decoded + unique_clients_login_decoded + unique_assignee_names_decoded + unique_assignee_login_decoded, options_file)

    df_facts = value_replacement(df_facts, options_file.language_replacements)

    # Checkpoint B.1 - Key Words data frame creation

    df_facts, df_top_words = top_words_processing(df_facts, description_col='StemmedDescription')

    log_record('Após o processamento a contagem de pedidos é de: {}'.format(df_facts['Request_Num'].nunique()), options_file.project_id)
    log_record('Fim Secção B.', options_file.project_id)
    performance_info_append(time.time(), 'Section_B_End')

    return df_facts, df_top_words
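
value_substitution is used twice in this example with non_null_column/null_column keyword arguments. A minimal sketch of what it plausibly does, assuming it simply fills the null column from the non-null one (an assumption, not the project's real implementation):

def value_substitution(df, non_null_column=None, null_column=None):
    # Assumed behavior: where null_column is missing and non_null_column
    # has a value, copy the latter into the former
    mask = df[null_column].isnull() & df[non_null_column].notnull()
    df.loc[mask, null_column] = df.loc[mask, non_null_column]
    return df

Under that assumption, value_substitution(df_facts, non_null_column='Close_Date', null_column='Resolve_Date') fills missing resolve dates with the close date, which matches the comment at that call site.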
Code example #3
def data_processing(df, target_variable, oversample_check, number_of_features):
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', project_id)
    model_mapping = {}

    if sql_date_comparison(
            level_2_optionals_baviera_options.DSN_MLG_PRD,
            level_2_optionals_baviera_options,
            level_2_optionals_baviera_options.sql_info['database'],
            level_2_optionals_baviera_options.sql_info['checkpoint_b_table'],
            'Date', level_2_optionals_baviera_options.update_frequency_days):
        log_record(
            'Checkpoint não encontrado ou demasiado antigo. A processar dados...',
            project_id)

        df = lowercase_column_conversion(
            df, ['Opcional', 'Cor', 'Interior', 'Versão'
                 ])  # Lowercases the strings of these columns

        control_prints(df, '1', head=1)
        dict_strings_to_replace = {
            ('Modelo', ' - não utilizar'): '',
            ('Interior', '\\|'): '/',
            ('Cor', '\\|'): '',
            ('Interior', 'ind.'): '',
            ('Interior', '\\]'): '/',
            ('Interior', '\\.'): ' ',
            ('Interior', '\'merino\''): 'merino',
            ('Interior', '\' merino\''): 'merino',
            ('Interior', '\'vernasca\''): 'vernasca',
            ('Interior', 'leder'): 'leather',
            ('Interior', 'p '): 'pele',
            ('Interior', 'pelenevada'): 'pele nevada',
            ('Opcional', 'bi-xénon'): 'bixénon',
            ('Opcional', 'vidro'): 'vidros',
            ('Opcional', 'dacota'): 'dakota',
            ('Opcional', 'whites'): 'white',
            ('Opcional', 'beige'): 'bege',
            ('Interior', '\'dakota\''): 'dakota',
            ('Interior', 'dacota'): 'dakota',
            ('Interior', 'mokka'): 'mocha',
            ('Interior', 'beige'): 'bege',
            ('Interior', 'dakota\''): 'dakota',
            ('Interior', 'antracite/cinza/p'): 'antracite/cinza/preto',
            ('Interior', 'antracite/cinza/pretoreto'): 'antracite/cinza/preto',
            ('Interior', 'nevada\''): 'nevada',
            ('Interior', '"nappa"'): 'nappa',
            ('Interior', 'anthrazit'): 'antracite',
            ('Interior', 'antracito'): 'antracite',
            ('Interior', 'preto/laranja/preto/lara'): 'preto/laranja',
            ('Interior', 'anthtacite'): 'antracite',
            ('Interior', 'champag'): 'champagne',
            ('Interior', 'cri'): 'crimson',
            ('Modelo', 'Enter Model Details'): '',
            ('Registration_Number', '\\.'): '',
            ('Interior', 'preto/m '): 'preto ',
            ('Interior', 'congnac/preto'): 'cognac/preto',
            ('Local da Venda', 'DCN'): 'DCP',
            ('Cor', 'blue\\|'): 'azul'
        }

        df = string_replacer(
            df, dict_strings_to_replace
        )  # Replaces the strings mentioned in dict_strings_to_replace which are typos, useless information, etc
        control_prints(df, '2', head=1)

        df.dropna(axis=0, inplace=True)  # Removes all remaining NA's
        control_prints(df, '3')

        df = new_column_creation(
            df, [
                x for x in level_2_optionals_baviera_options.
                configuration_parameters_full if x != 'Modelo'
            ], 0
        )  # Creates new columns filled with zeros, which will be filled in the future

        dict_cols_to_take_date_info = {'buy_': 'Data Compra'}
        df = date_cols(
            df, dict_cols_to_take_date_info
        )  # Creates columns for the datetime columns of dict_cols_to_take_date_info, with just the day, month and year
        df = total_price(
            df
        )  # Creates a new column with the total cost for each configuration;
        df = remove_zero_price_total_vhe(
            df, project_id
        )  # Removes VHE with a price total of 0; ToDo: keep checking up if this is still necessary
        control_prints(df, '4')
        df = remove_rows(
            df, [df[df.Modelo.str.contains('MINI')].index], project_id
        )  # No need for Prov filtering, as it is already filtered in the data source;
        control_prints(df, '5')
        df = remove_rows(
            df, [df[df.Franchise_Code.str.contains('T|Y|R|G|C|175')].index],
            project_id
        )  # This removes Toyota Vehicles that aren't supposed to be in this model
        control_prints(df, '6')
        df = remove_rows(
            df, [df[(df.Colour_Ext_Code == ' ') & (df.Cor == ' ')].index],
            project_id,
            warning=1)
        control_prints(df, '7')

        df = options_scraping(
            df,
            level_2_optionals_baviera_options,
            model_training_check=model_training_check
        )  # Scrapes the optionals columns for information regarding the GPS, Auto Transmission, Posterior Parking Sensors, External and Internal colours, Model and Rim's Size
        control_prints(df, '8', head=0)
        df = remove_rows(df, [df[df.Modelo.isnull()].index],
                         project_id,
                         warning=1)
        control_prints(df, '8b')
        df = remove_columns(
            df, ['Colour_Ext_Code'], project_id
        )  # This column was only needed for some very specific cases where no Colour_Ext_Code was available;
        control_prints(df, '9')

        project_units_count_checkup(df,
                                    'Nº Stock',
                                    level_2_optionals_baviera_options,
                                    sql_check=1)

        df = color_replacement(
            df, level_2_optionals_baviera_options.colors_to_replace_dict,
            project_id)  # Translates all english colors to portuguese
        control_prints(df, '10')

        df = duplicate_removal(
            df, subset_col='Nº Stock'
        )  # Removes duplicate rows, based on the Stock number. This leaves one line per configuration;
        control_prints(df, '11')

        df = remove_columns(df, [
            'Cor', 'Interior', 'Opcional', 'Custo', 'Versão', 'Franchise_Code'
        ], project_id)  # Remove columns not needed atm;
        # Will probably need to also remove: stock_days, stock_days_norm, and one of the scores
        control_prints(df, '12')

        df = remove_rows(
            df,
            [df.loc[df['Local da Venda'] == 'DCV - Viat.Toy Viseu', :].index],
            project_id
        )  # Removes the vehicles sold here, as they are from another brand (Toyota)
        control_prints(df, '13')

        df = margin_calculation(
            df)  # Calculates the margin in percentage of the total price
        df = score_calculation(
            df, [level_2_optionals_baviera_options.stock_days_threshold],
            level_2_optionals_baviera_options.margin_threshold,
            level_2_optionals_baviera_options.project_id
        )  # Classifies the stock days and margin based on their respective thresholds into two classes (0 or 1) and then creates a new_score metric,
        # where only configurations with 1 in both dimensions have 1 as new_score
        # df = new_column_creation(df, ['Local da Venda_v2'], df['Local da Venda'])
        control_prints(df, '14', head=1)

        if model_training_check:
            cols_to_group_layer_2 = [
                'Jantes', 'Local da Venda', 'Local da Venda_v2', 'Modelo',
                'Versao', 'Tipo_Interior', 'Cor_Exterior', 'Cor_Interior',
                'Motor'
            ]
            mapping_dictionaries, _ = sql_mapping_retrieval(
                level_2_optionals_baviera_options.DSN_MLG_PRD,
                level_2_optionals_baviera_options.sql_info['database'],
                level_2_optionals_baviera_options.sql_info['mappings'],
                'Mapped_Value', level_2_optionals_baviera_options)
        else:
            mapping_sell_place = sql_retrieve_df(
                level_2_optionals_baviera_options.DSN_MLG_PRD,
                level_2_optionals_baviera_options.sql_info['database'],
                level_2_optionals_baviera_options.
                sql_info['mappings_sale_place'],
                level_2_optionals_baviera_options)
            mapping_sell_place_v2 = sql_retrieve_df(
                level_2_optionals_baviera_options.DSN_MLG_PRD,
                level_2_optionals_baviera_options.sql_info['database'],
                level_2_optionals_baviera_options.
                sql_info['mappings_sale_place_v2'],
                level_2_optionals_baviera_options)
            mapping_sell_place_fase2 = sql_retrieve_df(
                level_2_optionals_baviera_options.DSN_MLG_PRD,
                level_2_optionals_baviera_options.sql_info['database'],
                level_2_optionals_baviera_options.
                sql_info['mappings_sale_place_fase2'],
                level_2_optionals_baviera_options)

            # control_prints(df, 'mapping_testing_before', head=1)

            df = pd.merge(df,
                          mapping_sell_place,
                          left_on='Local da Venda',
                          right_on='Original_Value',
                          how='left')
            df = column_rename(df, ['Mapped_Value'], ['Local da Venda_v1'])
            df = df.drop(['Original_Value'], axis=1)
            nan_detection(df, 'Local da Venda', 'Local da Venda_v1')
            control_prints(df, 'mapping_testing_v1', head=1)

            df = pd.merge(df,
                          mapping_sell_place_v2,
                          left_on='Local da Venda',
                          right_on='Original_Value',
                          how='left')
            df = column_rename(df, ['Mapped_Value'], ['Local da Venda_v2'])
            df = df.drop(['Original_Value'], axis=1)
            nan_detection(df, 'Local da Venda', 'Local da Venda_v2')
            control_prints(df, 'mapping_testing_v2', head=1)

            mapping_sell_place_fase2[
                'Mapped_Value'] = mapping_sell_place_fase2[
                    'Mapped_Value'].str.lower()

            # print(mapping_sell_place_fase2)
            df = pd.merge(df,
                          mapping_sell_place_fase2,
                          left_on='Local da Venda_v2',
                          right_on='Mapped_Value',
                          how='left')
            df = column_rename(df, ['Original_Value'],
                               ['Local da Venda_fase2'])
            df = df.drop(['Mapped_Value'], axis=1)
            nan_detection(df, 'Local da Venda_v2', 'Local da Venda_fase2')
            control_prints(df, 'after mapping fase2', head=1)

        df = df.drop(['Local da Venda'], axis=1)
        # Note: 'Local da Venda_v1' and 'Local da Venda_fase2' are only created in the else branch above, so this rename assumes model_training_check is falsy
        df = column_rename(df, ['Local da Venda_v1', 'Local da Venda_fase2'],
                           ['Local da Venda', 'Local da Venda_Fase2_level_2'])

        df = new_features(
            df, configuration_parameters, project_id
        )  # Creates a series of new features, explained in the provided pdf
        control_prints(df, 'after_renaming', head=1)
        global_variables_saving(
            df, level_2_optionals_baviera_options.project_id
        )  # Small functions to save 2 specific global variables which will be needed later

        log_record('Checkpoint B.1...', project_id)
    else:
        log_record('Checkpoint Found. Retrieving data...', project_id)
        df = sql_retrieve_df(
            level_2_optionals_baviera_options.DSN_MLG_PRD,
            level_2_optionals_baviera_options.sql_info['database'],
            level_2_optionals_baviera_options.sql_info['checkpoint_b_table'],
            level_2_optionals_baviera_options,
            list(level_2_optionals_baviera_options.
                 column_checkpoint_sql_renaming.values()))
        df = column_rename(
            df,
            list(level_2_optionals_baviera_options.
                 column_checkpoint_sql_renaming.values()),
            list(level_2_optionals_baviera_options.
                 column_checkpoint_sql_renaming.keys()))

    log_record('Fim Secção B.', project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df
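
string_replacer carries most of the cleaning in this example, taking a dictionary keyed by (column, pattern) pairs. The escaped entries such as '\\|', '\\]' and '\\.' suggest the patterns are treated as regular expressions. A minimal sketch consistent with that usage (assumed, not the project's actual code):

def string_replacer(df, replacements):
    # Assumed behavior: each (column, pattern) key is applied to its column
    # as a regex substitution with the mapped replacement string
    for (col, pattern), replacement in replacements.items():
        df[col] = df[col].str.replace(pattern, replacement, regex=True)
    return df

With sequential application, dictionary insertion order matters: a broad pattern such as ('Versão', 'high') can rewrite values before the more specific typo fixes for 'hightline' or 'highlin' get a chance to run.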
Code example #4
def data_processing(df):
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', project_id)

    log_record(
        'Checkpoint não encontrado ou demasiado antigo. A processar dados...',
        project_id)

    df = lowercase_column_conversion(
        df, ['Opcional', 'Cor', 'Interior', 'Versão'
             ])  # Lowercases the strings of these columns

    dict_strings_to_replace = {
        ('Modelo', ' - não utilizar'): '',
        ('Interior', '\\|'): '/',
        ('Cor', '\\|'): '',
        ('Interior', 'ind.'): '',
        ('Interior', '\\]'): '/',
        ('Interior', '\\.'): ' ',
        ('Interior', '\'merino\''): 'merino',
        ('Interior', '\' merino\''): 'merino',
        ('Interior', '\'vernasca\''): 'vernasca',
        ('Interior', 'leder'): 'leather',
        ('Interior', 'p '): 'pele',
        ('Interior', 'pelenevada'): 'pele nevada',
        ('Opcional', 'bi-xénon'): 'bixénon',
        ('Opcional', 'bi-xenon'): 'bixénon',
        ('Opcional', 'vidro'): 'vidros',
        ('Opcional', 'dacota'): 'dakota',
        ('Opcional', 'whites'): 'white',
        ('Opcional', 'beige'): 'bege',
        ('Interior', '\'dakota\''): 'dakota',
        ('Interior', 'dacota'): 'dakota',
        ('Interior', 'mokka'): 'mocha',
        ('Interior', 'beige'): 'bege',
        ('Interior', 'dakota\''): 'dakota',
        ('Interior', 'antracite/cinza/p'): 'antracite/cinza/preto',
        ('Interior', 'antracite/cinza/pretoreto'): 'antracite/cinza/preto',
        ('Interior', 'nevada\''): 'nevada',
        ('Interior', '"nappa"'): 'nappa',
        ('Interior', 'anthrazit'): 'antracite',
        ('Interior', 'antracito'): 'antracite',
        ('Interior', 'preto/laranja/preto/lara'): 'preto/laranja',
        ('Interior', 'anthtacite'): 'antracite',
        ('Interior', 'champag'): 'champagne',
        ('Interior', 'cri'): 'crimson',
        ('Modelo', 'Enter Model Details'): '',
        ('Registration_Number', '\\.'): '',
        ('Interior', 'preto/m '): 'preto ',
        ('Interior', 'congnac/preto'): 'cognac/preto',
        ('Local da Venda', 'DCN'): 'DCP',
        ('Cor', 'oceanao'): 'oceano',
        ('Cor', 'ocenao'): 'oceano',
        ('Interior', 'reto'): 'preto',
        ('Cor', 'banco'): 'branco',
        ('Cor', 'catanho'): 'castanho',
        ('Cor', 'petrìleo'): 'petróleo',
        ('Interior', 'ecido'): 'tecido',
        ('Interior', 'ege'): 'bege',
        ('Interior', 'inza'): 'cinza',
        ('Interior', 'inzento'): 'cinzento',
        ('Interior', 'teciso'): 'tecido',
        ('Opcional', 'autmático'): 'automático',
        ('Opcional', 'esctacionamento'): 'estacionamento',
        ('Opcional', 'estacionamernto'): 'estacionamento',
        ('Opcional', 'pct'): 'pacote',
        ('Opcional', 'navegaçãp'): 'navegação',
        ('Opcional', '\\+'): '',
        ('Versão', 'bussiness'): 'business',
        ('Versão', 'r-line'): 'rline',
        ('Versão', 'confortl'): 'confortline',
        ('Versão', 'high'): 'highline',
        ('Opcional', 'p/dsg'): 'para dsg',
        ('Opcional', 'dianteirostraseiros'): 'dianteiros traseiros',
        ('Opcional', 'dianteirostras'): 'dianteiros traseiros',
        ('Opcional', 'diant'): 'dianteiros',
        ('Opcional', 'dttras'): 'dianteiros traseiros',
        ('Opcional', 'dttrpark'): 'dianteiros traseiros park',
        ('Opcional', 'dianttras'): 'dianteiros traseiros',
        ('Opcional', 'câmara'): 'camara',
        ('Opcional', 'camera'): 'camara',
        ('Opcional', 'câmera'): 'camara',
        ('Versão', 'trendtline'): 'trendline',
        ('Versão', 'confort'): 'confortline',
        ('Versão', 'conftl'): 'confortline',
        ('Versão', 'hightline'): 'highline',
        ('Versão', 'bluem'): 'bluemotion',
        ('Versão', 'bmt'): 'bluemotion',
        ('Versão', 'up!bluemotion'): 'up! bluemotion',
        ('Versão', 'up!bluem'): 'up! bluemotion',
        ('Versão', 'trendl'): 'trendline',
        ('Versão', 'conft'): 'confortline',
        ('Versão', 'highlin'): 'highline',
        ('Versão', 'confortine'): 'confortline',
        ('Versão', 'cofrtl'): 'confortline',
        ('Versão', 'confortlline'): 'confortline',
        ('Versão', 'highl'): 'highline',
        ('Modelo', 'up!'): 'up'
    }

    control_prints(df, '1', head=1)
    df = string_replacer(
        df, dict_strings_to_replace
    )  # Replaces the strings mentioned in dict_strings_to_replace which are typos, useless information, etc
    control_prints(df, '1b', head=1)
    df.dropna(subset=['Cor', 'Colour_Ext_Code', 'Modelo', 'Interior'],
              axis=0,
              inplace=True)  # Removes rows with missing values in these columns
    control_prints(df, '2')

    df = new_column_creation(
        df, [
            x for x in
            level_2_optionals_cdsu_options.configuration_parameters_full
            if x != 'Modelo' and x != 'Combustível'
        ], 0
    )  # Creates new columns filled with zeros, which will be filled in the future

    df = total_price(
        df)  # Creates a new column with the total cost for each configuration;
    control_prints(df, '3a', head=0)

    df = remove_zero_price_total_vhe(
        df, project_id
    )  # Removes VHE with a price total of 0; ToDo: keep checking up if this is still necessary
    control_prints(df, '3b', head=0)

    df = remove_rows(
        df, [df[df.Franchise_Code.str.contains('X')].index], project_id
    )  # This removes VW Commercial Vehicles that aren't supposed to be in this model
    df = remove_rows(df,
                     [df[(df.Colour_Ext_Code == ' ') & (df.Cor == ' ')].index],
                     project_id,
                     warning=1)
    control_prints(df, '3c')

    df = options_scraping_v2(
        df, level_2_optionals_cdsu_options, 'Modelo'
    )  # Scrapes the optionals columns for information regarding the GPS, Auto Transmission, Posterior Parking Sensors, External and Internal colours, Model and Rim's Size
    control_prints(df, '3d', head=1, null_analysis_flag=1)
    df.loc[
        df['Combustível'].isin(['Elétrico', 'Híbrido']),
        'Motor'] = 'N/A'  # Defaults the value of motorization for electric/hybrid cars;
    control_prints(df, '4', head=0, save=1)

    # df = remove_rows(df, [df[df.Modelo.isnull()].index], project_id, warning=1)
    df = remove_columns(
        df, ['Colour_Ext_Code'], project_id
    )  # This column was only needed for some very specific cases where no Colour_Ext_Code was available;
    df.to_csv('dbs/df_cdsu.csv', index=False)
    control_prints(df, '5', head=0, save=1)

    # project_units_count_checkup(df, 'Nº Stock', level_2_optionals_cdsu_options, sql_check=1)

    df = color_replacement(
        df, level_2_optionals_cdsu_options.colors_to_replace_dict,
        project_id)  # Translates all english colors to portuguese
    control_prints(df, '6', head=0, save=1)

    df = duplicate_removal(
        df, subset_col='Nº Stock'
    )  # Removes duplicate rows, based on the Stock number. This leaves one line per configuration;
    control_prints(df, '7')

    df = remove_columns(
        df,
        ['Cor', 'Interior', 'Opcional', 'Custo', 'Versão', 'Franchise_Code'],
        project_id)  # Remove columns not needed atm;
    # Will probably need to also remove: stock_days, stock_days_norm, and one of the scores

    # df = remove_rows(df, [df.loc[df['Local da Venda'] == 'DCV - Viat.Toy Viseu', :].index], project_id)  # Removes the vehicles sold here, as they are from another brand (Toyota)

    df = margin_calculation(
        df)  # Calculates the margin in percentage of the total price
    control_prints(df, '8')

    df = score_calculation(
        df, [level_2_optionals_cdsu_options.stock_days_threshold],
        level_2_optionals_cdsu_options.margin_threshold,
        level_2_optionals_cdsu_options.project_id
    )  # Classifies the stock days and margin based on their respective thresholds into two classes (0 or 1) and then creates a new_score metric,
    # where only configurations with 1 in both dimensions have 1 as new_score
    control_prints(df, '9')

    # df = new_column_creation(df, ['Local da Venda_v2'], df['Local da Venda'])
    # control_prints(df, '10')

    # cols_to_group_layer_2 = ['Local da Venda']
    # mapping_dictionaries, _ = sql_mapping_retrieval(level_2_optionals_cdsu_options.DSN_MLG_PRD, level_2_optionals_cdsu_options.sql_info['database'], level_2_optionals_cdsu_options.sql_info['mappings_temp'], 'Mapped_Value', level_2_optionals_cdsu_options)
    # df = sell_place_parametrization(df, 'Local da Venda', 'Local da Venda_Fase2', mapping_dictionaries[2], level_2_optionals_cdsu_options.project_id)

    # df = col_group(df, cols_to_group_layer_2[0:2], mapping_dictionaries[0:2], project_id)  # Based on the information provided by Manuel some entries were grouped as to remove small groups. The columns grouped are mentioned in cols_to_group, and their respective groups are shown in level_2_optionals_cdsu_options

    control_prints(df, '9b, before new features', null_analysis_flag=1)
    df = new_features(
        df, configuration_parameters, project_id
    )  # Creates a series of new features, explained in the provided pdf
    control_prints(df, '10, after new_features', null_analysis_flag=1)

    # global_variables_saving(df, level_2_optionals_cdsu_options.project_id)  # Small functions to save 2 specific global variables which will be needed later

    log_record('Checkpoint B.1...', project_id)
    # performance_info_append(time.time(), 'checkpoint_b1')
    df = column_rename(
        df,
        list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.
             keys()),
        list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.
             values()))
    # sql_inject(df, level_2_optionals_cdsu_options.DSN_MLG_PRD, level_2_optionals_cdsu_options.sql_info['database'], level_2_optionals_cdsu_options.sql_info['checkpoint_b_table'], level_2_optionals_cdsu_options, list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.values()), truncate=1, check_date=1)
    df = column_rename(
        df,
        list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.
             values()),
        list(level_2_optionals_cdsu_options.column_checkpoint_sql_renaming.
             keys()))
    df = remove_columns(df, ['Date'], project_id)

    log_record('Fim Secção B.', project_id)

    performance_info_append(time.time(), 'Section_B_End')

    return df
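
remove_rows and remove_columns appear throughout examples #3 and #4, taking the row indexes or column names to drop plus a project id, presumably for logging. A hedged sketch of the likely behavior (the real implementations also log via log_record, which is not reproduced here):

def remove_rows(df, rows_to_remove, project_id, warning=0):
    # Assumed behavior: drop every index collection passed in; project_id
    # and warning are presumably consumed by the project's logging helper
    for rows in rows_to_remove:
        df = df.drop(rows, axis=0)
    return df


def remove_columns(df, columns, project_id):
    # Assumed behavior: drop the listed columns
    return df.drop(columns, axis=1)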
Code example #5
def data_processing(df_sales, df_pdb_dim, configuration_parameters_cols,
                    range_dates, target):
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', options_file.project_id)
    current_date, _ = time_tags()

    try:
        category_dtypes = {
            'NDB_VATGroup_Desc': 'category',
            'VAT_Number_Display': 'category',
            'NDB_Contract_Dealer_Desc': 'category',
            'NDB_VHE_PerformGroup_Desc': 'category',
            'NDB_VHE_Team_Desc': 'category',
            'Customer_Display': 'category',
            'Customer_Group_Desc': 'category',
            'SLR_Account_Dealer_Code': 'category',
            'Product_Code': 'category',
            'Sales_Type_Dealer_Code': 'category',
            'Sales_Type_Code': 'category',
            'Vehicle_Type_Code': 'category',
            'Fuel_Type_Code': 'category',
            'PT_PDB_Model_Desc': 'category',
            'PT_PDB_Engine_Desc': 'category',
            'PT_PDB_Transmission_Type_Desc': 'category',
            'PT_PDB_Version_Desc': 'category',
            'PT_PDB_Exterior_Color_Desc': 'category',
            'PT_PDB_Interior_Color_Desc': 'category',
            'NDB_Dealer_Code': 'category'
        }  # Shared dtype mapping for the OHE and non-OHE datasets
        df_ohe = read_csv(
            'dbs/df_hyundai_dataset_ml_version_ohe_{}.csv'.format(
                current_date),
            index_col=0,
            dtype=category_dtypes)
        df_non_ohe = read_csv(
            'dbs/df_hyundai_dataset_ml_version_{}.csv'.format(current_date),
            index_col=0,
            dtype=category_dtypes)
        df_sales = read_csv(
            'dbs/df_hyundai_dataset_all_info_{}.csv'.format(current_date),
            index_col=0,
            dtype={
                'SLR_Account_Dealer_Code': object,
                'Immobilized_Number': object
            },
            parse_dates=options_file.date_columns)

        log_record(
            'Dados do dia atual foram encontrados. A passar para a próxima secção...',
            options_file.project_id)
    except FileNotFoundError:
        log_record('Dados do dia atual não foram encontrados. A processar...',
                   options_file.project_id)

        # Step 1 - Dataset cleaning and transforming to 1 line per sale
        columns_to_convert_to_datetime = [
            'Ship_Arrival_Date', 'SLR_Document_Date_CHS',
            'Registration_Request_Date', 'SLR_Document_Date_RGN'
        ]
        for column in columns_to_convert_to_datetime:
            df_sales[column] = pd.to_datetime(df_sales[column])

        # Filtering
        log_record(
            '1 - Contagem Inicial de Chassis únicos: {}'.format(
                df_sales['Chassis_Number'].nunique()), options_file.project_id)
        log_record(
            '1 - Contagem Inicial de Matrículas únicas: {}'.format(
                df_sales['Registration_Number'].nunique()),
            options_file.project_id)

        print('Removal of the 49-VG-94 registration plate, which is associated with two Chassis Numbers')
        df_sales = df_sales[~(
            df_sales['Registration_Number'] == '49-VG-94')].copy()

        # Sorting
        df_sales = df_sales.sort_values([
            'Ship_Arrival_Date', 'SLR_Document_Date_CHS',
            'Registration_Request_Date', 'SLR_Document_Date_RGN'
        ])

        df_sales['No_Registration_Number_Flag'] = 0
        df_sales['Registration_Number_No_SLR_Document_RGN_Flag'] = 0
        df_sales['SLR_Document_RGN_Flag'] = 0
        df_sales['Undefined_VHE_Status'] = 0

        df_sales_grouped_3 = df_sales.groupby(
            ['Chassis_Number', 'Registration_Number'])
        df_sales = na_fill_hyundai(df_sales_grouped_3)

        # New Column Creation
        # df_sales_grouped = df_sales.groupby(['VehicleData_Code'])
        # df_sales['Quantity_Sold'] = df_sales_grouped['Quantity_CHS'].transform('sum')
        # df_sales['Quantity_Sold'] = df_sales['Quantity_Sold'].astype(np.int64, errors='ignore')

        # df_sales_unique_chassis = df_sales.drop_duplicates(subset=['VehicleData_Code', 'Chassis_Number']).copy()
        # df_sales_grouped_2 = df_sales_unique_chassis.groupby(['VehicleData_Code'])
        # df_sales['Average_DaysInStock_Global'] = df_sales_grouped_2['DaysInStock_Global'].transform('mean').round(3)

        # df_sales.to_csv('dbs/df_sales_importador_processed_{}.csv'.format(current_date))

        # Step 2: BI Processing
        # print('Number of unique Chassis: {} and number of rows: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]))
        df_sales = df_join_function(
            df_sales,
            df_pdb_dim[['VehicleData_Code'] + configuration_parameters_cols +
                       range_dates].set_index('VehicleData_Code'),
            on='VehicleData_Code',
            how='left')
        df_sales = update_new_gamas(df_sales, df_pdb_dim)

        df_sales = lowercase_column_conversion(df_sales,
                                               configuration_parameters_cols)

        # Filtering rows with no relevant information
        # print('1 - Number of unique Chassis: {} and number of rows: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]))
        # df_sales = df_sales[df_sales['NLR_Code'] == 702]  # Escolha de viaturas apenas Hyundai
        # log_record('1 - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['VehicleData_Code'] != 1]
        log_record(
            '2 - Remoção de Viaturas não parametrizadas - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
            .format(df_sales['Chassis_Number'].nunique(),
                    df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['Sales_Type_Dealer_Code'] != 'Demo']
        log_record(
            '3 - Remoção de Viaturas de Demonstração - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
            .format(df_sales['Chassis_Number'].nunique(),
                    df_sales.shape[0]), options_file.project_id)
        # df_sales = df_sales[df_sales['Sales_Type_Code_DMS'].isin(['RAC', 'STOCK', 'VENDA'])]
        # log_record('4 - Seleção de apenas Viaturas de RAC, Stock e Venda - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[~df_sales['Dispatch_Type_Code'].
                            isin(['AMBULÂNCIA', 'TAXI', 'PSP'])]
        log_record(
            '5 - Remoção de Viaturas Especiais (Ambulâncias, Táxis, PSP) - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
            .format(df_sales['Chassis_Number'].nunique(),
                    df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[
            df_sales['DaysInStock_Global'] >=
            0]  # Filters rows where, for some odd reason, the days in stock are negative
        log_record(
            '6 - Remoção de Viaturas com Dias em Stock Global negativos - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
            .format(df_sales['Chassis_Number'].nunique(),
                    df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[
            df_sales['Registration_Number'] !=
            'G.FORCE']  # Filters rows with the invalid 'G.FORCE' registration number
        log_record(
            '7 - Remoção de Viaturas com Matrículas Inválidas (G.Force) - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
            .format(df_sales['Chassis_Number'].nunique(),
                    df_sales.shape[0]), options_file.project_id)
        # df_sales = df_sales[df_sales['Customer_Group_Code'].notnull()]  # Filters rows where there is no client information;
        # log_record('8 - Remoção de Viaturas sem informação de cliente - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['DaysInStock_Distributor'].notnull()]
        log_record(
            '9 - Remoção de Viaturas sem informação de Dias em Stock - Distribuidor - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
            .format(df_sales['Chassis_Number'].nunique(),
                    df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['DaysInStock_Dealer'].notnull()]
        log_record(
            '10 - Remoção de Viaturas sem informação de Dias em Stock - Dealer - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
            .format(df_sales['Chassis_Number'].nunique(),
                    df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['PT_PDB_Model_Desc'] != 'não definido']
        log_record(
            '11 - Remoção de Viaturas sem informação de Modelo na PDB - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
            .format(df_sales['Chassis_Number'].nunique(),
                    df_sales.shape[0]), options_file.project_id)

        df_sales = new_features(df_sales, configuration_parameters_cols,
                                options_file.project_id)

        # Specific Measures Calculation
        df_sales = measures_calculation_hyundai(df_sales)

        # Fill values
        df_sales['Total_Discount_%'] = df_sales['Total_Discount_%'].replace(
            [np.inf, np.nan, -np.inf],
            0)  # Is this correct? This is caused by Total Sales = 0
        df_sales['Fixed_Margin_I_%'] = df_sales['Fixed_Margin_I_%'].replace(
            [np.inf, np.nan, -np.inf],
            0)  # Is this correct? This is caused by Total Net Sales = 0

        df_sales = lowercase_column_conversion(
            df_sales, configuration_parameters_cols
        )  # Lowercases the strings of these columns

        # df_sales = parameter_processing_hyundai(df_sales, options_file, configuration_parameters_cols)

        translation_dictionaries = [
            options_file.transmission_translation,
            options_file.ext_color_translation,
            options_file.int_color_translation
        ]
        # grouping_dictionaries = [options_file.motor_grouping, options_file.transmission_grouping, options_file.version_grouping, options_file.ext_color_grouping, options_file.int_color_grouping]

        # Parameter Translation
        # df_sales = col_group(df_sales, [x for x in configuration_parameters_cols if 'Model' not in x], translation_dictionaries, options_file.project_id)
        df_sales = col_group(df_sales, [
            'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Exterior_Color_Desc',
            'PT_PDB_Interior_Color_Desc'
        ], translation_dictionaries, options_file.project_id)
        df_sales = df_sales[
            df_sales['PT_PDB_Version_Desc'] != 'NÃO_PARAMETRIZADOS']
        log_record(
            '12 - Remoção de Viaturas sem versão parametrizada - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
            .format(df_sales['Chassis_Number'].nunique(),
                    df_sales.shape[0]), options_file.project_id)
        project_units_count_checkup(df_sales,
                                    'Chassis_Number',
                                    options_file,
                                    sql_check=1)

        # Parameter Grouping
        print('### NO GROUPING ###')
        # df_sales = col_group(df_sales, [x for x in configuration_parameters_cols if 'Model' not in x], grouping_dictionaries, options_file.project_id)

        log_record(
            'Contagem de VehicleData_Code únicos: {}'.format(
                df_sales['VehicleData_Code'].nunique()),
            options_file.project_id)
        df_sales_grouped_conf_cols = df_sales.groupby(
            configuration_parameters_cols)

        log_record(
            'Contagem de Configurações: {}'.format(
                len(df_sales_grouped_conf_cols)), options_file.project_id)

        # New VehicleData_Code Creation
        df_sales['ML_VehicleData_Code'] = df_sales.groupby(
            configuration_parameters_cols).ngroup()
        # df_sales.to_csv('dbs/df_hyundai_dataset_all_info_{}.csv'.format(current_date))

    log_record('Fim Secção B.', options_file.project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df_sales
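
The last processing step assigns ML_VehicleData_Code by grouping on the configuration columns and calling ngroup(), which gives each distinct configuration a dense integer id. A small self-contained demonstration of that pandas pattern, on toy data rather than the project's schema:

import pandas as pd

df = pd.DataFrame({
    'PT_PDB_Model_Desc': ['i20', 'i20', 'kauai', 'i20'],
    'PT_PDB_Engine_Desc': ['1.0', '1.0', '1.6', '1.2'],
})

# Every distinct (model, engine) configuration receives one integer code
df['ML_VehicleData_Code'] = df.groupby(
    ['PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc']).ngroup()
print(df)
# Rows 0 and 1 share a code; the 1.2 i20 and the kauai each get their own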