Example 1
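All examples below come from the same plotting module and rely on module-level setup that is not shown in this excerpt. A minimal sketch of that setup follows; the value of my_dpi and the body of save_fig are assumptions, not the original definitions (tick_function, used in Example 5, is sketched after that example). The wm.window.wm_geometry calls additionally assume matplotlib's TkAgg backend.

import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

my_dpi = 96  # assumed screen DPI; the original value is not shown


def save_fig(name, output_dir='output/'):
    # Hypothetical stand-in for the module's figure-saving helper.
    plt.savefig(output_dir + name + '.png', dpi=my_dpi)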
def sold_cars_evolution(input_dir):
    print('7 - Evolution of cars sold per month per model')

    df = pd.read_csv(input_dir + 'db_baviera_grouped2.csv',
                     index_col=0).dropna()
    # 42 months: 12 x 3 (2015-2017) + 6 (Jan-Jun 2018).
    total_cars_sold_per_month = [0] * 42

    plt.subplots(figsize=(1500 / my_dpi, 900 / my_dpi), dpi=my_dpi)
    models_list = list(df['Modelo_new'].unique())
    # models_list.remove('Outros')
    for model in models_list:
        number_cars_sold = []
        for year in [2015, 2016, 2017, 2018]:
            for month in range(1, 13):
                # The data ends at June 2018; skip the remaining 2018 months.
                if year == 2018 and month > 6:
                    continue
                df_month = df[(df['Modelo_new'] == model)
                              & (df['sell_year'] == year)
                              & (df['sell_month'] == month)]
                number_cars_sold.append(df_month.shape[0])
        # Elementwise running total of sales across all models.
        total_cars_sold_per_month = [
            total + sold
            for total, sold in zip(total_cars_sold_per_month, number_cars_sold)
        ]
        print(model, number_cars_sold)
        plt.plot(number_cars_sold, '-o', label=model)

    # Build 'Month Year' x-tick labels: 12 per year for 2015-2017, 6 for 2018.
    all_pairs = []
    for year in [15, 16, 17, 18]:
        for i, month in enumerate([
                'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
                'Oct', 'Nov', 'Dec'
        ], start=1):
            if year == 18 and i > 6:
                continue
            all_pairs.append(month + ' ' + str(year))

    plt.plot(total_cars_sold_per_month, '-o', label='Month Total')
    plt.grid()
    plt.legend()
    plt.vlines(x=[11.5, 23.5, 35.5], ymin=0, ymax=300, color='red')
    plt.ylabel('Number of Sold Cars')
    plt.xlabel('Month, Year')
    plt.title('Number of Sold Cars per model, per month')
    plt.tight_layout()
    plt.xticks(range(0, 42), all_pairs, rotation=30)
    wm = plt.get_current_fig_manager()
    wm.window.wm_geometry("-1500-10")
    save_fig('7_sold_cars_per_model_per_month_with_total')
    plt.show()
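A hypothetical call, assuming the grouped CSV lives under an output/ directory:

sold_cars_evolution('output/')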
Example 2
def feature_selection_evaluation(input_dir, features_min, features_max,
                                 feat_sel_metric, model):
    print('Feature Selection Performance Evaluation')

    f, ax = plt.subplots(2, figsize=(1400 / my_dpi, 1000 / my_dpi), dpi=my_dpi)
    recall_class_0, recall_class_1, accuracy, specificity = [], [], [], []
    for value in range(features_min, features_max + 1):
        cr = pd.read_csv(input_dir + 'class_report_classification_' + model +
                         '_target_score_class_scoring_recall_oversample_' +
                         str(value) + '_features_' + str(feat_sel_metric) +
                         '.csv',
                         index_col=0)
        pe = pd.read_csv(input_dir + 'performance_evaluation_classification_' +
                         model +
                         '_target_score_class_scoring_recall_oversample_' +
                         str(value) + '_features_' + str(feat_sel_metric) +
                         '.csv')
        recall_class_0.append(cr.loc['0.0', 'recall'])
        recall_class_1.append(cr.loc['1.0', 'recall'])
        accuracy.append(pe.loc[0, 'accuracy'])
        specificity.append(pe.loc[0, 'specificity'])

    ax[0].plot(recall_class_0, '-o', label='Recall Class 0')
    ax[0].plot(recall_class_1, '-o', label='Recall Class 1')
    ax[1].plot(accuracy, '-o', label='Accuracy')
    ax[1].plot(specificity, '-o', label='Specificity')
    ax[0].legend()
    ax[1].legend()

    ax[0].set_title('Feature Selection Evaluation - Criteria: ' +
                    feat_sel_metric)
    for graph in ax:
        graph.grid()
        graph.set_xlabel('Number of Selected Features')
        graph.set_ylabel('Performance')
        plt.sca(graph)
        plt.xticks(range(features_max - features_min + 1),
                   range(features_min, features_max + 1))
        plt.yticks(np.arange(0.0, 1.01, 0.05), np.arange(0.0, 1.01, 0.05))
        graph.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
        # Was hardcoded to [-1, 29]; computed here from the feature range.
        plt.xlim([-1, features_max - features_min + 1])

    plt.tight_layout()
    save_fig('6_feature_selection_evaluation_' + feat_sel_metric +
             '_mini_removed')
    wm = plt.get_current_fig_manager()
    wm.window.wm_geometry("-1500-10")
    plt.show()
Example 3
def performance_evaluation_plot_all(input_dir, targets):

    f, ax = plt.subplots(2,
                         2,
                         figsize=(1800 / my_dpi, 800 / my_dpi),
                         dpi=my_dpi)
    pos = [list(i) for i in itertools.product([0, 1], repeat=2)]
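    # pos == [[0, 0], [0, 1], [1, 0], [1, 1]]: one 2x2 grid cell per target
    # (assumes at most four targets).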
    for target in targets:
        none = pd.read_csv(input_dir + 'class_report_target_' + str(target) +
                           '.csv',
                           index_col=0)
        cols_grouped = pd.read_csv(input_dir + 'class_report_target_' +
                                   str(target) + '_group_cols.csv',
                                   index_col=0)
        cols_grouped_prov_and_type = pd.read_csv(
            input_dir + 'class_report_target_' + str(target) +
            '_group_cols_prov_and_type.csv',
            index_col=0)
        prov_and_type = pd.read_csv(input_dir + 'class_report_target_' +
                                    str(target) + '_prov_and_type.csv',
                                    index_col=0)

        acc_none = pd.read_csv(input_dir + 'performance_evaluation_target_' +
                               str(target) + '.csv')
        acc_cols_grouped = pd.read_csv(input_dir +
                                       'performance_evaluation_target_' +
                                       str(target) + '_group_cols.csv')
        acc_cols_grouped_prov_and_type = pd.read_csv(
            input_dir + 'performance_evaluation_target_' + str(target) +
            '_group_cols_prov_and_type.csv')
        acc_prov_and_type = pd.read_csv(input_dir +
                                        'performance_evaluation_target_' +
                                        str(target) + '_prov_and_type.csv')

        class_reports = [
            none, cols_grouped, prov_and_type, cols_grouped_prov_and_type
        ]
        acc = [
            acc_none, acc_cols_grouped, acc_prov_and_type,
            acc_cols_grouped_prov_and_type
        ]

        precision, recall, f1_score, accuracy, micro, macro = [], [], [], [], [], []
        for group, acc_df in zip(class_reports, acc):
            precision.append(group.loc['avg/total', 'precision'])
            recall.append(group.loc['avg/total', 'recall'])
            f1_score.append(group.loc['avg/total', 'f1-score'])
            # Assumes one row per evaluation CSV, as elsewhere in the module.
            micro.append(acc_df.loc[0, 'micro_f1'])
            macro.append(acc_df.loc[0, 'macro_f1'])
            accuracy.append(acc_df.loc[0, 'accuracy'])

        # Select this target's panel in the 2x2 grid.
        row, col = pos[targets.index(target)]
        ax[row, col].plot(precision, label='Precision')
        ax[row, col].plot(recall, label='Recall')
        ax[row, col].plot(f1_score, label='F1 Score')
        ax[row, col].plot(accuracy, label='Accuracy')
        plt.sca(ax[row, col])
        plt.xticks(range(0, 4), [
            'Standard', 'Grouping Columns', 'w/Prov and Tipo Enc',
            'Grouping Cols and w/Prov and Tipo Enc'
        ])
        plt.title('Target - ' + str(target))

    for axis in ax.flat:
        axis.legend()
        axis.grid()
        axis.set_xlabel('Approach Type')
        axis.set_ylabel('Performance')

    plt.tight_layout()
    wm = plt.get_current_fig_manager()
    wm.window.wm_geometry("-1500-100")
    save_fig('2_performance_evaluation_target_all_targets')
    plt.show()
Example 4
def performance_evaluation_plot(input_dir):
    print('Performance Evaluation Plot')
    target_1 = 'score_class'
    target_2 = 'stock_class1'
    target_3 = 'stock_class2'
    target_4 = 'margem_class1'

    # One classification report and one performance evaluation per target.
    cr_1 = pd.read_csv(input_dir + 'class_report_target_' + str(target_1) +
                       '.csv',
                       index_col=0)
    cr_2 = pd.read_csv(input_dir + 'class_report_target_' + str(target_2) +
                       '.csv',
                       index_col=0)
    cr_3 = pd.read_csv(input_dir + 'class_report_target_' + str(target_3) +
                       '.csv',
                       index_col=0)
    cr_4 = pd.read_csv(input_dir + 'class_report_target_' + str(target_4) +
                       '.csv',
                       index_col=0)

    acc_1 = pd.read_csv(input_dir + 'performance_evaluation_target_' +
                        str(target_1) + '.csv')
    acc_2 = pd.read_csv(input_dir + 'performance_evaluation_target_' +
                        str(target_2) + '.csv')
    acc_3 = pd.read_csv(input_dir + 'performance_evaluation_target_' +
                        str(target_3) + '.csv')
    acc_4 = pd.read_csv(input_dir + 'performance_evaluation_target_' +
                        str(target_4) + '.csv')

    # Ordered target_1..target_4 to match the x-tick labels below.
    class_reports = [cr_1, cr_2, cr_3, cr_4]
    acc = [acc_1, acc_2, acc_3, acc_4]

    precision, recall, f1_score, accuracy, micro, macro = [], [], [], [], [], []
    for group, acc_df in zip(class_reports, acc):
        precision.append(group.loc['avg/total', 'precision'])
        recall.append(group.loc['avg/total', 'recall'])
        f1_score.append(group.loc['avg/total', 'f1-score'])
        # Assumes one row per evaluation CSV, as elsewhere in the module.
        micro.append(acc_df.loc[0, 'micro_f1'])
        macro.append(acc_df.loc[0, 'macro_f1'])
        accuracy.append(acc_df.loc[0, 'accuracy'])

    plt.subplots(figsize=(1400 / my_dpi, 800 / my_dpi), dpi=my_dpi)
    plt.plot(precision, label='Precision')
    plt.plot(recall, label='Recall')
    plt.plot(f1_score, label='F1 Score')
    plt.plot(accuracy, label='Accuracy')
    plt.xticks(
        range(0, 4),
        ['Score Class', 'stock_class1', 'stock_class2', 'margem_class1'])
    plt.tight_layout()
    plt.grid()
    plt.legend()
    plt.xlabel('Target Class')
    plt.ylabel('Performance')
    plt.title('Classification Performance vs Target Class')
    wm = plt.get_current_fig_manager()
    wm.window.wm_geometry("-1500-100")
    save_fig('performance_evaluation_classification')
    plt.show()
Example 5
def score_distribution_alt():
    # Flags selecting which of the two plots below to draw.
    stock_days = 1
    margin = 0
    # f, ax = plt.subplots(figsize=(1400 / my_dpi, 800 / my_dpi), dpi=my_dpi)

    dtypes = {
        'Modelo': str,
        'Prov': str,
        'Local da Venda': str,
        'Margem': float,
        'Navegação': int,
        'Sensores': int,
        'Cor_Interior': str,
        'Caixa Auto': int,
        'Cor_Exterior': str,
        'Jantes': str,
        'stock_days': int,
        'margem_percentagem': float
    }
    cols_to_use = [
        'Unnamed: 0', 'Modelo', 'Prov', 'Local da Venda', 'Cor_Interior',
        'Cor_Exterior', 'Navegação', 'Sensores', 'Caixa Auto', 'Jantes',
        'stock_days', 'Margem', 'margem_percentagem'
    ]

    # df = pd.read_csv('output/' + 'db_baviera_stock_optimization.csv', usecols=cols_to_use, encoding='utf-8', delimiter=',', index_col=0, dtype=dtypes)
    df = pd.read_csv('output/' + 'db_full_baviera.csv',
                     usecols=cols_to_use,
                     encoding='utf-8',
                     delimiter=',',
                     index_col=0,
                     dtype=dtypes)

    df['stock_days_norm'] = (df['stock_days'] - df['stock_days'].min()) / (
        df['stock_days'].max() - df['stock_days'].min())
    df['inv_stock_days_norm'] = 1 - df['stock_days_norm']

    df['margem_percentagem_norm'] = (
        df['margem_percentagem'] - df['margem_percentagem'].min()) / (
            df['margem_percentagem'].max() - df['margem_percentagem'].min())
    df['score'] = df['inv_stock_days_norm'] * df['margem_percentagem_norm']
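    # The score multiplies the two normalized columns: a car that sold quickly
    # (inv_stock_days_norm near 1) at a high margin (margem_percentagem_norm
    # near 1) scores near 1. E.g. minimum stock days and maximum margin give
    # score = (1 - 0) * 1 = 1; the opposite extreme gives 0.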

    # plt.plot(df['stock_days'], range(df.shape[0]), 'o')

    # hist, bin_edges = np.histogram(df['stock_days'], bins=np.arange(0, 1000, 100))
    # print(hist, '\n', bin_edges[:-1])
    # plt.bar(bin_edges[:-1], hist, width=5)

    if stock_days:
        # n, bin_edge = np.histogram(df['stock_days'], bins=np.arange(0, 1000, 10))
        # bincenters = 0.5 * (bin_edge[1:] + bin_edge[:-1])
        fig = plt.figure(figsize=(1400 / my_dpi, 800 / my_dpi), dpi=my_dpi)
        ax1 = fig.add_subplot(111)
        ax2 = ax1.twiny()

        new_tick_locations = np.array([
            df['stock_days'].min(), 10, 30, 45, 60, 90, 120, 150, 180, 200,
            300, 400, 500, 600, 700, 800, 900, df['stock_days'].max()
        ])
        # ax1.plot(bincenters, n, '-')
        # Scatter of each car's stock_days against its row index.
        ax1.plot(df['stock_days'], range(df.shape[0]), 'o')
        ax1.set_xlabel('Stock Days')
        ax1.set_xticks(new_tick_locations)
        ax1.set_xticklabels(new_tick_locations, rotation=30)

        ax2.set_xlim(ax1.get_xlim())
        ax2.set_xticks(new_tick_locations)
        ax2.set_xticklabels(tick_function(new_tick_locations, stock_days,
                                          margin),
                            rotation=30)
        ax2.set_xlabel('Stock Days Normalized')

        ax1.set_ylabel('Absolute Frequency')
        plt.axvline(x=45, color='red')
        wm = plt.get_current_fig_manager()
        wm.window.wm_geometry("-1500-100")
        plt.grid()
        plt.tight_layout()
        save_fig('4a_score_distribution_stock_days')
        plt.show()

    if margin:
        fig = plt.figure(figsize=(1400 / my_dpi, 800 / my_dpi), dpi=my_dpi)
        ax1 = fig.add_subplot(111)
        ax2 = ax1.twiny()

        # new_tick_locations = np.array([df['margem_percentagem'].min(), -0.5, 0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, df['margem_percentagem'].max()])
        new_tick_locations = np.array([
            df['margem_percentagem'].min(), -10, -5, 0, 3.5, 5, 10,
            df['margem_percentagem'].max()
        ])

        # ax1.plot(bincenters, n, '-')
        ax1.plot(df['margem_percentagem'], range(df.shape[0]), 'o')
        ax1.set_xlabel('Margin')
        ax1.set_xticks(new_tick_locations)
        ax1.set_xticklabels(new_tick_locations, rotation=30)

        ax2.set_xlim(ax1.get_xlim())
        ax2.set_xticks(new_tick_locations)
        ax2.set_xticklabels(tick_function(new_tick_locations, stock_days,
                                          margin),
                            rotation=30)
        ax2.set_xlabel('Margin Normalized')

        ax1.set_ylabel('Absolute Frequency')
        plt.axvline(x=3.5, color='red')
        wm = plt.get_current_fig_manager()
        wm.window.wm_geometry("-1500-100")
        plt.grid()
        plt.tight_layout()
        save_fig('4a_score_distribution_margin')
        plt.show()
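Example 5 labels its secondary axis through tick_function, whose definition is not part of this excerpt. A minimal sketch consistent with how it is called (an assumption, not the original implementation; the stock_days and margin flags are accepted but unused here):

def tick_function(ticks, stock_days, margin):
    # Map raw tick positions to min-max normalized [0, 1] labels, mirroring
    # the normalization applied to the dataframe columns above.
    ticks = np.asarray(ticks, dtype=float)
    normed = (ticks - ticks.min()) / (ticks.max() - ticks.min())
    return ['%.2f' % v for v in normed]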
Example 6
def classification_report_plot(input_dir, oversample, score, target):

    # models = ['dt', 'rf', 'lr', 'knn', 'svm', 'ab', 'gc', 'voting']
    models = ['dt', 'rf', 'lr', 'ab', 'gc', 'bayes', 'neural', 'voting']
    # models_name = ['Dec. Tree', 'Rand. Forest', 'Log Reg', 'KNN', 'SVM', 'Adaboost', 'Gradient', 'Voting']
    models_name = [
        'Dec. Tree', 'Rand. Forest', 'Log Reg', 'AdaBoost', 'Gradient',
        'Bayes', 'ANN', 'Voting'
    ]
    # Kept for the commented-out per-criteria variant of this plot.
    criterium = ['chi2', 'f_classif', 'mutual_info_classif']
    # for criteria in criterium:
    f, ax = plt.subplots(3, figsize=(1400 / my_dpi, 1000 / my_dpi), dpi=my_dpi)
    for model, model_label in zip(models, models_name):
        if oversample:
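            # Note: these oversampled file names hardcode target score_class,
            # while the save_fig call at the end uses the target argument.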
            cr = pd.read_csv(input_dir + 'class_report_classification_' +
                             str(model) + '_target_score_class_scoring_' +
                             str(score) + '_oversample.csv',
                             index_col=0)
            pe = pd.read_csv(input_dir +
                             'performance_evaluation_classification_' +
                             str(model) + '_target_score_class_scoring_' +
                             str(score) + '_oversample.csv')
            # cr = pd.read_csv(input_dir + 'class_report_classification_' + str(model) + '_target_score_class_scoring_' + str(score) + '_oversample_10_features_' + str(criteria) + '.csv', index_col=0)
            # pe = pd.read_csv(input_dir + 'performance_evaluation_classification_' + str(model) + '_target_score_class_scoring_' + str(score) + '_oversample_10_features_' + str(criteria) + '.csv')
        else:
            cr = pd.read_csv(input_dir + 'class_report_classification_' +
                             str(model) + '_target_' + str(target) +
                             '_scoring_' + str(score) + '.csv',
                             index_col=0)
            pe = pd.read_csv(input_dir +
                             'performance_evaluation_classification_' +
                             str(model) + '_target_' + str(target) +
                             '_scoring_' + str(score) + '.csv')

        ax[0].plot(pe.loc[0, :][:-1].values)
        ax[1].plot(cr.loc['avg/total', :][:-1].values, label=model_label)
        ax[2].plot(cr.loc[:, 'recall'][:-1].values)

    ax[1].legend()
    for graph in ax:
        graph.grid()
        graph.set_xlabel('Metrics')
        graph.set_ylabel('Performance')

    plt.sca(ax[0])
    plt.xticks(range(4), ['Micro F1', 'Macro F1', 'Accuracy', 'Specificity'])
    plt.yticks(np.arange(0.0, 1.01, 0.05), np.arange(0.0, 1.01, 0.05))
    ax[0].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

    plt.sca(ax[1])
    plt.xticks(range(3), ['Precision', 'Recall/Sensitivity', 'F1 Score'])
    plt.yticks(np.arange(0.0, 1.01, 0.05), np.arange(0.0, 1.01, 0.05))
    ax[1].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

    plt.sca(ax[2])
    plt.xticks(range(2), ['Recall - Class 0', 'Recall - Class 1'])
    plt.yticks(np.arange(0.0, 1.01, 0.05), np.arange(0.0, 1.01, 0.05))
    ax[2].yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    # plt.ylim([0.2, 1.0])

    plt.tight_layout()

    if oversample:
        ax[0].set_title('Model Performance - Oversampled')
        save_fig('5_classification_performance_target_' + str(target) +
                 '_scoring_' + str(score) + '_oversampled')
        # ax[0].set_title('Model Performance - Oversampled - ' + str(criteria))
        # save_fig('5_classification_performance_' + str(score) + '_oversampled_' + str(criteria))
    else:
        ax[0].set_title('Model Performance')
        save_fig('5_classification_performance_target_' + str(target) +
                 '_scoring_' + str(score))
        # ax[0].set_title('Model Performance - ' + str(criteria))
        # save_fig('5_classification_performance_' + str(score) + '_' + str(criteria))

    wm = plt.get_current_fig_manager()
    wm.window.wm_geometry("-1500-10")
    plt.show()