コード例 #1
0
ファイル: dspPlots.py プロジェクト: ToinoMF/datascience
    def plot_forest_features(self,
                             drop: bool = True,
                             norm: bool = False,
                             thresholds: list = None,
                             idx: int = -1):
        """
        Draw random-forest accuracy charts, one subplot row per threshold.

        Every row holds two charts, one for each ``max_features`` setting
        ('sqrt' and 'log2'), with a fixed list of estimator counts on the
        x axis. The series of a row come from a single call to
        ``self.compute_forest_features``.

        :param drop: forwarded unchanged to ``compute_forest_features``.
        :param norm: forwarded unchanged to ``compute_forest_features``.
        :param thresholds: thresholds to plot (sorted before use);
            ``[1]`` when omitted.
        :param idx: forwarded unchanged to ``compute_forest_features``.
        :return: --- <class 'NoneType'>
        """
        thresholds = [1] if thresholds is None else sorted(thresholds)

        feature_modes = ('sqrt', 'log2')
        estimator_counts = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300]

        _, axes = plt.subplots(len(thresholds),
                               2,
                               figsize=(10, 4),
                               squeeze=False)

        for row, threshold in enumerate(thresholds):
            # One computation per threshold feeds both charts of the row.
            results = self.compute_forest_features(drop, norm, threshold, idx)
            for col, mode in enumerate(feature_modes):
                heading = ('Random Forests with %s features' % mode +
                           '[T=' + str(threshold) + ']')
                chart_layout = pF.LayoutStyleObject(
                    title=heading,
                    xlabel='Number of estimators',
                    ylabel='Accuracy',
                    grid=True)
                pF.multiple_line_chart(axes[row, col],
                                       estimator_counts,
                                       results[col],
                                       layout=chart_layout,
                                       percentage=True)
        plt.show()
コード例 #2
0
ファイル: dspPlots.py プロジェクト: ToinoMF/datascience
 def plot_histogram(self, col_index: list = None, n_graphs: int = 5):
     """
     Draw one histogram per selected column, laid out on a grid of subplots.

     :param col_index: positional column indexes into ``self.data``;
         ``[2, 3, 4, 5, 6]`` when omitted.
     :param n_graphs: handed to ``pF.choose_grid`` (together with
         ``len(columns) - 1``) to pick the grid shape; also used as the
         per-cell scale of the figure size.
     :return: --- <class 'NoneType'>
     """
     self.process_timer = time.time()
     selected = [2, 3, 4, 5, 6] if col_index is None else col_index
     names = self.data.iloc[:, selected].columns
     n_rows, n_cols = pF.choose_grid(len(names) - 1, n_graphs)
     _, axes = plt.subplots(n_rows,
                            n_cols,
                            figsize=(n_cols * n_graphs, n_rows * n_graphs),
                            squeeze=False)
     for position, name in enumerate(names):
         # Fill the grid row by row, left to right.
         row, col = divmod(position, n_cols)
         chart_layout = pF.LayoutStyleObject(title='Histogram for %s' % name,
                                             xlabel='',
                                             ylabel='',
                                             grid=True)
         pF.histogram(axes[row, col], self.data[name], layout=chart_layout)
     plt.show()
     self.get_timer("plot_histogram")
コード例 #3
0
ファイル: dspPlots.py プロジェクト: ToinoMF/datascience
    def plot_knn_var_threshold(self,
                               drop: bool = True,
                               norm: bool = False,
                               thresholds: list = None,
                               n_iter: int = 20,
                               parity: bool = False,
                               smote: bool = False):
        """
        Plot the KNN accuracy as a function of n.

        With ``drop`` set, one subplot is drawn per threshold on a grid of at
        most five columns; the grid grows by whole rows, and cells left over
        in the last row are hidden. Without ``drop`` a single chart is drawn.

        :param drop: selects the per-threshold grid (True) or the single
            chart (False); also forwarded to ``compute_knn``.
        :param norm: forwarded unchanged to ``compute_knn``.
        :param thresholds: thresholds to plot (sorted before use); ten values
            from 0.81 to 0.99 when omitted.
        :param n_iter: ``round(n_iter / 2)`` values of n are plotted.
        :param parity: plot even n (2, 4, ...) when True, odd n (1, 3, ...)
            otherwise.
        :param smote: forwarded unchanged to ``compute_knn``.
        :return: --- <class 'NoneType'>
        """
        if thresholds is None:
            thresholds = [
                0.81, 0.83, 0.85, 0.87, 0.89, 0.91, 0.93, 0.95, 0.97, 0.99
            ]
        else:
            thresholds = sorted(thresholds)

        first_n = 2 if parity else 1
        nvalues = [2 * i + first_n for i in range(round(n_iter / 2))]

        if not drop:
            plt.figure()
            # n_iter / parity / smote are forwarded so the computed series
            # match `nvalues` instead of whatever compute_knn defaults to.
            # NOTE(review): every other chart call in this file passes
            # layout=pF.LayoutStyleObject(...) rather than title/xlabel/ylabel
            # keywords, and `thresholds` is passed here as a whole list --
            # confirm both against pF.multiple_line_chart / compute_knn.
            pF.multiple_line_chart(plt.gca(),
                                   nvalues,
                                   self.compute_knn(drop,
                                                    norm,
                                                    thresholds,
                                                    n_iter,
                                                    parity,
                                                    smote=smote),
                                   title='KNN variants',
                                   xlabel='n',
                                   ylabel='accuracy',
                                   percentage=True)
            plt.show()
            return

        n_plots = len(thresholds)
        if n_plots == 0:
            # Nothing to draw; the previous grid arithmetic divided by zero.
            return

        # Up to five charts per row, with enough rows to hold every
        # threshold (ceil division). The old int(x / 5) floored the row
        # count, which broke for any count that is not a multiple of five.
        n_cols = min(n_plots, 5)
        n_rows = (n_plots + n_cols - 1) // n_cols
        fig, axs = plt.subplots(n_rows,
                                n_cols,
                                figsize=(15, 6),
                                squeeze=False)
        fig.subplots_adjust(hspace=.5, wspace=.001)
        axs = axs.ravel()

        for j, threshold in enumerate(thresholds):
            axs[j].set_title('n')
            layout = pF.LayoutStyleObject(
                title='KNN variants - threshold ' + str(threshold),
                xlabel='n',
                ylabel='accuracy',
                grid=True)
            pF.multiple_line_chart(axs[j],
                                   nvalues,
                                   self.compute_knn(drop,
                                                    norm,
                                                    threshold,
                                                    n_iter,
                                                    parity,
                                                    smote=smote),
                                   layout=layout,
                                   percentage=True,
                                   legends=True)

        # Hide the grid cells of the last row that received no chart.
        for spare in axs[n_plots:]:
            spare.set_visible(False)

        fig.tight_layout()
        plt.show()
コード例 #4
0
ファイル: dspPlots.py プロジェクト: ToinoMF/datascience
 def plot_num_var_threshold(self,
                            tmin: float = 0,
                            tmax: float = 1,
                            step: float = 0.01):
     """
     Plot the number of variables against the drop threshold.

     ``self.compute_num_var_threshold`` is called for each ``idx`` from 0
     to 7 with the given range, and every resulting curve is drawn on the
     axes of one shared figure.

     :param tmin: lower threshold bound, forwarded to the computation.
     :param tmax: upper threshold bound, forwarded to the computation.
     :param step: threshold step, forwarded to the computation.
     :return: --- <class 'NoneType'>
     """
     plt.figure(figsize=(7, 5))
     for curve_idx in range(8):
         print(curve_idx)
         curve = self.compute_num_var_threshold(tmin=tmin,
                                                tmax=tmax,
                                                step=step,
                                                idx=curve_idx)
         target_axes = plt.gca()
         chart_layout = pF.LayoutStyleObject(
             title='Number of Variables as a function of the Threshold',
             xlabel='Threshold',
             ylabel='Number of Variables',
             grid=True)
         line_style = pF.PlotStyleObject(color='Blue', marker='o', alpha=0.5)
         pF.single_line_chart(target_axes,
                              curve[0],
                              curve[1],
                              layout=chart_layout,
                              plotstyle=line_style)
     plt.show()
コード例 #5
0
    def plot_tree_criteria(self,
                           drop: bool = True,
                           norm: bool = False,
                           thresholds: list = None,
                           idx: int = -1):
        """
        Plot decision-tree accuracy for the 'entropy' and 'gini' criteria.

        One subplot row is drawn per threshold, with one chart per criterion.
        The x axis carries the ``min_samples_leaf`` values and the series of
        a row come from a single call to ``self.compute_tree_criteria``.

        :param drop: forwarded unchanged to ``compute_tree_criteria``.
        :param norm: forwarded unchanged to ``compute_tree_criteria``.
        :param thresholds: thresholds to plot (sorted before use);
            ``[1]`` when omitted.
        :param idx: forwarded unchanged to ``compute_tree_criteria``.
        :return: --- <class 'NoneType'>
        """
        if thresholds is None:
            thresholds = [1]
        else:
            thresholds = sorted(thresholds)

        criteria = ['entropy', 'gini']
        min_samples_leaf = [.10]

        fig, axs = plt.subplots(len(thresholds),
                                2,
                                figsize=(10, 4),
                                squeeze=False)
        for i, threshold in enumerate(thresholds):
            # The result does not depend on the criterion index, so compute
            # it once per threshold instead of once per chart (same pattern
            # as plot_forest_features).
            results = self.compute_tree_criteria(drop, norm, threshold, idx)
            for k, criterion in enumerate(criteria):
                layout = pF.LayoutStyleObject(
                    title='Decision Trees with %s criteria' % criterion +
                    '[T=' + str(threshold) + ']',
                    # The x values are min_samples_leaf; the former label
                    # 'Number of estimators' was copied from the forest plot.
                    xlabel='Minimum samples per leaf',
                    ylabel='Accuracy',
                    grid=True)
                pF.multiple_line_chart(axs[i, k],
                                       min_samples_leaf,
                                       results[k],
                                       layout=layout,
                                       percentage=True)
        plt.show()
コード例 #6
0
ファイル: dataSetPD.py プロジェクト: ToinoMF/datascience
def draw_plots_none(var_x, var_y, colors):
    """
    Plot several columns of the module-level ``data`` against one x variable.

    Each entry of ``var_y`` becomes one series, keyed '0', '1', ... and
    styled with round markers and zero line width in the matching entry of
    ``colors``. When ``var_x`` is the literal 'ID', the index of each
    y column is used as its x values; otherwise the ``var_x`` column is.

    :param var_x: name of the x column, or 'ID' to plot against the index.
    :param var_y: sequence of y column names.
    :param colors: sequence of colors, indexed in parallel with ``var_y``.
    :return: --- <class 'NoneType'>
    """
    joined_names = '/'.join(var_y)

    # Layout: title, x label and y label, with the grid enabled.
    layout = pF.LayoutStyleObject(joined_names + ' vs ' + var_x,
                                  var_x,
                                  joined_names,
                                  grid=True)

    positions = range(len(var_y))

    # One style object per series.
    plot_style = {
        str(k): pF.PlotStyleObject(color=colors[k],
                                   legend=var_y[k],
                                   marker='o',
                                   markersize=5,
                                   linewidth=0)
        for k in positions
    }

    # X data per series.
    if var_x == 'ID':
        xvalues = {str(k): data[var_y[k]].index.values for k in positions}
    else:
        xvalues = {str(k): data[var_x].values for k in positions}

    # Y data per series.
    yvalues = {str(k): data[var_y[k]].values for k in positions}

    plt.figure(figsize=(13, 6))
    pF.multiple_plots(plt.gca(),
                      xvalues,
                      yvalues,
                      layout,
                      plot_style,
                      legends=True)
    plt.show()
コード例 #7
0
ファイル: dspPlots.py プロジェクト: ToinoMF/datascience
    def plot_naive_bayes_var_threshold(self,
                                       drop: bool = True,
                                       norm: bool = False,
                                       thresholds: list = None,
                                       smote: bool = False):
        """
        Plot Naive Bayes accuracy as a bar chart over the given thresholds.

        One series is drawn per variant: Gaussian and Bernoulli always, plus
        Multinomial when ``norm`` is set. The values are read from element
        ``[1]`` of the result of ``self.compute_naive_bayes``, called once
        per threshold.

        :param drop: forwarded unchanged to ``compute_naive_bayes``.
        :param norm: forwarded to ``compute_naive_bayes``; also enables the
            Multinomial series.
        :param thresholds: thresholds to plot (sorted before use); ten values
            from 0.81 to 0.99 when omitted.
        :param smote: forwarded unchanged to ``compute_naive_bayes``.
        :return: --- <class 'NoneType'>
        """
        if thresholds is None:
            thresholds = [
                0.81, 0.83, 0.85, 0.87, 0.89, 0.91, 0.93, 0.95, 0.97, 0.99
            ]
        else:
            thresholds = sorted(thresholds)

        if norm:
            x_vals = ['Gaussian', 'Multinomial', 'Bernouly']
        else:
            x_vals = ['Gaussian', 'Bernouly']
        # One value list per variant, in the same order as x_vals.
        y_vals = [[] for _ in x_vals]

        for threshold in thresholds:
            scores = self.compute_naive_bayes(drop,
                                              norm,
                                              threshold,
                                              smote=smote)[1]
            for position, series in enumerate(y_vals):
                series.append(scores[position])

        plt.figure()
        chart_layout = pF.LayoutStyleObject(
            title='Naive Bayes accuracy as function of threshold',
            xlabel='Threshold',
            ylabel='Accuracy',
            grid=True)
        pF.multiple_bar_chart(plt.gca(),
                              thresholds,
                              y_vals,
                              x_vals,
                              layout=chart_layout,
                              legends=True,
                              percentage=True)
        plt.show()