Esempio n. 1
0
def chi_2_heatmap(df,
                  X,
                  Y,
                  save_as_img=False,
                  title='Chi-2 contingency table'):
    """Draw a heatmap of each cell's share of the chi-2 statistic of X vs Y.

    The contingency table comes from ``contingency_table(df, X, Y)`` and is
    addressed through its "Total" row and "Total" column labels. The last
    row and last column (presumably those totals -- confirm against
    ``contingency_table``) are left out of the drawing.

    Args:
        df: data set; its length is used as the sample size.
        X: name forwarded to ``contingency_table``; used as the y-axis label.
        Y: name forwarded to ``contingency_table``; used as the x-axis label.
        save_as_img: when true, save the figure as
            ``<X>_<Y>_chi2_heatmap.jpg`` in the working directory.
        title: title of the plot.
    """
    observed = contingency_table(df, X, Y)
    sample_size = len(df)

    # Marginal totals, kept two-dimensional so they can be matrix-multiplied.
    row_totals = observed.loc[:, ["Total"]]
    col_totals = observed.loc[["Total"], :]

    # Counts expected under independence of X and Y.
    expected = row_totals.dot(col_totals) / sample_size
    counts = observed.fillna(0)

    # Per-cell contribution to the statistic, then normalised by their sum.
    contributions = (counts - expected)**2 / expected
    overall = contributions.sum().sum()
    shares = contributions / overall

    # Colour by share of the statistic, annotate with the observed counts.
    inner = slice(None, -1)
    ax = sns.heatmap(shares.iloc[inner, inner],
                     annot=counts.iloc[inner, inner],
                     cmap='Blues',
                     fmt='g')
    ax.set_title(title, pad=20)
    ax.set_xlabel(Y, labelpad=20)
    ax.set_ylabel(X, labelpad=20)

    if not save_as_img:
        return
    plt.tight_layout()
    plt.savefig('{}_{}_chi2_heatmap.jpg'.format(X, Y))
Esempio n. 2
0
def plot_confusion_matrix(cm, class_names):
    """
    Returns a matplotlib figure containing the plotted confusion matrix.

    Each cell is coloured by its raw count and annotated with the count
    divided by its row total, rounded to two decimals. Rows whose total is
    zero are annotated with 0.0 instead of NaN.

    Args:
        cm (array, shape = [n, n]): a confusion matrix of integer classes
        class_names (array, shape = [n]): String names of the integer classes

    Returns:
        The matplotlib figure the matrix was drawn on.
    """
    size = len(class_names)
    figure = plt.figure(figsize=(size, size))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion matrix")
    plt.colorbar()
    tick_marks = np.arange(size)
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    # Compute the labels from the row-normalized confusion matrix.
    row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
    # An empty row is all zeros; dividing by 1 keeps it at 0 rather than
    # producing NaN labels and a divide-by-zero warning.
    row_totals[row_totals == 0] = 1.0
    labels = np.around(cm.astype('float') / row_totals, decimals=2)

    # Use white text if squares are dark; otherwise black.
    threshold = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        color = "white" if cm[i, j] > threshold else "black"
        plt.text(j, i, labels[i, j], horizontalalignment="center", color=color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Fit the layout last so that the axis labels set above are included.
    plt.tight_layout()
    return figure
Esempio n. 3
0
def plot_residuals(df, x, y, save_as_img=False):
    """Plot the residuals of regressing ``y`` on ``x`` with a lowess curve.

    Args:
        df: data set handed to seaborn as ``data``.
        x: name of the predictor; also used as the x-axis label.
        y: name of the response; also used as the y-axis label.
        save_as_img: when true, save the figure as ``<x>_<y>_residplot.jpg``
            in the working directory.
    """
    # x and y are passed by keyword: recent seaborn releases no longer
    # accept them positionally, while older releases accept both forms.
    ax = sns.residplot(x=x, y=y, data=df, lowess=True)
    ax.set_xlabel(x, labelpad=20)
    ax.set_ylabel(y, labelpad=20)
    ax.set_title('Residuals dispersion', pad=20)
    if save_as_img:
        plt.tight_layout()
        plt.savefig('{}_{}_residplot.jpg'.format(x, y))
Esempio n. 4
0
def plot_linear_regression(df, x, y, save_as_img=False):
    """Scatter ``y`` against ``x`` with seaborn's fitted regression line.

    Args:
        df: data set handed to seaborn as ``data``.
        x: name of the predictor; also used as the x-axis label.
        y: name of the response; also used as the y-axis label.
        save_as_img: when true, save the figure as ``<x>_<y>_linear_reg.jpg``
            in the working directory.
    """
    # x and y are passed by keyword: recent seaborn releases no longer
    # accept them positionally, while older releases accept both forms.
    ax = sns.regplot(x=x, y=y, data=df)
    ax.set_xlabel(x, labelpad=20)
    ax.set_ylabel(y, labelpad=20)
    ax.set_title('Linear regression ({} & {})'.format(x, y), pad=20)
    if save_as_img:
        plt.tight_layout()
        plt.savefig('{}_{}_linear_reg.jpg'.format(x, y))
Esempio n. 5
0
 def plot_dendrogram(self,
                     plot_size=(10, 25),
                     title_pad=20,
                     xlabel_pad=20,
                     orient='left',
                     leaf_text_size=16,
                     save_as_img=False,
                     filename='cah',
                     file_type='jpg'):
     """Draw the dendrogram of ``self.Z`` and display it.

     The value returned by ``dendrogram`` is stored on
     ``self.dendrogram_data``.

     :param plot_size: figure size handed to ``plt.figure``.
     :param title_pad: padding of the figure title.
     :param xlabel_pad: padding of the x-axis label.
     :param orient: ``orientation`` argument of ``dendrogram``.
     :param leaf_text_size: ``leaf_font_size`` argument of ``dendrogram``.
     :param save_as_img: when true, save the figure before showing it.
     :param filename: output file name, without extension.
     :param file_type: output file extension.
     """
     plt.figure(figsize=plot_size)
     plt.xlabel('distance', labelpad=xlabel_pad)
     plt.title('Hierarchical Clustering Dendrogram', pad=title_pad)

     tree = dendrogram(self.Z,
                       orientation=orient,
                       labels=self.categories,
                       leaf_font_size=leaf_text_size)
     self.dendrogram_data = tree

     if save_as_img:
         target = f'{filename}.{file_type}'
         plt.tight_layout()
         plt.savefig(target)
     plt.show()
Esempio n. 6
0
    def scree_plot(self,
                   threshold=None,
                   save_as_img=False):  # (% Explained Variance)
        """
        Draw a scree plot: one bar per entry of ``self.evr`` (scaled by 100)
        with the cumulative curve in red, then show it without blocking.

        :param threshold: optional fraction of the total of ``self.evr``.
            When given, the plot marks the first rank whose cumulative share
            strictly exceeds it. If no rank strictly exceeds a threshold
            that is at most 1 (e.g. ``threshold=1.0``), every rank is
            counted as required.
        :param save_as_img: when true, save the figure as ``scree.jpg`` in
            the working directory.
        :raises ValueError: if ``threshold`` is greater than 1 (or NaN), so
            that no rank can reach it.
        """
        scree = self.evr * 100
        ranks = np.arange(len(scree)) + 1
        plt.bar(ranks, scree)

        curve_style = {'c': "red", 'marker': 'o'}
        if threshold is not None:
            scree_cumsum = np.cumsum(scree / scree.sum())
            # Number of features needed for threshold cumulative importance
            reached = np.where(scree_cumsum > threshold)[0]
            if reached.size:
                n_features = reached[0] + 1
            elif threshold <= 1:
                # The normalised cumulative sum tops out at ~1.0 and so never
                # strictly exceeds such a threshold: all ranks are needed.
                n_features = len(scree)
            else:
                raise ValueError(
                    'threshold must be a fraction not greater than 1, '
                    'got {!r}'.format(threshold))
            threshold_percentage = 100 * threshold
            threshold_legend = '{} features required for {:.0f}% of inertia.'.format(
                n_features, threshold_percentage)
            # Threshold vertical line plot
            plt.vlines(n_features,
                       ymin=0,
                       ymax=threshold_percentage,
                       linestyles='--',
                       colors='red')
            plt.plot(ranks,
                     scree.cumsum(),
                     label=threshold_legend,
                     **curve_style)
            plt.legend(loc='lower right', fontsize=12)
        else:
            plt.plot(ranks, scree.cumsum(), **curve_style)
        plt.xlabel("Inertia axis rank", labelpad=20)
        plt.ylabel("Inertia (%)", labelpad=20)
        plt.title("Scree plot" +
                  "\n(Kaiser criterion = {} : Elbow criterion = {})".format(
                      self.kaiser_criterion(),
                      elbow_criterion(total_inertia=self.evr)),
                  pad=20)
        if save_as_img:
            plt.tight_layout()
            plt.savefig('scree.jpg')
        plt.show(block=False)
Esempio n. 7
0
 def scree_plot(self,
                pair_comp=False,
                save_as_img=False):  # (% Explained Variance)
     """Draw the scree plot of ``self.pca`` and show it without blocking.

     Bars give ``explained_variance_ratio_`` scaled by 100 for each axis
     rank; the red curve is their running total. The title reports the
     values of ``self.kaiser_criterion`` and ``self.elbow_criterion``.

     :param pair_comp: forwarded unchanged to both criterion methods.
     :param save_as_img: when true, save the figure as ``scree.jpg`` in the
         working directory.
     """
     variance_pct = self.pca.explained_variance_ratio_ * 100
     axis_ranks = np.arange(len(variance_pct)) + 1

     plt.bar(axis_ranks, variance_pct)
     plt.plot(axis_ranks, variance_pct.cumsum(), c="red", marker='o')
     plt.xlabel("rang de l'axe d'inertie", labelpad=20)
     plt.ylabel("pourcentage d'inertie", labelpad=20)

     criteria_line = "\n(Kaiser criterion = {} : Elbow criterion = {})".format(
         self.kaiser_criterion(pair_comp),
         self.elbow_criterion(pair_comp))
     plt.title("Eboulis des valeurs propres" + criteria_line, pad=20)

     if save_as_img:
         plt.tight_layout()
         plt.savefig('scree.jpg')
     plt.show(block=False)
Esempio n. 8
0
def correlation_matrix(df,
                       as_chart=True,
                       precision=2,
                       title=None,
                       rotate=90,
                       save_as_img=False,
                       size=(16, 12)):
    """
    """
    corr = df.corr()
    if as_chart:
        colormap = plt.cm.RdBu
        plt.figure(figsize=size)
        if title is None:
            title = 'Pearson Correlation of Features'
        plt.title(title, y=1.05, size=15, pad=20)
        mask = np.triu(np.ones_like(corr, dtype=np.bool))
        ax = sns.heatmap(corr,
                         linewidths=0.5,
                         vmax=1.0,
                         square=True,
                         cmap=colormap,
                         linecolor='white',
                         annot=True,
                         mask=mask,
                         cbar_kws={"shrink": .5},
                         fmt='.{}f'.format(precision))

        ax.set_xlim(0, df.shape[1] - 1)
        ax.set_ylim(df.shape[1], 1)
        plt.xticks(rotation=rotate)
        if save_as_img:
            plt.tight_layout()
            plt.savefig('corr_matrix.jpg')
        plt.show()
    else:
        return corr.style.background_gradient(
            cmap='coolwarm').set_precision(precision)
Esempio n. 9
0
    def plot_correlation_circle(self,
                                n_plan=None,
                                labels=None,
                                label_rotation=0,
                                lims=None,
                                save_as_img=False,
                                plot_size=(10, 8)):
        """
        Draw one correlation circle per pair of consecutive components
        (0, 1), (2, 3), ... of ``self.pca.components_``.

        :param n_plan: exclusive upper bound on the first component index of
            each pair; defaults to ``self.default_factorial_plan_nb``. Pairs
            whose second index is not below ``self.n_comp`` are skipped.
        :param labels: optional variable names, indexed by column position of
            the components matrix; only those lying inside the plot limits
            are written.
        :param label_rotation: rotation applied to the variable labels.
        :param lims: optional ``(xmin, xmax, ymin, ymax)`` plot limits.
        :param save_as_img: when true, save each figure as
            ``corr_circle_<k>.jpg`` (k is 1 for the first pair, otherwise the
            first component index of the pair).
        :param plot_size: size of each figure.
        """
        if n_plan is None:
            plane_bound = self.default_factorial_plan_nb
        else:
            plane_bound = n_plan
        # Pairs of component indices, e.g. [(0, 1), (2, 3), ...]
        plane_pairs = [(rank, rank + 1) for rank in range(0, plane_bound, 2)]
        components = self.pca.components_

        for first, second in plane_pairs:
            if not second < self.n_comp:
                continue

            _, ax = plt.subplots(figsize=plot_size)
            n_vars = components.shape[1]
            draw_arrows = n_vars < 30

            # Plot limits: explicit, unit square, or the span of the loadings
            if lims is not None:
                xmin, xmax, ymin, ymax = lims
            elif draw_arrows:
                xmin, xmax, ymin, ymax = -1, 1, -1, 1
            else:
                xmin = min(components[first, :])
                xmax = max(components[first, :])
                ymin = min(components[second, :])
                ymax = max(components[second, :])

            # Draw the variable vectors; with 30 or more of them, plain
            # faint segments replace the arrows (no arrow head is drawn).
            if draw_arrows:
                # (doc : https://matplotlib.org/api/_as_gen/matplotlib.pyplot.quiver.html)
                plt.quiver(np.zeros(n_vars),
                           np.zeros(n_vars),
                           components[first, :],
                           components[second, :],
                           angles='xy',
                           scale_units='xy',
                           scale=1,
                           color="grey")
            else:
                segments = [[[0, 0], [px, py]]
                            for px, py in components[[first, second]].T]
                ax.add_collection(
                    LineCollection(segments,
                                   axes=ax,
                                   alpha=.1,
                                   color='black'))

            # Variable names, restricted to the visible area
            if labels is not None:
                for idx, (px, py) in enumerate(
                        components[[first, second]].T):
                    if xmin <= px <= xmax and ymin <= py <= ymax:
                        plt.text(px,
                                 py,
                                 labels[idx],
                                 fontsize='14',
                                 ha='center',
                                 va='center',
                                 rotation=label_rotation,
                                 color="blue",
                                 alpha=0.5)

            # Unit circle
            unit_circle = plt.Circle((0, 0),
                                     1,
                                     facecolor='none',
                                     edgecolor='b')
            plt.gca().add_artist(unit_circle)

            # Apply the plot limits
            plt.xlim(xmin, xmax)
            plt.ylim(ymin, ymax)

            # Horizontal and vertical guide lines through the origin
            plt.plot([-1, 1], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-1, 1], color='grey', ls='--')

            # Axes labels with % explained variance
            x_caption = 'F{} ({}%)'.format(first + 1,
                                           round(100 * self.evr[first], 1))
            plt.xlabel(x_caption, labelpad=20)
            y_caption = 'F{} ({}%)'.format(second + 1,
                                           round(100 * self.evr[second], 1))
            plt.ylabel(y_caption, labelpad=20)
            plt.title("Cercle des corrélations (F{} et F{})".format(
                first + 1, second + 1),
                      pad=20)

            if save_as_img:
                plt.tight_layout()
                plt.savefig(
                    'corr_circle_{}.jpg'.format(1 if first == 0 else first))
            plt.show(block=False)
Esempio n. 10
0
 def plot_factorial_planes(self,
                           n_plan=None,
                           X_projected=None,
                           labels=None,
                           alpha=1,
                           illustrative_var=None,
                           illustrative_var_title=None,
                           save_as_img=False,
                           plot_size=(10, 8)):
     """
     Scatter the projected observations on the planes formed by consecutive
     components (0, 1), (2, 3), ..., one figure per plane.

     :param n_plan: exclusive upper bound on the first component index of
         each plane; defaults to ``self.default_factorial_plan_nb``. Planes
         whose second index is not below ``self.n_comp`` are skipped.
     :param X_projected: 2-D array of projected observations (rows) by
         component (columns); defaults to ``self.X_projected``.
     :param labels: optional point labels, indexed by row position.
     :param alpha: opacity of the points.
     :param illustrative_var: optional per-row values; one scatter series
         and one legend entry is drawn per distinct value.
     :param illustrative_var_title: optional legend title.
     :param save_as_img: when true, save each figure as
         ``factorial_plan_<k>.jpg`` (k is 1 for the first plane, otherwise
         the first component index of the plane).
     :param plot_size: size of each figure.
     """
     X_projected = self.X_projected if X_projected is None else X_projected
     factorial_plan_nb = self.default_factorial_plan_nb if n_plan is None else n_plan
     # Convert once, instead of once per figure.
     if illustrative_var is not None:
         illustrative_var = np.array(illustrative_var)
     axis_ranks = [(x, x + 1) for x in range(0, factorial_plan_nb, 2)]
     for d1, d2 in axis_ranks:
         if d2 >= self.n_comp:
             continue
         plt.figure(figsize=plot_size)
         # Display data points
         if illustrative_var is None:
             plt.scatter(X_projected[:, d1],
                         X_projected[:, d2],
                         alpha=alpha)
         else:
             for value in np.unique(illustrative_var):
                 selected = illustrative_var == value
                 plt.scatter(X_projected[selected, d1],
                             X_projected[selected, d2],
                             alpha=alpha,
                             label=value)
             plt.legend(title=illustrative_var_title)
         # Display data points labels
         if labels is not None:
             for i, (x, y) in enumerate(X_projected[:, [d1, d2]]):
                 plt.text(x,
                          y,
                          labels[i],
                          fontsize='12',
                          ha='center',
                          va='bottom')
         # Fix factorial plan limits: symmetric around the origin, with a
         # 10% margin beyond the largest absolute coordinate.
         boundary = np.max(np.abs(X_projected[:, [d1, d2]])) * 1.1
         plt.xlim([-boundary, boundary])
         plt.ylim([-boundary, boundary])
         # Display horizontal & vertical lines through the origin. They span
         # the plot limits, so they stay complete whatever the data scale
         # (a fixed +/-100 extent would stop short for larger projections).
         plt.plot([-boundary, boundary], [0, 0], color='grey', ls='--')
         plt.plot([0, 0], [-boundary, boundary], color='grey', ls='--')
         # Axes labels with % explained variance
         plt.xlabel('F{} ({}%)'.format(d1 + 1,
                                       round(100 * self.evr[d1], 1)),
                    labelpad=20)
         plt.ylabel('F{} ({}%)'.format(d2 + 1,
                                       round(100 * self.evr[d2], 1)),
                    labelpad=20)
         plt.title("Projection des individus (sur F{} et F{})".format(
             d1 + 1, d2 + 1),
                   pad=20)
         if save_as_img:
             plt.tight_layout()
             plt.savefig(
                 'factorial_plan_{}.jpg'.format(1 if d1 == 0 else d1))
         plt.show(block=False)