Exemple #1
0
    def jointplots_df(self,df0, idx_target=0):
        """Save per-feature histograms and pairwise joint plots, coloured by the target column.

        For every non-target column a ``plot_XX_XX_<col>.png`` histogram is
        written, and for every pair of non-target columns a
        ``pairplot_XX_YY_<c1>_<c2>.png`` joint plot is written. All files are
        saved under ``self.folder_out``.

        :param df0: source DataFrame; for each plot only the involved columns
            are selected and rows with missing values in them are dropped.
        :param idx_target: positional index of the column used as ``hue``.
        :return: None
        """

        columns = df0.columns.to_numpy()
        target = columns[idx_target]
        idx = numpy.delete(numpy.arange(0, len(columns)), idx_target)
        pal = 'tab10'

        # --- one histogram per feature -------------------------------------------------
        for i, c in enumerate(columns[idx]):
            df = df0[[target, c]].dropna()
            df = tools_DF.hash_categoricals(df)

            fig = plt.figure()
            fig = self.turn_light_mode(fig)
            plt.grid(color=self.clr_grid)
            ax = seaborn.histplot(data=df, x=c, hue=target, palette=pal,element='poly',legend=True)

            # histplot returns the Axes it drew on, so the legend is read through the
            # public accessor instead of the private ``_get_patches_for_fill.axes``
            # back-reference, which is not available on recent matplotlib releases.
            legend = ax.get_legend()
            self.recolor_legend_seaborn(legend)
            plt.savefig(self.folder_out + 'plot_%02d_%02d_%s.png' % (i, i,c),facecolor=fig.get_facecolor())
            plt.close()

        # --- one joint plot per pair of features ---------------------------------------
        for i in range(len(idx)-1):
            for j in range(i+1,len(idx)):
                c1, c2 = columns[idx[i]], columns[idx[j]]
                df = df0[[target, c1, c2]].dropna()
                df = tools_DF.hash_categoricals(df)

                # This figure only supplies the face colour chosen by turn_light_mode;
                # jointplot is a figure-level function and draws on a figure of its own.
                fig = plt.figure()
                fig = self.turn_light_mode(fig)
                J = seaborn.jointplot(data=df, x=c1, y=c2, hue=target,palette=pal,edgecolor=None)
                J.ax_joint.grid(color=self.clr_grid)
                J.ax_joint.set_facecolor(self.clr_bg)
                J.ax_marg_x.set_facecolor(self.clr_bg)
                J.ax_marg_y.set_facecolor(self.clr_bg)
                J.ax_joint.xaxis.label.set_color(self.clr_font)
                J.ax_joint.yaxis.label.set_color(self.clr_font)

                legend = J.ax_joint.legend()
                self.recolor_legend_plt(legend)

                plt.savefig(self.folder_out + 'pairplot_%02d_%02d_%s_%s.png'%(i,j,c1,c2),facecolor=fig.get_facecolor())
                # Close the joint-plot figure as well as the helper figure; otherwise
                # one open figure is left behind for every feature pair.
                plt.close(J.ax_joint.figure)
                plt.close(fig)


        return
    def get_roc(self, df0, idx_target, plots_train, plots_test):
        """Train four classifiers and publish their train/test ROC images.

        The target column is binarised (1 where the value is <= 0, otherwise 0)
        after rows with missing values are dropped. For each classifier the
        files ``ROC_train.png`` and ``ROC_test.png`` found in ``self.folder_out``
        (expected to be written by ``ML.E2E_train_test_df``) are renamed to a
        random name and wrapped into an ``html.Img``.

        :param df0: source DataFrame.
        :param idx_target: positional index of the target column.
        :param plots_train: indexable container; slot ``i`` receives the train image of classifier ``i``.
        :param plots_test: indexable container; slot ``i`` receives the test image of classifier ``i``.
        :return: the tuple ``(plots_train, plots_test)``.
        """
        target = df0.columns.to_numpy()[idx_target]
        df = df0.dropna()
        df[target] = (df[target] <= 0).astype(int)
        df = tools_DF.hash_categoricals(df)

        classifiers = [
            classifier_LM.classifier_LM(),
            classifier_SVM.classifier_SVM(),
            classifier_RF.classifier_RF(),
            classifier_KNN.classifier_KNN(),
        ]

        for slot, classifier in enumerate(classifiers):
            model = tools_ML_v2.ML(classifier, self.folder_out, self.P.dark_mode)
            model.E2E_train_test_df(df, idx_target, do_pca=False)

            # Train image first, then test image, each moved to a fresh random name.
            # NOTE: tempfile._get_candidate_names is a private helper of the tempfile module.
            for produced, container in (('ROC_train.png', plots_train),
                                        ('ROC_test.png', plots_test)):
                unique_name = next(tempfile._get_candidate_names()) + '.png'
                os.rename(self.folder_out + produced, self.folder_out + unique_name)
                container[slot] = [html.Img(src=self.app.get_asset_url(unique_name))]

        return plots_train, plots_test
Exemple #3
0
def ex_14_hash_categoricasl(df):
    """Print the head of ``df`` before and after ``tools_DF.hash_categoricals``.

    Rows containing missing values are dropped before either print.

    :param df: input DataFrame.
    :return: None
    """
    cleaned = df.dropna()
    print(cleaned.head())
    print()
    hashed = tools_DF.hash_categoricals(cleaned)
    print(hashed.head())
    def get_pairplots(self, df0, idx_target, pairplots):
        """Render 2D feature plots for pairs of the highest-scoring features.

        The four features with the largest ``F_score`` (as reported by
        ``tools_feature_importance.evaluate_feature_importance``) are paired up
        in order, and at most the first four pairs are plotted. The target is
        binarised per plot (1 where the value is <= 0, otherwise 0).

        :param df0: source DataFrame.
        :param idx_target: positional index of the target column.
        :param pairplots: indexable container; slot ``k`` receives the image of the k-th pair.
        :return: ``pairplots``.
        """
        importance = tools_feature_importance.evaluate_feature_importance(df0, idx_target)
        order = numpy.argsort(-importance['F_score'].to_numpy())
        top_features = importance['features'].to_numpy()[order][:4]
        target = df0.columns[idx_target]

        # All (first, second) combinations with first ranked before second.
        n_top = len(top_features)
        feature_pairs = [(top_features[a], top_features[b])
                         for a in range(n_top)
                         for b in range(a + 1, n_top)]

        for slot, (c1, c2) in enumerate(feature_pairs[:4]):
            df = df0[[target, c1, c2]].dropna()
            df[target] = (df[target] <= 0).astype(int)
            df = tools_DF.hash_categoricals(df)

            # NOTE: tempfile._get_candidate_names is a private helper of the tempfile module.
            image_name = next(tempfile._get_candidate_names()) + '.png'
            self.P.plot_2D_features_v3(df,
                                       remove_legend=True,
                                       add_noice=True,
                                       transparency=0.75,
                                       filename_out=image_name)
            pairplots[slot] = [html.Img(src=self.app.get_asset_url(image_name))]

        return pairplots
    def get_density(self, df0, idx_target, plots_dnst):
        """Plot 2D density maps of four classifiers on the two best-scoring features.

        A three-column frame (target, best feature, second-best feature) is
        built, so the target sits at position 0 for the ML helper calls. The
        target is binarised (1 where the value is <= 0, otherwise 0).

        :param df0: source DataFrame.
        :param idx_target: positional index of the target column in ``df0``.
        :param plots_dnst: indexable container; slot ``i`` receives the image of classifier ``i``.
        :return: ``plots_dnst``.
        """
        importance = tools_feature_importance.evaluate_feature_importance(df0, idx_target)
        order = numpy.argsort(-importance['F_score'].to_numpy())
        top_features = importance['features'].to_numpy()[order][:4]
        target = df0.columns[idx_target]

        df = df0[[target, top_features[0], top_features[1]]].dropna()
        df[target] = (df[target] <= 0).astype(int)
        df = tools_DF.hash_categoricals(df)

        classifiers = [
            classifier_LM.classifier_LM(),
            classifier_SVM.classifier_SVM(),
            classifier_RF.classifier_RF(),
            classifier_KNN.classifier_KNN(),
        ]

        for slot, classifier in enumerate(classifiers):
            model = tools_ML_v2.ML(classifier, self.folder_out, self.P.dark_mode)
            model.E2E_train_test_df(df, 0, do_pca=False)
            model.plot_density_2d(df, idx_target=0, N=30, filename_out='density.png')

            # Move the fixed-name output to a fresh random name before the next
            # classifier writes 'density.png' again.
            # NOTE: tempfile._get_candidate_names is a private helper of the tempfile module.
            unique_name = next(tempfile._get_candidate_names()) + '.png'
            os.rename(self.folder_out + 'density.png', self.folder_out + unique_name)
            plots_dnst[slot] = [html.Img(src=self.app.get_asset_url(unique_name))]

        return plots_dnst
Exemple #6
0
    def plot_TS_separatly(self, df, idx_target):
        """Draw every column of ``df`` as its own time-series image ``<column>.png``.

        :param df: input DataFrame; passed through ``tools_DF.hash_categoricals`` first.
        :param idx_target: accepted for interface compatibility; not used by this method.
        :return: None
        """
        hashed = tools_DF.hash_categoricals(df)

        for position, column_name in enumerate(hashed.columns):
            image_name = '%s.png' % column_name
            self.TS_matplotlib(hashed, idxs_target=[position], idx_feature=None, filename_out=image_name)
Exemple #7
0
def get_data_titanic():
    """Return two features and the label of seaborn's titanic dataset as arrays.

    Rows with missing values are dropped and the frame is passed through
    ``tools_DF.hash_categoricals`` before extraction.

    :return: ``(X, Y)`` where ``X`` holds the 'sex' and 'deck' columns and
        ``Y`` is the flattened 'survived' column.
    """
    label_name = 'survived'
    feature_names = ['sex', 'deck']

    titanic = seaborn.load_dataset('titanic').dropna()
    titanic = tools_DF.hash_categoricals(titanic)

    features = titanic.loc[:, feature_names].to_numpy()
    labels = titanic.loc[:, [label_name]].to_numpy().flatten()
    return features, labels
Exemple #8
0
def preprocess(df, idx_target):
    """Split a DataFrame into a feature matrix and a target vector.

    Rows with missing values are dropped and the frame is passed through
    ``tools_DF.hash_categoricals`` before the split.

    :param df: input DataFrame.
    :param idx_target: positional index of the target column.
    :return: ``(X, Y)`` where ``X`` contains every column except the target
        and ``Y`` contains the target column, both as numpy arrays.
    """
    cleaned = tools_DF.hash_categoricals(df.dropna())

    n_columns = len(cleaned.columns.to_numpy())
    feature_positions = numpy.delete(numpy.arange(0, n_columns), idx_target)

    features = cleaned.iloc[:, feature_positions].to_numpy()
    labels = cleaned.iloc[:, idx_target].to_numpy()
    return features, labels
Exemple #9
0
def ex_01_ugly():
    """Show a filled KDE joint plot of 'sex' vs 'age' for the titanic dataset, coloured by 'survived'.

    :return: None
    """
    target, c1, c2 = 'survived', 'sex', 'age'

    titanic = seaborn.load_dataset('titanic')
    subset = titanic[[target, c1, c2]].dropna()
    subset = tools_DF.hash_categoricals(subset)

    seaborn.jointplot(data=subset, x=c1, y=c2, hue=target, kind="kde", fill=True)
    plt.show()
Exemple #10
0
def ex_view_tree(df,idx_target):
    """Fit a depth-3 decision tree on ``df`` with debugging output enabled.

    Rows with missing values are dropped and the frame is passed through
    ``tools_DF.hash_categoricals``; the classifier receives the names of all
    non-target columns alongside the data.

    :param df: input DataFrame.
    :param idx_target: positional index of the target column.
    :return: None
    """
    prepared = tools_DF.hash_categoricals(df.dropna())
    X, Y = tools_DF.df_to_XY(prepared, idx_target, keep_categoirical=False)

    all_names = prepared.columns.to_numpy()
    feature_positions = numpy.delete(numpy.arange(0, len(all_names)), idx_target)
    feature_names = all_names[feature_positions]

    tree = classifier_DTree.classifier_DT(max_depth=3,folder_out=folder_out)
    tree.learn(X, Y,feature_names,do_debug=True)
Exemple #11
0
def plot_all_in_one(df0, idx_target):
    """Plot the target and its four best-scoring features as time series.

    Two images are produced through ``P.TS_seaborn``:
    'all_best_features.png' (the selected features together) and
    'target.png' (the target alone).

    :param df0: input DataFrame; hashed and scaled before scoring.
    :param idx_target: positional index of the target column.
    :return: None
    """
    prepared = tools_DF.scale(tools_DF.hash_categoricals(df0))

    target = prepared.columns[idx_target]
    feature_positions = numpy.delete(numpy.arange(0, prepared.shape[1]), idx_target)
    features = prepared.columns.to_numpy()[feature_positions]
    scores = tools_feature_importance.feature_imporance_F_score(prepared, idx_target)

    # Highest score first; keep the top four.
    top_features = features[numpy.argsort(-scores)][:4]
    selected = prepared[[target] + top_features.tolist()]
    selected = tools_DF.hash_categoricals(selected)

    # The target is column 0 of `selected`; the features occupy positions 1..n.
    feature_slots = numpy.arange(1, selected.shape[1]).tolist()
    P.TS_seaborn(selected, feature_slots, None, filename_out='all_best_features.png')
    P.TS_seaborn(selected, 0, None, filename_out='target.png')
Exemple #12
0
def ex_VIF(df):
    """Print ``variance_inflation_factor`` for every column, smallest value first.

    The frame is passed through ``tools_DF.hash_categoricals`` and rows with
    missing values are dropped before the factors are computed.

    :param df: input DataFrame.
    :return: None
    """
    prepared = tools_DF.hash_categoricals(df).dropna()
    names = prepared.columns.to_numpy()

    factors = numpy.array([
        variance_inflation_factor(prepared.values, position)
        for position in range(prepared.shape[1])
    ])

    for position in numpy.argsort(factors):
        print('%1.2f\t%s' % (factors[position], names[position]))
Exemple #13
0
    def pairplots_df(self,df0, idx_target=0,cumul_mode=False,add_noise=True):
        """Save 2D plots for every feature pair and 1D plots for every feature.

        For each pair of non-target columns a ``pairplot_*.png`` image is
        written and a line ``<file> <score>`` is appended to ``descript.ion``
        in ``self.folder_out``; the score is 100 times the summed
        ``mutual_info_classif`` of the two features against the target,
        truncated to an integer. Afterwards one ``plot_*.png`` image is written
        per feature via ``plot_1D_features_pos_neg``.

        :param df0: source DataFrame; for each plot only the involved columns
            are selected and rows with missing values in them are dropped.
        :param idx_target: positional index of the target column.
        :param cumul_mode: when True use ``plot_2D_features_cumul``, otherwise
            ``plot_2D_features_v3``.
        :param add_noise: forwarded to ``plot_2D_features_v3`` as ``add_noice``;
            also selects the transparency (0.95 when True, 0 otherwise).
        :return: None
        """

        descript_path = self.folder_out + "descript.ion"

        # Start from an empty description file.
        with open(descript_path, "w+"):
            pass

        columns = df0.columns.to_numpy()
        target = columns[idx_target]
        idx = numpy.delete(numpy.arange(0, len(columns)), idx_target)
        transparency = 0.95 if add_noise else 0

        for i in range(len(idx)-1):
            for j in range(i+1,len(idx)):
                c1, c2 = columns[idx[i]], columns[idx[j]]
                df = df0[[target,c1,c2]]
                df = df.dropna()
                df = tools_DF.hash_categoricals(df)
                I = int(100*mutual_info_classif(df.iloc[:,[1, 2]], df.iloc[:,0]).sum())
                file_out = 'pairplot_%02d_%02d_%s_%s_%02d.png' % (i, j, c1, c2, I)
                if cumul_mode:
                    self.plot_2D_features_cumul(df, remove_legend=True,filename_out=file_out)
                else:
                    self.plot_2D_features_v3(df, add_noice=add_noise,transparency=transparency,remove_legend=True,filename_out=file_out)
                # Append one entry per finished plot; the context manager guarantees
                # the handle is closed even if the write fails.
                with open(descript_path, "a+") as f_handle:
                    f_handle.write("%s %s\n" % (file_out, '%03d'%I))


        for i in range(len(idx)):
            c1 = columns[idx[i]]
            df = df0[[target, c1]]
            df = df.dropna()
            df = tools_DF.hash_categoricals(df)
            # Take the maximum of the column itself so that arange receives a plain
            # scalar; a one-element Series as the stop value depends on implicit
            # size-1 -> scalar conversion, which is deprecated.
            bins = numpy.arange(-0.5, df[c1].max() + 0.5, 0.25)
            self.plot_1D_features_pos_neg(df[[c1]].to_numpy(), df[target].to_numpy(), labels=True, bins=bins,filename_out='plot_%02d_%02d_%s.png' % (i, i,c1))


        return
Exemple #14
0
def ex3():
    """Print ``tools_DF.get_Mutual_Information`` for every pair of feature columns.

    The data comes from the tab-separated 'dataset_titanic.csv' in
    ``folder_in``; column 0 is used as the target. Each output line shows the
    two column names followed by the value returned for that pair.

    :return: None
    """
    idx_target = 0
    df = pd.read_csv(folder_in + 'dataset_titanic.csv', sep='\t').dropna()
    df = tools_DF.hash_categoricals(df)
    columns = df.columns

    idx = numpy.delete(numpy.arange(0, len(columns)), idx_target)

    # Every unordered pair of feature positions, first position before second.
    position_pairs = [(first, second)
                      for rank, first in enumerate(idx[:-1])
                      for second in idx[rank + 1:]]

    for first, second in position_pairs:
        score = tools_DF.get_Mutual_Information(df,idx_target,first,second)
        print(columns[first],columns[second],score)
Exemple #15
0
def ex_VIF2(df):
    """Print a regression-based variance inflation factor per column, smallest first.

    Each column is regressed with ``OLS`` on all remaining columns and the
    factor is ``1 / (1 - rsquared)`` rounded to two decimals. The frame is
    passed through ``tools_DF.hash_categoricals`` and rows with missing values
    are dropped beforehand.

    :param df: input DataFrame.
    :return: None
    """
    prepared = tools_DF.hash_categoricals(df).dropna()
    columns = prepared.columns

    VIFs = []
    for name in columns:
        others = columns.drop([name])
        r2 = OLS(prepared[name], prepared[others]).fit().rsquared
        VIFs.append(round(1 / (1 - r2), 2))

    for position in numpy.argsort(VIFs):
        print('%1.2f\t%s' % (VIFs[position], columns[position]))
    def get_pc(self, df0, idx_target, pca_plots):
        """Produce SVD, PCA, t-SNE and ISOMAP plots and publish them as images.

        Rows with missing values are dropped and the target is binarised
        (1 where the value is <= 0, otherwise 0). All four plots are drawn
        first; afterwards each output file in ``self.folder_out`` is renamed to
        a random name and wrapped into an ``html.Img``.

        :param df0: source DataFrame.
        :param idx_target: positional index of the target column.
        :param pca_plots: indexable container; slots 0..3 receive the SVD, PCA,
            t-SNE and ISOMAP images in that order.
        :return: ``pca_plots``.
        """
        target = df0.columns.to_numpy()[idx_target]
        df = df0.dropna()
        df[target] = (df[target] <= 0).astype(int)
        df = tools_DF.hash_categoricals(df)

        svd_png, pca_png = 'dim_SVD.png', 'dim_PCA.png'
        tsne_png, isomap_png = 'dim_tSNE.png', 'dim_ISOMAP.png'

        self.P.plot_SVD(df, idx_target, svd_png)
        self.P.plot_PCA(df, idx_target, pca_png)
        self.P.plot_tSNE(df, idx_target, tsne_png)
        self.P.plot_ISOMAP(df, idx_target, isomap_png)

        for slot, produced in enumerate((svd_png, pca_png, tsne_png, isomap_png)):
            # NOTE: tempfile._get_candidate_names is a private helper of the tempfile module.
            unique_name = next(tempfile._get_candidate_names()) + '.png'
            os.rename(self.folder_out + produced, self.folder_out + unique_name)
            pca_plots[slot] = [html.Img(src=self.app.get_asset_url(unique_name))]

        return pca_plots
Exemple #17
0
def ex_feature_correlation(df):
    """Save a heatmap of absolute pairwise correlations to ``folder_out + 'corr.png'``.

    Columns are reordered before plotting: column pairs are visited in
    decreasing order of absolute correlation and each column is placed the
    first time it shows up in a pair. Columns that only have zero correlation
    with everything else are appended at the end in their original order.

    :param df: input DataFrame; passed through ``tools_DF.hash_categoricals``
        before correlating.
    :return: None
    """

    df = tools_DF.hash_categoricals(df)
    columns = df.columns.to_numpy()
    # Explicit copy: the matrix is edited in place below, and ``to_numpy()`` may
    # return a read-only view when pandas copy-on-write is active.
    # NOTE(review): indexing `columns` with positions of the correlation matrix
    # assumes every column takes part in ``df.corr()`` - confirm for mixed dtypes.
    corrmat = abs(df.corr()).to_numpy(copy=True)

    # Self-correlation must not drive the ranking.
    for i in range(corrmat.shape[0]):
        corrmat[i, i] = 0

    n_features = corrmat.shape[1]
    ranks = []
    while len(ranks) < n_features:
        idx = numpy.argmax(corrmat)
        r, c = numpy.unravel_index(idx, corrmat.shape)
        if corrmat[r, c] == 0:
            # Nothing but zeros is left, so argmax would return the same cell
            # forever. Place the still-unranked columns in index order and stop.
            ranks.extend([k for k in range(n_features) if k not in ranks])
            break
        corrmat[r, c] = 0  # consume this pair
        if r not in ranks:
            ranks.append(r)
        if c not in ranks:
            ranks.append(c)

    ranks = numpy.array(ranks, dtype=int)

    corrmat = abs(df[columns[ranks]].corr())

    # Blank the diagonal so it is not annotated in the heatmap.
    for i in range(corrmat.shape[0]):
        corrmat.iloc[i, i] = numpy.nan

    plt.figure(figsize=(12, 8))
    sns.heatmap(corrmat,
                vmax=1,
                square=True,
                annot=True,
                fmt='.2f',
                cmap='GnBu',
                cbar_kws={"shrink": .5},
                robust=True)
    plt.savefig(folder_out + 'corr.png')

    return
Exemple #18
0
    features = df0.columns.to_numpy()[numpy.delete(
        numpy.arange(0, df0.shape[1]), idx_target)]
    FI = tools_feature_importance.feature_imporance_F_score(df0, idx_target)

    best_idx = numpy.argsort(-FI)
    best_features = features[best_idx][:4]
    df = df0[[target] + best_features.tolist()]

    df = tools_DF.hash_categoricals(df)
    P.TS_seaborn(df,
                 numpy.arange(1, df.shape[1]).tolist(),
                 None,
                 filename_out='all_best_features.png')
    P.TS_seaborn(df, 0, None, filename_out='target.png')

    return


# ----------------------------------------------------------------------------------------------------------------------
# Input data is read at import time; `idx_target` is the column index handed to the plotting calls.
df = pd.read_csv(folder_in + 'traffic_hourly_small.txt', delimiter=',')
idx_target = 1
# Alternative input:
#df, idx_target = pd.read_csv(folder_in + 'electricity_hourly_small.txt', delimiter=','), 1
# ----------------------------------------------------------------------------------------------------------------------
if __name__ == '__main__':

    # Hash the categorical columns, then draw one time-series plot per column.
    df = tools_DF.hash_categoricals(df)
    P.plot_TS_separatly(df, idx_target)

    # Other entry points, currently disabled:
    #plot_all_in_one(df, idx_target)
    #P.plot_target_feature(df, idx_target)