Example #1
0
        # Build both input paths from the dataset identifier, then load the
        # feature table (first CSV row is the header, first column is the
        # index) and the matching target array.
        features_csv = '../../data/simulated/mvnsim/mvnsim' + dataset + '.csv'
        target_npy = '../../data/simulated/mvnsim/target' + dataset + '.npy'

        X = pd.read_csv(features_csv, sep=',', header=0, index_col=0)
        y = np.load(target_npy)
    #print(y)
    #print(y.shape)
    #print(X.shape)

    # Boxplot of the raw (unscaled) data. The two title strings are passed
    # positionally after X and y, and the output file name carries the
    # `nowtime` stamp.
    # Other modes left over from earlier runs: output='show', or
    # output='plotly' together with
    # ply_title="Initial distribution of dataset %s" % dataset.
    distribution_boxplot(
        X,
        y,
        "Initial category 1 distribution of dataset %s" % dataset,
        # Fixed: this title used to read "of distribution %s", which did not
        # match the category 1 title above.
        "Initial category 0 distribution of dataset %s" % dataset,
        output='save',
        path='%sinitialdist_%s.png' % (filepath, nowtime))

    print('\nBoxplot of initial data for dataset %s saved.' % dataset)

    ## PREPROCESSING ##

    # Scale the initial data to centre it, then wrap the result in a
    # DataFrame for plotting.
    # NOTE(review): assuming scale() returns a plain array, from_records()
    # gives the frame default integer labels rather than X's column names -
    # confirm this is intended.
    X_scaled = scale(X)
    X_scaled_df = pd.DataFrame.from_records(X_scaled)

    distribution_boxplot(
                      #path='%sinitialdist.png' % filepath,
                      )
 
 print('\nBoxplot of initial data for dataset %s saved.' % dataset)
 '''
 ## PREPROCESSING ##
 
 #Scale initial data to centre data
 
 X_scaled = scale(X)
 X_scaled_df = pd.DataFrame.from_records(X_scaled)
 
 distribution_boxplot(X_scaled_df,
                      y,
                      "Scaled category 1 distribution of dataset %s" % dataset,
                      "Scaled category 0 distribution of dataset %s" % dataset,
                      #output='show'
                      output='save',
                      path='../../figs/out/%s/%s/scaledist.png' % (scriptname, dataset)
                      )
 #print(X_scaled.shape)
 print('\nBoxplot of scaled data for dataset %s saved.' % dataset)
 
 #Initiate KPCA with various kernels
 
 # As I'm using 500 variables, 0.002 is the default gamma (1/n_variables)
 # I only explicitly state it at this point so I can display it on graphs
 gamma = 0.002
 
 #compute kernels not preloaded into kpca
 #laplacian
 K_lap = laplacian_kernel(X_scaled, gamma=gamma) 
# Load the toy feature table and its target vector. Both file names are
# derived from the dataset identifier; the CSV's first row is the header and
# its first column is used as the index.
features_csv = '../../data/simulated/mvnsim/mvnsim' + dataset + '.csv'
target_npy = '../../data/simulated/mvnsim/target' + dataset + '.npy'

X = pd.read_csv(features_csv, sep=',', header=0, index_col=0)
y = np.load(target_npy)
#print(y)
#print(y.shape)
#print(X.shape)

# Titles and output location for the boxplot of the raw (unscaled) data.
initial_title_cat1 = "Initial category 1 distribution of dataset %s" % dataset
initial_title_cat0 = "Initial category 0 distribution of dataset %s" % dataset
initial_plot_path = '%sinitialdist.png' % filepath

# Other modes left over from earlier runs: output='show', or output='plotly'
# together with ply_title="Initial distribution of dataset %s" % dataset.
distribution_boxplot(
    X,
    y,
    initial_title_cat1,
    initial_title_cat0,
    output='save',
    path=initial_plot_path,
)

print('\nBoxplot of initial data for dataset %s saved.' % dataset)

## PREPROCESSING ##

# Scale the initial data to centre it, then wrap the scaled values in a
# DataFrame.
X_scaled = scale(X)
X_scaled_df = pd.DataFrame.from_records(X_scaled)
#print(y)

# NOTE(review): X2 and y2 are not assigned anywhere in the visible code - the
# only assignment in view is the commented-out target_split() call below.
# Confirm they are defined upstream; otherwise these prints raise NameError.
#X2, y2 = target_split(inp_csv, 500)
# Report the dimensions of the feature frame. The one-element tuple wrapper
# stops %-formatting from unpacking the shape tuple into several arguments.
print('\nShape of mvn dataframe: %s\n' % (X2.shape, ))
#print(X2)
# Same report for the target array.
print('\nShape of make_classification target array: %s\n' % (y2.shape, ))
#print(y2)

# Hand-built version of the category boxplots: one wide figure with two
# panels stacked vertically.
plt.figure(figsize=(50, 15))

# Upper panel: the part of inp_csv selected by the mask inp_target == 1.
plt.subplot(2, 1, 1)
category_a_rows = inp_csv[inp_target == 1]
img1 = sns.boxplot(data=category_a_rows)
plt.title("Category A distribution of dataset 015", fontsize=20)

# Lower panel: the part of inp_csv selected by the mask inp_target == 0.
plt.subplot(2, 1, 2)
category_b_rows = inp_csv[inp_target == 0]
img2 = sns.boxplot(data=category_b_rows)
plt.title("Category B distribution of dataset 015", fontsize=20)

# The figure is only displayed; writing it to disk is currently disabled:
#plt.savefig('../../data/simulated/mvnsim/mvnsim%sdist.png' % simname)
plt.show()

# Close the figure once it has been shown.
plt.close()

# Call the distribution_boxplot helper with the same data and the same two
# titles as the manual matplotlib/seaborn plot, using on-screen output.
category_a_title = "Category A distribution of dataset 015"
category_b_title = "Category B distribution of dataset 015"

distribution_boxplot(
    inp_csv,
    inp_target,
    category_a_title,
    category_b_title,
    output='show',
)