コード例 #1
0
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_true` versus `y_pred`.

    Parameters
    ----------
    y_true, y_pred
        Ground-truth and predicted labels; forwarded unchanged to
        `confusion_matrix` and `unique_labels`.
    classes
        Accepted for interface compatibility only. The tick labels are
        always derived from the labels present in `y_true` / `y_pred`
        so that they match the rows and columns of the matrix.
    normalize : bool
        When true, every row of the matrix is divided by its row sum and
        cells are rendered with two decimals instead of integer counts.
    title : str or None
        Plot title; a default depending on `normalize` is used when empty.
    cmap
        Colormap handed to `imshow`.

    Returns
    -------
    The matplotlib axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data. They are taken from the
    # function's own arguments (not from a module-level variable) so the
    # number of tick labels always equals the matrix dimension.
    classes = unique_labels(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    # The TN/FP/FN/TP names only make sense for a binary (2x2) matrix;
    # for any other shape the cells are annotated with the bare value
    # instead of indexing past the end of the 2x2 name table.
    cell_names = [['TN', 'FP'], ['FN', 'TP']] if cm.shape == (2, 2) else None
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            value = format(cm[i, j], fmt)
            if cell_names is not None:
                value = cell_names[i][j] + " = " + value
            ax.text(j, i, value,
                    ha="center", va="center",
                    # White text on dark cells, black on light ones.
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax
コード例 #2
0
def plotConfusionMatrix(y_true, y_pred, classes, cmap=plt.cm.Blues):
    """
    Plot the row-normalized confusion matrix of `y_true` versus `y_pred`.

    Parameters
    ----------
    y_true, y_pred
        Ground-truth and predicted labels; forwarded to `confusion_matrix`
        and `unique_labels`.
    classes
        Sequence of display names, indexed by the label values that occur
        in the data (labels are therefore expected to be valid integer
        indices into it).
    cmap
        Colormap handed to `imshow`.

    Returns
    -------
    The matplotlib axes the matrix was drawn on.
    """
    title = "Normalized confusion matrix"
    cm = confusion_matrix(y_true, y_pred)
    # Keep only the names of labels that appear in the data. np.asarray
    # allows a plain list to be fancy-indexed the same way as an ndarray.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    # Normalize every row by its sum.
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # The Axes properties are spelled xticklabels / yticklabels.
           xticklabels=classes,
           yticklabels=classes,
           title=title,
           ylabel='True Label',
           xlabel='Predicted Label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    fmt = '.2f'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    # White text on dark cells, black on light ones.
                    color="white" if cm[i, j] > thresh else "black")
    # Lay out and return only after every cell has been annotated.
    fig.tight_layout()
    return ax
コード例 #3
0
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
from sklearn.neural_network import MLPRegressor
from math import sqrt


def rmse(a, b):
    """Return the square root of `mean_squared_error(a, b)`."""
    mse = mean_squared_error(a, b)
    return sqrt(mse)


#############################################################################################################################
#Confirmed Module
# Line plot of the 'Confirmed' column against the 'Days' column.
sns.relplot(data=global_data, kind='line', x='Days', y='Confirmed')
plt.title("Confirmed Around The World")
# Slant the x tick labels; `ha` is shorthand for `horizontalalignment`.
plt.setp(plt.xticks()[1], ha='right', rotation=30)
plt.show()

# Global confirmed model (perceptron) -- model input parameters.
# X is read from the last column and Y from the fourth column from the
# end; both are reshaped into single-column 2-D arrays.
X = global_data.iloc[:, -1].values.reshape(-1, 1)
Y = global_data.iloc[:, -4].values.reshape(-1, 1)

#fitting the logistic growth curve for confirmed cases
コード例 #4
0
def compute(inp_dataset, input_path, output_path, de_analysis, n_pass):

    print("Current pass ", n_pass)
    import json
    import matplotlib as plt
    import csv
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from decimal import Decimal
    import seaborn as sns
    import pandas as pd
    import networkx as nx
    from sklearn.cluster import DBSCAN
    from sklearn.cluster import KMeans
    import operator
    import numpy as np
    import random
    import sys

    #csvData=[['data','x','y','type']]
    print("Processing the input data into datafames....")
    csvData = []
    count = 0
    #filename = "G:/Thesis/Dropclust/plots/output_normalized_own_cc.csv" filename = "G:/Thesis/Dropclust/plots/PCA_GENES/output_normalized_own_cc.csv" filename =
    #"G:/Thesis/Dropclust/output_normalized_zscore_cc1.csv" filename = "C:/Users/Swagatam/IdeaProjects/openOrd/output_normalized_own_cc.csv"
    filename = input_path + "/output_normalized_own_cc.csv"
    coord_data = pd.read_csv(filename, names=['data', 'x', 'y'])
    coord_data.set_index('data', inplace=True)
    data = []
    data_outlier = []
    with open(filename, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            #f=0
            #row=[float(i) for i in row]
            data.append(row)
            temp_outlier = []
            temp_outlier.append(row[1])
            temp_outlier.append(row[2])
            data_outlier.append(temp_outlier)
            temp = row
            #if row[0].isnumeric():
            #    temp.append('cell')
            if len(row[0]) >= 16:
                temp.append('cell')
            else:
                temp.append('gene')
                count = count + 1
            csvData.append(temp)

    # # DB SCAN

    # In[20]:

    if n_pass != 4:
        noise = []
        print("Performing clustering....")
        db = DBSCAN(eps=180, min_samples=55).fit_predict(data_outlier)
        final_data = []
        csvData = [['data', 'x', 'y', 'type']]
        for i in range(0, len(list(db))):
            if db[i] != -1:
                final_data.append(data[i])
                csvData.append(data[i])
            if db[i] == -1:
                noise.append(data[i][0])
        data = final_data

        n_clusters = len(set(db)) - (1 if -1 in list(db) else 0)
        print("Clustering done. the number of obtained clusters: ", n_clusters)
    else:
        remove_data = []

        prev_df = pd.read_csv(
            "Stardust_results/visualization_output/3_pass/data.csv",
            delimiter=",",
            index_col=False)
        prev_df.set_index('data', inplace=True)
        clusters_info = []
        for i in range(0, len(csvData)):
            if csvData[i][3] == 'cell':
                if csvData[i][0] in (prev_df.index):
                    clusters_info.append(prev_df.loc[csvData[i][0]]['cluster'])
                else:
                    remove_data.append(csvData[i])
            else:
                f = 0
                import pickle
                with open(
                        'Stardust_results/visualization_output/3_pass/de_genes_cluster.txt',
                        'rb') as fp:
                    de_gene_cluster = pickle.load(fp)
                for rank in range(0, len(de_gene_cluster)):
                    if csvData[i][0] in de_gene_cluster[rank]:
                        f = 1
                        clusters_info.append(de_gene_cluster[rank].index(
                            csvData[i][0]))
                        break
                if f == 0:
                    remove_data.append(csvData[i])
        for r in remove_data:
            csvData.remove(r)
        temp = [['data', 'x', 'y', 'type']]
        temp.extend(csvData)
        csvData = temp

    # In[13]:

    # # OUTLIER VISUALIZATION

    # In[21]:
    if n_pass != 4:
        print("Starting outlier detection....")
        data_type = []
        c = 0
        g = 0
        for i in range(0, len(coord_data)):
            if db[i] != -1:
                data_type.append("data")
            else:
                if len(coord_data.index[i]) >= 16:
                    data_type.append("cell_outliers")
                else:
                    g = g + 1
                    data_type.append("gene_outliers")
        coord_data["data_type"] = data_type
        data_colors = ["lightblue"]
        if g > 0:
            noise_colors = ['blue', 'red']
        else:
            noise_colors = ['blue']
        coord_data["alpha"] = np.where(coord_data['data_type'] == 'data', 0.5,
                                       1.0)
        plt.figure(figsize=(6, 4.5))
        #ax = sns.scatterplot(x="x", y="y", data=coord_data[coord_data['alpha']==0.5],hue="data_type",palette=sns.xkcd_palette(data_colors),sizes=(50,100),size="data_type",alpha=0.3)
        #sns.scatterplot(x="x", y="y", data=coord_data[coord_data['alpha']==1.0],hue="data_type",palette=sns.xkcd_palette(noise_colors),sizes=(50,100),size="data_type",marker="^",alpha=1.0,ax=ax)
        marker = {"gene_outliers": "^", "cell_outliers": "^"}
        ax = sns.scatterplot(x="x",
                             y="y",
                             data=coord_data[coord_data['alpha'] == 0.5],
                             hue="data_type",
                             palette=sns.xkcd_palette(data_colors),
                             sizes=(50, 100),
                             size="data_type",
                             linewidth=0.0,
                             s=10,
                             alpha=0.3)
        sns.scatterplot(x="x",
                        y="y",
                        data=coord_data[coord_data['alpha'] == 1.0],
                        hue="data_type",
                        palette=sns.xkcd_palette(noise_colors),
                        sizes=(100, 50),
                        size="data_type",
                        style="data_type",
                        markers=marker,
                        alpha=1.0,
                        linewidth=0.0,
                        s=10,
                        legend='brief',
                        ax=ax)
        #plt.legend(title=='')
        ax.legend(bbox_to_anchor=(1.1, 1.05), frameon=False)
        sns.despine(bottom=False, left=False)
        plt.xlabel("dim1")
        plt.ylabel("dim2")
        plt.savefig(output_path + 'outliers_visualization.png',
                    bbox_inches='tight')
        print("Outliers removed from the dataset....")

    # # POST-HOC CLUSTER ASSIGNMENT

    # In[23]:

    print("Starting post hoc clustering....")
    neighbor_df = pd.read_hdf(
        'Stardust_results/build_output/1_pass/neighbor.h5', 'df')
    if 'Unnamed: 0' in list(neighbor_df.columns):
        neighbor_df.set_index('Unnamed: 0', inplace=True)
    p = 0
    col = list(neighbor_df.columns)
    index = list(neighbor_df.index)
    cell_dict = dict()
    column_dict = dict()
    for i in range(len(col)):
        column_dict[i] = col[i]
    for i in range(len(list(neighbor_df.index))):
        row = neighbor_df.iloc[i]
        col_ind = list(row.to_numpy().nonzero())[0]
        for ind in col_ind:
            if index[i] in cell_dict.keys():
                cell_dict[index[i]].append(column_dict[ind])
            else:
                temp = []
                temp.append(column_dict[ind])
                cell_dict[index[i]] = temp
    cluster_assign = []
    for key_cell in cell_dict.keys():
        clust = dict()
        cells = cell_dict[key_cell]
        for cell in cells:
            if n_pass == 4:
                if cell in list(prev_df.index):
                    cluster = prev_df.loc[cell]['cluster']
                else:
                    cluster = -1
            else:
                cluster = db[list(coord_data.index).index(cell)]
            if cluster not in clust.keys():
                clust[cluster] = 1
            else:
                clust[cluster] = clust[cluster] + 1
        max_cluster = max(clust.items(), key=operator.itemgetter(1))[0]
        if max_cluster == -1:
            continue
        cluster_assign.append(max_cluster)
        x_total = 0
        y_total = 0
        count = 0
        for cell in cells:
            if (n_pass != 4
                    and db[list(coord_data.index).index(cell)] == max_cluster
                ) or (n_pass == 4 and cell in list(prev_df.index)
                      and prev_df.loc[cell]['cluster'] == max_cluster):
                count = count + 1
                x_total = x_total + coord_data.loc[cell]['x']
                y_total = y_total + coord_data.loc[cell]['y']
        temp = []
        temp.append(key_cell)
        temp.append(x_total / count)
        temp.append(y_total / count)
        temp.append('cell')
        p = p + 1
        csvData.append(temp)
    print("Post hoc clustering done....")

    # In[24]:

    with open(output_path + 'data.csv', 'w') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerows(csvData)
    csvFile.close()
    data_df = pd.read_csv(output_path + "data.csv",
                          delimiter=",",
                          index_col=False)
    if n_pass != 4:
        clusters_info = [x for x in db if x != -1]
        clusters_info = clusters_info + cluster_assign
    else:
        clusters_info = clusters_info + cluster_assign
        data_df['cluster'] = clusters_info
    data_df.to_csv(output_path + 'data.csv')
    n_clusters = len(list(set(clusters_info)))
    print("cluster saved ....")

    n_clusters = len(data_df['cluster'].unique())
    colors = random.sample(seaborn_colors, n_clusters)

    colors = random.sample(seaborn_colors, n_clusters)
    plt.figure(figsize=(5, 5))
    #cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
    ax = sns.scatterplot(x="x",
                         y="y",
                         data=data_df,
                         hue="cluster",
                         palette=sns.xkcd_palette(colors),
                         linewidth=0.0,
                         s=2)
    ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
    for cl in range(n_clusters):
        plt.annotate(cl,
                     data_df.loc[data_df['cluster'] == cl, ['x', 'y']].mean(),
                     horizontalalignment='center',
                     verticalalignment='center',
                     size=10,
                     weight='bold',
                     color="black")
    sns.despine(bottom=False, left=False)
    plt.xlabel("sd1", fontsize=20)
    plt.ylabel("sd2", fontsize=20)
    plt.setp(ax.spines.values(), linewidth=2)
    plt.yticks([], linewidth=20)
    plt.xticks([])
    plt.savefig(output_path + "cluster_visualization.png",
                bbox_inches='tight',
                dpi=600)
    plt.savefig(output_path + "cluster_visualization.pdf",
                bbox_inches='tight',
                dpi=600)

    if n_pass == 3:
        from sklearn.datasets import make_blobs
        from sklearn.metrics import silhouette_samples, silhouette_score
        silhouette_avg = silhouette_score(data_df[['x', 'y']],
                                          data_df['cluster'])
        sample_silhouette_values = silhouette_samples(data_df[['x', 'y']],
                                                      data_df['cluster'])
        print(silhouette_avg)

        y_lower = 10
        import matplotlib.cm as cm
        #fig, (ax1, ax2) = plt.subplots(1, 2)
        fig = plt.figure(figsize=(4, 7))
        #fig.set_size_inches(18, 7)
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[data_df['cluster'] == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            plt.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        plt.title("The silhouette plot for the various clusters.")
        plt.xlabel("silhouette coefficient", fontsize=20)
        plt.ylabel("Cluster label", fontsize=20)
        plt.axvline(x=silhouette_avg, color="red", linestyle="--")

        plt.yticks([])  # Clear the yaxis labels / ticks
        plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        sns.despine(bottom=False, left=False)
        fig.savefig(output_path + "/silhouette.pdf",
                    bbox_inches='tight',
                    dpi=600)
        fig.savefig(output_path + "/silhouette.png",
                    bbox_inches='tight',
                    dpi=600)

    #  #  MARKER FINDING
    data_df = pd.read_csv(output_path + "data.csv",
                          delimiter=",",
                          index_col=False)
    data_df.set_index('data', inplace=True)
    import pickle
    if n_pass == 2:
        path = 'Stardust_results/visualization_output/1_pass'
    if n_pass == 3:
        path = 'Stardust_results/visualization_output/2_pass'
    if n_pass == 4:
        path = 'Stardust_results/visualization_output/3_pass'
    if n_pass != 1:
        with open(path + '/de_genes_cluster.txt', 'rb') as fp:
            de_gene_cluster = pickle.load(fp)

        marker = []
        disp_marker = []
        for cl in range(n_clusters):
            cls = data_df[data_df['cluster'] == cl]
            gene_df = cls[cls['type'] == 'gene']
            f = 0
            for rank in range(len(de_gene_cluster)):
                if f == 1:
                    break
                for gene in de_gene_cluster[rank]:
                    if gene in list(gene_df.index):
                        disp_marker.append(gene)
                        #print(cl)
                        f = 1
                        break
        marker = disp_marker

        #sys.exit(0)

    # # CELL GENE MARKER

    # In[28]:
    from sklearn.neighbors import KNeighborsRegressor
    prev_pass_data = pd.read_csv(
        'Stardust_results/visualization_output/3_pass/data_openOrd.csv')
    prev_pass_data.set_index('data', inplace=True)
    data_df = pd.read_csv(output_path + '/data.csv')
    data_df.set_index('data', inplace=True)
    gene_df = data_df[data_df['type'] == 'gene']
    x_gene_fit = list(gene_df['x'])
    y_gene_fit = list(gene_df['y'])
    cells = list(prev_pass_data.index)
    cell_list = []
    x_coord = []
    y_coord = []

    for i in range(len(cells)):
        if cells[i] in list(data_df.index):
            cell_list.append(cells[i])
            x_coord.append(prev_pass_data.iloc[i]['x'])
            y_coord.append(prev_pass_data.iloc[i]['y'])

    prev_df = pd.DataFrame(index=cell_list)
    prev_df['x'] = x_coord
    prev_df['y'] = y_coord

    import numpy as np
    from sklearn.linear_model import Lasso
    from sklearn.neighbors import KNeighborsRegressor
    import pickle
    cells = []
    genes = []
    gene_coord_x = []
    gene_coord_y = []

    for i in range(n_clusters):
        clust_data = data_df[data_df['cluster'] == i]
        clust_cells = clust_data[clust_data['type'] == 'cell']
        clust_genes = clust_data[clust_data['type'] == 'gene']
        cells.extend(list(clust_cells.index))
        genes.extend(list(clust_genes.index))
        if len(list(clust_genes.index)) == 0:
            continue
        model1 = KNeighborsRegressor(n_neighbors=4)

        model2 = KNeighborsRegressor(n_neighbors=4)
        temp = []
        for cell in list(clust_cells.index):
            if cell in list(prev_df.index):
                temp.append(cell)
        clust_cells = clust_cells.loc[temp]
        model1.fit(
            np.array(list(clust_cells['x'])).reshape((-1, 1)),
            np.array(list(prev_df.loc[list(clust_cells.index)]['x'])).reshape(
                (-1, 1)))

        filename = output_path + '/sd_x_KNN_model.sav'
        pickle.dump(model1, open(filename, 'wb'))
        #model1 = pickle.load(open(filename, 'rb'))
        x_gene_pred = model1.predict(
            np.array(list(clust_genes['x'])).reshape((-1, 1)))
        gene_coord_x.extend(x_gene_pred)
        model2.fit(
            np.array(list(clust_cells['y'])).reshape((-1, 1)),
            np.array(list(prev_df.loc[list(clust_cells.index)]['y'])).reshape(
                (-1, 1)))

        filename = output_path + '/sd_y_KNN_model.sav'
        pickle.dump(model2, open(filename, 'wb'))
        #model2 = pickle.load(open(filename, 'rb'))
        y_gene_pred = model2.predict(
            np.array(list(clust_genes['y'])).reshape((-1, 1)))
        gene_coord_y.extend(y_gene_pred)

    with open(output_path + "/sd_gene_coord_x.txt", 'wb') as fp:
        pickle.dump(gene_coord_x, fp)
    with open(output_path + "/sd_gene_coord_y.txt", 'wb') as fp:
        pickle.dump(gene_coord_y, fp)

    #with open (output_path+"/sd_gene_coord_x.txt", 'rb') as fp:
    #        gene_coord_x = pickle.load(fp)
    #with open (output_path+"/sd_gene_coord_y.txt", 'rb') as fp:
    #        gene_coord_y = pickle.load(fp)

    import matplotlib.pyplot as plt, mpld3
    from scipy.spatial import ConvexHull, convex_hull_plot_2d
    prev_pass_data = pd.read_csv(
        'Stardust_results/visualization_output/3_pass/data_openOrd.csv')
    prev_pass_data["alpha"] = np.where(prev_pass_data['type'] == 'gene', 1.0,
                                       0.5)
    color_gene = ["light blue"]
    color_cell = ["red"]
    #fig,ax1 = plt.subplots()
    plt.figure(figsize=(6, 6))
    ax = sns.scatterplot(x="x",
                         y="y",
                         data=prev_pass_data[prev_pass_data['alpha'] == 0.5],
                         hue="type",
                         palette=sns.xkcd_palette(color_gene),
                         sizes=(10, 5),
                         size="type",
                         alpha=0.3,
                         s=10)
    #sns.scatterplot(x="x", y="y", data=data_df[data_df['alpha']==1.0],hue="type",palette=sns.xkcd_palette(color_cell),sizes=(20,5),size="type",marker="^",alpha=1.0,ax=ax,s=10)
    sns.scatterplot(x=gene_coord_x,
                    y=gene_coord_y,
                    palette=sns.xkcd_palette(color_cell),
                    sizes=(20, 5),
                    marker="^",
                    alpha=1.0,
                    ax=ax,
                    s=10)
    for c in range(n_clusters):
        p = data_df[data_df["cluster"] == c]
        p = p[['x', 'y']]
        points = p.values
        hull = ConvexHull(points)
        #for simplex in hull.simplices:
    #    sns.lineplot(points[simplex, 0], points[simplex, 1])
    x_list = []
    y_list = []
    if n_pass != 1:
        for m in marker:
            #x_list.append(data_df.loc[m]['x'])
            x_list.append(gene_coord_x[genes.index(m)])
            #y_list.append(data_df.loc[m]['y'])
            y_list.append(gene_coord_y[genes.index(m)])
        for label, x, y in zip(marker, x_list, y_list):
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(-20, 20),
                textcoords='offset points',
                ha='right',
                va='bottom',
                #bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=0'))
    ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
    sns.despine(bottom=False, left=False)
    plt.xlabel("sd1", fontsize=20)
    plt.ylabel("sd2", fontsize=20)
    plt.setp(ax.spines.values(), linewidth=2)
    plt.yticks([], linewidth=20)
    plt.xticks([])
    plt.savefig(output_path + "sd_embedding.png", bbox_inches='tight', dpi=600)
    plt.savefig(output_path + "sd_embedding.pdf", bbox_inches='tight', dpi=600)

    import matplotlib.pyplot as plt, mpld3
    from scipy.spatial import ConvexHull, convex_hull_plot_2d
    #data_df["alpha"] = np.where(data_df['type'] == 'gene', 1.0, 0.5)
    prev_pass_data.set_index('data', inplace=True)
    temp_data = prev_pass_data[prev_pass_data['type'] == 'cell']
    temp_genes = data_df[data_df['type'] == 'gene']
    for pos in range(0, len(genes)):
        temp_genes.at[genes[pos], 'x'] = gene_coord_x[pos]
        temp_genes.at[genes[pos], 'y'] = gene_coord_y[pos]
    temp_data.append(temp_genes)
    color_gene = ["light blue"]
    color_cell = ["red"]
    n_clusters = len(data_df['cluster'].unique())
    colors = random.sample(seaborn_colors, n_clusters)
    #fig,ax1 = plt.subplots()
    plt.figure(figsize=(6, 6))
    ax = sns.scatterplot(x="x",
                         y="y",
                         data=temp_data,
                         hue="cluster",
                         palette=sns.xkcd_palette(colors),
                         s=2,
                         linewidth=0.0)
    #sns.scatterplot(x="x", y="y", data=data_df[data_df['alpha']==1.0],hue="type",palette=sns.xkcd_palette(color_cell),sizes=(20,5),size="type",marker="^",alpha=1.0,ax=ax,s=10)
    #sns.scatterplot(x=gene_coord_x, y=gene_coord_y,palette=sns.xkcd_palette(color_cell),sizes=(20,5),marker="^",alpha=1.0,ax=ax,s=20)
    for c in range(n_clusters):
        p = data_df[data_df["cluster"] == c]
        p = p[['x', 'y']]
        points = p.values
        hull = ConvexHull(points)
        #for simplex in hull.simplices:
    #    sns.lineplot(points[simplex, 0], points[simplex, 1])
    x_list = []
    y_list = []
    d1 = prev_pass_data[prev_pass_data['alpha'] == 0.5]
    for cl in range(n_clusters):
        plt.annotate(cl,
                     d1.loc[d1['cluster'] == cl, ['x', 'y']].mean(),
                     horizontalalignment='center',
                     verticalalignment='center',
                     size=10,
                     weight='bold',
                     color="black")
    if n_pass != 1:
        for m in marker:
            #x_list.append(data_df.loc[m]['x'])
            x_list.append(gene_coord_x[genes.index(m)])
            #y_list.append(data_df.loc[m]['y'])
            y_list.append(gene_coord_y[genes.index(m)])
        for label, x, y in zip(marker, x_list, y_list):
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(-20, 20),
                textcoords='offset points',
                ha='right',
                va='bottom',
                #bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=0'))
    ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
    sns.despine(bottom=False, left=False)
    plt.xlabel("sd1", fontsize=20)
    plt.ylabel("sd2", fontsize=20)
    plt.setp(ax.spines.values(), linewidth=2)
    plt.yticks([], linewidth=20)
    plt.xticks([])
    plt.savefig(output_path + "sd_color_embedding.png",
                bbox_inches='tight',
                dpi=600)
    plt.savefig(output_path + "sd_color_embedding.pdf",
                bbox_inches='tight',
                dpi=600)
    #sys.exit(0)
    # # UMAP CELL GENE MARKER # #

    if n_pass == 4:

        import pickle
        with open('Stardust_results/build_output/1_pass/umap_coord.txt',
                  'rb') as fp:
            umap_coord = pickle.load(fp)
        louvain_df = pd.read_csv(
            'Stardust_results/build_output/1_pass/louvain_cluster_df.csv')
        louvain_df.set_index('Unnamed: 0', inplace=True)
        #data_df = pd.read_csv('F:/output/output_visualize_melanoma_pca/3rd_pass/data.csv')
        data_df = pd.read_csv(output_path + '/data.csv')
        data_df.set_index('data', inplace=True)
        gene_df = data_df[data_df['type'] == 'gene']
        x_gene_fit = list(gene_df['x'])
        y_gene_fit = list(gene_df['y'])
        cells = list(louvain_df.index)
        cell_list = []
        x_coord = []
        y_coord = []
        for i in range(len(cells)):
            if cells[i] in list(data_df.index):
                cell_list.append(cells[i])
                x_coord.append(umap_coord[i][0])
                y_coord.append(umap_coord[i][1])
        umap_df = pd.DataFrame(index=cell_list)
        umap_df['x'] = x_coord
        umap_df['y'] = y_coord

        import numpy as np
        from sklearn.linear_model import Lasso
        from sklearn.neighbors import KNeighborsRegressor
        import pickle
        cells = []
        genes = []
        gene_coord_x = []
        gene_coord_y = []
        for i in range(n_clusters):
            clust_data = data_df[data_df['cluster'] == i]
            clust_cells = clust_data[clust_data['type'] == 'cell']
            clust_genes = clust_data[clust_data['type'] == 'gene']
            cells.extend(list(clust_cells.index))
            genes.extend(list(clust_genes.index))
            if len(list(clust_genes.index)) == 0:
                continue
            model1 = KNeighborsRegressor(n_neighbors=5)

            model2 = KNeighborsRegressor(n_neighbors=5)

            model1.fit(
                np.array(list(clust_cells['x'])).reshape((-1, 1)),
                np.array(list(umap_df.loc[list(
                    clust_cells.index)]['x'])).reshape((-1, 1)))

            filename = output_path + '/scanpy_x_KNN_model.sav'
            pickle.dump(model1, open(filename, 'wb'))
            #model1 = pickle.load(open(filename, 'rb'))
            x_gene_pred = model1.predict(
                np.array(list(clust_genes['x'])).reshape((-1, 1)))
            gene_coord_x.extend(x_gene_pred)
            model2.fit(
                np.array(list(clust_cells['y'])).reshape((-1, 1)),
                np.array(list(umap_df.loc[list(
                    clust_cells.index)]['y'])).reshape((-1, 1)))

            filename = output_path + '/scanpy_y_KNN_model.sav'
            pickle.dump(model2, open(filename, 'wb'))
            #model2 = pickle.load(open(filename, 'rb'))
            y_gene_pred = model2.predict(
                np.array(list(clust_genes['y'])).reshape((-1, 1)))
            gene_coord_y.extend(y_gene_pred)

        with open(output_path + "/scanpy_gene_coord_x.txt", 'wb') as fp:
            pickle.dump(gene_coord_x, fp)
        with open(output_path + "/scanpy_gene_coord_y.txt", 'wb') as fp:
            pickle.dump(gene_coord_y, fp)

        #with open (output_path+"/scanpy_gene_coord_x.txt", 'rb') as fp:
        #    gene_coord_x = pickle.load(fp)
        #with open (output_path+"/scanpy_gene_coord_y.txt", 'rb') as fp:
        #    gene_coord_y = pickle.load(fp)

        #n_clusters = len(list(data_df['cluster'].unique()))

        u_map_x = []
        u_map_y = []
        for ind in list(data_df.index):
            if ind in list(louvain_df.index):

                u_map_x.append(umap_coord[list(
                    louvain_df.index).index(ind)][0])
                u_map_y.append(umap_coord[list(
                    louvain_df.index).index(ind)][1])
            else:
                u_map_x.append(gene_coord_x[genes.index(ind)])
                u_map_y.append(gene_coord_y[genes.index(ind)])
        data_df['umap_x'] = u_map_x
        data_df['umap_y'] = u_map_y

        #        colors = random.sample(seaborn_colors,n_clusters)
        #colors = colors3
        plt.figure(figsize=(5, 5))
        #cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
        ax = sns.scatterplot(x="umap_x",
                             y="umap_y",
                             data=data_df,
                             hue="cluster",
                             palette=sns.xkcd_palette(colors),
                             linewidth=0.0,
                             s=2)
        ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
        for cl in range(n_clusters):
            plt.annotate(cl,
                         data_df.loc[data_df['cluster'] == cl,
                                     ['umap_x', 'umap_y']].mean(),
                         horizontalalignment='center',
                         verticalalignment='center',
                         size=10,
                         weight='bold',
                         color="black")
        sns.despine(bottom=False, left=False)
        plt.xlabel("umap1", fontsize=20)
        plt.ylabel("umap2", fontsize=20)
        plt.setp(ax.spines.values(), linewidth=2)
        plt.yticks([], linewidth=20)
        plt.xticks([])
        plt.savefig(output_path + 'umap_clustering.png',
                    bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + 'umap_clustering.pdf',
                    bbox_inches='tight',
                    dpi=600)

        import matplotlib.pyplot as plt, mpld3
        from scipy.spatial import ConvexHull, convex_hull_plot_2d
        data_df["alpha"] = np.where(data_df['type'] == 'gene', 1.0, 0.5)
        color_gene = ["light grey"]
        color_cell = ["red"]
        #fig,ax1 = plt.subplots()
        plt.figure(figsize=(6, 6))

        ax = sns.scatterplot(x="umap_x",
                             y="umap_y",
                             data=data_df[data_df['alpha'] == 0.5],
                             hue="type",
                             palette=sns.xkcd_palette(color_gene),
                             sizes=(10, 5),
                             size="type",
                             alpha=0.3,
                             s=10)
        sns.scatterplot(x="umap_x",
                        y="umap_y",
                        data=data_df[data_df['alpha'] == 1.0],
                        hue="type",
                        palette=sns.xkcd_palette(color_cell),
                        sizes=(20, 5),
                        size="type",
                        marker="^",
                        alpha=1.0,
                        ax=ax,
                        s=10)
        for c in range(n_clusters):
            p = data_df[data_df["cluster"] == c]
            p = p[['umap_x', 'umap_y']]
            points = p.values
            hull = ConvexHull(points)
            #for simplex in hull.simplices:
            #    sns.lineplot(points[simplex, 0], points[simplex, 1])
        x_list = []
        y_list = []
        for m in marker:
            x_list.append(data_df.loc[m]['umap_x'])
            #x_list.append(gene_coord_x[genes.index(m)])
            y_list.append(data_df.loc[m]['umap_y'])
            #y_list.append(gene_coord_y[genes.index(m)])
        for cl in range(n_clusters):
            plt.annotate(cl,
                         data_df.loc[data_df['cluster'] == cl,
                                     ['umap_x', 'umap_y']].mean(),
                         horizontalalignment='center',
                         verticalalignment='center',
                         size=10,
                         weight='bold',
                         color="black")
        for label, x, y in zip(marker, x_list, y_list):
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(-20, 20),
                textcoords='offset points',
                ha='right',
                va='bottom',
                #bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=0'))
        ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
        sns.despine(bottom=False, left=False)
        plt.xlabel("umap1", fontsize=20)
        plt.ylabel("umap2", fontsize=20)
        plt.setp(ax.spines.values(), linewidth=2)
        plt.yticks([], linewidth=20)
        plt.xticks([])
        plt.savefig(output_path + 'umap_embedding.png',
                    bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + 'umap_embedding.pdf',
                    bbox_inches='tight',
                    dpi=600)

        import matplotlib.pyplot as plt, mpld3
        from scipy.spatial import ConvexHull, convex_hull_plot_2d
        data_df["alpha"] = np.where(data_df['type'] == 'gene', 1.0, 0.5)
        color_gene = ["light grey"]
        color_cell = ["red"]
        #fig,ax1 = plt.subplots()
        plt.figure(figsize=(6, 6))
        #       colors = color
        ax = sns.scatterplot(x="umap_x",
                             y="umap_y",
                             data=data_df[data_df['alpha'] == 0.5],
                             hue="cluster",
                             linewidth=0.0,
                             sizes=(2, 5),
                             size="type",
                             palette=sns.xkcd_palette(colors),
                             s=2)
        sns.scatterplot(x="umap_x",
                        y="umap_y",
                        data=data_df[data_df['alpha'] == 1.0],
                        hue="type",
                        palette=sns.xkcd_palette(color_cell),
                        linewidth=0.1,
                        marker="^",
                        ax=ax,
                        alpha=1.0,
                        s=10)
        for c in range(n_clusters):
            p = data_df[data_df["cluster"] == c]
            p = p[['umap_x', 'umap_y']]
            points = p.values
            hull = ConvexHull(points)
            #for simplex in hull.simplices:
            #    sns.lineplot(points[simplex, 0], points[simplex, 1])
        x_list = []
        y_list = []
        for m in marker:
            x_list.append(data_df.loc[m]['umap_x'])
            y_list.append(data_df.loc[m]['umap_y'])
        for cl in range(n_clusters):
            plt.annotate(cl,
                         data_df.loc[data_df['cluster'] == cl,
                                     ['umap_x', 'umap_y']].mean(),
                         horizontalalignment='center',
                         verticalalignment='center',
                         size=10,
                         weight='bold',
                         color="black")
        for label, x, y in zip(marker, x_list, y_list):
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(-20, 20),
                textcoords='offset points',
                ha='right',
                va='bottom',
                #bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=0'))
        ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
        sns.despine(bottom=False, left=False)
        plt.xlabel("umap1", fontsize=20)
        plt.ylabel("umap2", fontsize=20)
        plt.setp(ax.spines.values(), linewidth=2)
        plt.yticks([], linewidth=20)
        plt.xticks([])
        plt.savefig(output_path + 'umap_color_embedding.png',
                    bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + 'umap_color_embedding.pdf',
                    bbox_inches='tight',
                    dpi=600)
コード例 #5
0
def process(path):
    """Train a decision tree on the CSV at *path* and write metrics and charts.

    Columns 1-5 of the CSV are used as features and column 6 as the target.
    The target is truncated to multiples of 100 before a
    ``DecisionTreeClassifier`` is fitted on a random train/test split.

    Side effects (all paths are relative to the current working directory):

    * ``static/results/resultDT.csv``   - one predicted value per test row
    * ``static/results/DTMetrics.csv``  - MSE, MAE, R-SQUARED, RMSE, ACCURACY
    * ``static/results/DTMetricsValueBarChart.png`` - bar chart of the metrics
    * ``static/results/DTMetricsValue.png``         - two-ring pie of the metrics
    * ``static/results/DTCompare.png``  - original vs. predicted bar chart

    Args:
        path: Location of the input CSV, passed straight to ``pd.read_csv``.

    Returns:
        Tuple ``(y_test, y_pred)`` of the held-out targets and predictions.
    """
    dataset = pd.read_csv(path)
    X = dataset.iloc[:, 1:6].values
    y = dataset.iloc[:, 6].values

    # Truncate the target toward zero to a multiple of 100 so it can be
    # treated as a class label by the classifier.
    y = (y / 100).astype(int) * 100

    print(X)
    print(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model2 = DecisionTreeClassifier()
    model2.fit(X_train, y_train)
    y_pred = model2.predict(X_test)

    # Context manager guarantees the file is closed even if a write fails.
    with open("static/results/resultDT.csv", "w") as predictions_file:
        predictions_file.write("ID,Predicted Value" + "\n")
        for row_id, predicted in enumerate(y_pred, start=1):
            predictions_file.write(str(row_id) + "," + str(predicted) + "\n")

    # NOTE(review): the MSE is divided by 1000 here; the reason is not
    # visible in this function - confirm the scaling is intended.
    mse = abs(round(mean_squared_error(y_test, y_pred), 2)) / 1000
    mae = abs(round(mean_absolute_error(y_test, y_pred), 2))
    # NOTE(review): abs() hides a negative R^2; presumably kept so every
    # value stays non-negative for the pie chart below - confirm.
    r2 = abs(round(r2_score(y_test, y_pred), 2))
    rms = abs(round(np.sqrt(mean_squared_error(y_test, y_pred)), 2))
    ac = round(accuracy_score(y_test, y_pred), 2) * 100

    print("---------------------------------------------------------")
    print("MSE VALUE FOR Decision Tree IS %f " % mse)
    print("MAE VALUE FOR Decision Tree IS %f " % mae)
    print("R-SQUARED VALUE FOR Decision Tree IS %f " % r2)
    print("RMSE VALUE FOR Decision Tree IS %f " % rms)
    print("ACCURACY VALUE Decision Tree IS %f" % ac)
    print("---------------------------------------------------------")

    metric_rows = [
        ("MSE", mse),
        ("MAE", mae),
        ("R-SQUARED", r2),
        ("RMSE", rms),
        ("ACCURACY", ac),
    ]
    with open('static/results/DTMetrics.csv', 'w') as metrics_file:
        metrics_file.write("Parameter,Value" + "\n")
        for metric_name, metric_value in metric_rows:
            metrics_file.write(metric_name + "," + str(metric_value) + "\n")

    # The charts are driven by the values as read back from the CSV.
    df = pd.read_csv('static/results/DTMetrics.csv')
    acc = df["Value"]
    alc = df["Parameter"]
    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#8c564b"]

    # Bar chart of the five metrics.
    fig = plt.figure()
    plt.bar(alc, acc, color=colors)
    plt.xlabel('Parameter')
    plt.ylabel('Value')
    plt.title(' Decision Tree Metrics Value')
    fig.savefig('static/results/DTMetricsValueBarChart.png')

    group_names = ['MSE', 'MAE', 'R2', 'RMSE', 'ACCURACY']

    # One colormap per metric; both rings share the same wedge colors.
    blues, reds, greens, oranges, purples = [
        plt.cm.Blues, plt.cm.Reds, plt.cm.Greens, plt.cm.Oranges,
        plt.cm.Purples
    ]
    ring_colors = [blues(0.6), reds(0.6), greens(0.6), oranges(0.1),
                   purples(0.6)]

    # First Ring (outside): wedges labelled with the metric names.
    fig, ax = plt.subplots()
    ax.axis('equal')
    mypie, _ = ax.pie(acc,
                      radius=1.0,
                      labels=group_names,
                      colors=ring_colors)
    plt.setp(mypie, width=0.3, edgecolor='white')

    # Second Ring (inside): same sizes, labelled with the metric values.
    mypie2, _ = ax.pie(acc,
                       radius=1.0 - 0.3,
                       labels=acc,
                       labeldistance=0.7,
                       colors=ring_colors)
    plt.setp(mypie2, width=0.4, edgecolor='white')
    plt.margins(0, 0)

    plt.title('Decision Tree Metrics Value')
    plt.savefig('static/results/DTMetricsValue.png')

    # Side-by-side bars comparing original and predicted values per record.
    barWidth = 0.25
    plt.subplots(figsize=(12, 8))

    # Set position of bar on X axis
    br1 = np.arange(len(y_pred))
    br2 = [x + barWidth for x in br1]

    # Make the plot
    plt.bar(br1,
            y_test,
            color='r',
            width=barWidth,
            edgecolor='grey',
            label='Original')
    plt.bar(br2,
            y_pred,
            color='g',
            width=barWidth,
            edgecolor='grey',
            label='Predicted')

    # Axis labels and legend
    plt.xlabel('Number of Records', fontweight='bold', fontsize=15)
    plt.ylabel('Fish Weight', fontweight='bold', fontsize=15)
    plt.legend()
    plt.savefig('static/results/DTCompare.png')

    return y_test, y_pred


#process("dataset.csv")
コード例 #6
0
ファイル: ufo.py プロジェクト: selinozdas/UFO-Sightings
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Heatmap of sighting counts: one row per day of the month (1-31), one
# column per calendar month.
months = ['January','February','March','April','May','June','July','August','September','October','November','December']
days = list(range(1, 32))
density = np.zeros((len(days), len(months)))

# 'day' and 'month' are 1-based in us_data, hence the -1 to get grid indices.
for _, sighting in us_data.iterrows():
    day_idx = int(sighting['day'] - 1)
    month_idx = int(sighting['month'] - 1)
    density[day_idx, month_idx] += 1

fig, ax = plt.subplots()
im = ax.imshow(density)

# Show every tick and label it with the matching month / day.
ax.set_xticks(np.arange(len(months)))
ax.set_xticklabels(months)
ax.set_yticks(np.arange(len(days)))
ax.set_yticklabels(days)
plt.setp(ax.get_xticklabels(),
         rotation=45,
         ha="right",
         rotation_mode="anchor")

# Write the integer count in the centre of every cell.
for day_idx, month_counts in enumerate(density):
    for month_idx, count in enumerate(month_counts):
        text = ax.text(month_idx, day_idx, int(count),
                       ha="center", va="center", color="w")

ax.set_title("The common days the UFO sightings are reported")
#fig.tight_layout()
fig.set_size_inches(10, 20, forward=True)
plt.show()

# Set-up for a second count grid with four rows (labels 1-4) and one
# column per calendar month.
# NOTE(review): the loop body below is cut off in this excerpt, so how rows
# are assigned to the four buckets cannot be confirmed from here.
days = list(range(1,5))
months = ['January','February','March','April','May','June','July','August','September','October','November','December']
density = np.zeros((4,12))
for index,row in us_data.iterrows():
    if int(row['day']-1)<8:
コード例 #7
0
# Draw one panel per case type. Each entry is (column in full_grouped,
# line colour); the column name doubles as the y-axis label.
panels = (
    ('Confirmed', 'red'),
    ('Deaths', 'black'),
    ('Recovered', 'green'),
)
for panel_index, (column, line_color) in enumerate(panels):
    panel = ax[panel_index]
    panel.plot(full_grouped[column], color=line_color)
    panel.set_ylim(ymin=0, ymax=None)
    panel.set_ylabel(column)
    panel.set_xlabel('Months')
    panel.grid(True, which='major')

# Set general layout for figure (all axis)
fig.tight_layout()
plt.setp(ax[2].get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

# pyplot.show() accepts only the keyword argument `block`; the positional
# string previously passed here raises TypeError on current matplotlib.
# NOTE(review): the commented-out calls below suggest a per-country
# `plot(...)` helper may have been intended - confirm.
plt.show()
#plot('Germany')
#plot('France')
#plot('Spain')
#plot('Italy')
コード例 #8
0
# Impact velocities from 0.1 m/s in 0.1 m/s steps, up to (not including) 10 m/s
impact_velocity = np.arange(0.1, 10, 0.1)

# Use conservation of energy, ignore aerodynamic effects: h = v^2 / (2 g)
height = impact_velocity**2 / (2 * g)

# Plot in SI?
if PLOT_SI:
    # Set the plot size - 3x2 aspect ratio is best.
    # Use plt.figure (the pyplot factory), not the plt.Figure class: only the
    # factory registers the figure with pyplot, so that plt.gca() below
    # returns axes on this figure instead of creating a new default-sized one.
    fig = plt.figure(figsize=(6, 4))
    ax = plt.gca()
    plt.subplots_adjust(bottom=0.17, left=0.17, top=0.96, right=0.96)

    # Change the axis units font
    plt.setp(ax.get_ymajorticklabels(), fontsize=18)
    plt.setp(ax.get_xmajorticklabels(), fontsize=18)

    # Hide the top and right spines
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    # Turn on the plot grid and set appropriate linestyle and color
    ax.grid(True, linestyle=':', color='0.75')
    ax.set_axisbelow(True)

    # Define the X and Y axis labels
    plt.xlabel('Impact Velocity (m/s)', fontsize=22, weight='bold', labelpad=5)
    plt.ylabel('Drop Height (m)', fontsize=22, weight='bold', labelpad=10)
コード例 #9
0
pickle_out.close()

# Read the pickled history back; the context manager closes the file even if
# unpickling fails.
with open("MNIST_history.pickle", "rb") as pickle_in:
    saved_history = pickle.load(pickle_in)
print(saved_history)

# NOTE(review): the curves below are taken from the in-memory `history`
# object, not from `saved_history` loaded above - confirm this is intended.
history_dict = history.history

loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)

# Plot the validation/test loss against the training loss.
line1 = plt.plot(epochs, val_loss_values, label='Validation/Test Loss')
line2 = plt.plot(epochs, loss_values, label='Training Loss')
plt.setp(line1, linewidth=2.0, marker='+', markersize=10.0)
plt.setp(line2, linewidth=2.0, marker='4', markersize=10.0)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
plt.show()

# Visualize the accuracy
# (author's remark: reasonable accuracy...)
# Plotting our accuracy charts

history_dict = history.history

# NOTE(review): presumably 'acc' / 'val_acc' hold one value per epoch, like
# the loss lists above - verify against how `history` was produced.
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']