def print_pred_distrib_figure(filename, bins, histo, dx, J_opt):
    """Save the neutral/deleterious score histograms as a PNG figure.

    Parameters
    ----------
    filename : str
        Output path; whatever extension it carries is replaced by ``.png``.
    bins : sequence
        Bin edges; every edge except the last is used as the left edge of
        a bar.
    histo : sequence
        ``histo[0]`` holds the bar heights drawn as 'neutral' and
        ``histo[1]`` those drawn as 'deleterious'.
    dx : float
        Width of each bar.
    J_opt : float
        x position of the dashed vertical marker line.

    The function returns ``None`` without plotting when
    ``_try_import_matplotlib()`` returns ``None``.
    """
    assert isinstance(filename, str), 'filename must be a string'
    filename = os.path.splitext(filename)[0] + '.png'

    # Nothing to do when the optional plotting backend is unavailable.
    if _try_import_matplotlib() is None:
        return

    from matplotlib import pyplot as plt

    canvas = plt.figure(figsize=(7, 7))

    # Both histograms share geometry and transparency; only the data,
    # colour and legend entry differ.
    shared = dict(width=dx, align='edge', alpha=0.7)
    plt.bar(bins[:-1], histo[0], color='blue', label='neutral', **shared)
    plt.bar(bins[:-1], histo[1], color='red', label='deleterious', **shared)

    plt.axvline(x=J_opt, color='k', ls='--', lw=1)
    plt.ylabel('distribution')
    plt.xlabel('predicted score')
    plt.legend()

    canvas.savefig(filename, format='png', bbox_inches='tight')
    plt.close()
    # Put the global rc settings back to Matplotlib's defaults.
    plt.rcParams.update(plt.rcParamsDefault)
    LOGGER.info(f'Predictions distribution saved to {filename}')
def scatter_plot(P, L, pcIdx1, pcIdx2, letterList, rev):
    """Scatter two columns of ``P`` against each other, one colour per label.

    Parameters
    ----------
    P : array-like
        Indexed as ``P[row_mask, column]``; column ``pcIdx2`` is drawn on
        the x axis and column ``pcIdx1`` on the y axis.
    L : array-like
        Per-row labels; rows where ``L == letter`` are drawn together.
    pcIdx1, pcIdx2 : int
        Column indices into ``P``.
    letterList : sequence
        Labels to draw, in order. Colours repeat after ten labels.
    rev :
        Value appended to the output file name.

    The figure is written into ``pDir`` (defined outside this function)
    and then shown.
    """
    fig = plt.figure()
    # following the convention in lecture note ScatterPlot.html
    colors = ["r", "lime", "b", "y", "c", "m", "k", "tan", "pink", "darkred"]
    for i, letter in enumerate(letterList):
        rows = L == letter
        # Wrap around the palette instead of raising IndexError when there
        # are more labels than colours.
        plt.scatter(P[rows, pcIdx2], P[rows, pcIdx1], s=0.1,
                    c=colors[i % len(colors)], label=letter)

    # plt.axes() with no arguments adds a *new* Axes on current Matplotlib,
    # so the aspect must be set on the Axes that holds the scatter.
    plt.gca().set_aspect('equal')
    plt.xlabel("Principle Component {}".format(pcIdx2))
    plt.ylabel("Principle Component {}".format(pcIdx1))
    plt.axhline(0, color='grey')
    plt.axvline(0, color='grey')
    plt.ylim([-5000, 5000])
    plt.xlim([-5000, 5000])
    plt.legend()
    plt.gca().invert_yaxis()
    fig.set_size_inches(8, 8)

    fName = os.path.join(
        pDir,
        'scatter_PC{}_PC{}_{}_{}.png'.format(pcIdx1, pcIdx2,
                                             "".join(letterList), rev))
    savefig(fName, bbox_inches='tight')
    plt.show()
def plotFeatImportance(pathOut, imp, oob, oos, method, tag=0, simNum=0,
                       **kargs):
    """Plot mean feature importance as horizontal bars with std error bars.

    Parameters
    ----------
    pathOut : str
        Prefix of the output path; ``'featImportance_<simNum>.png'`` is
        appended to it.
    imp : pandas.DataFrame
        Must provide ``'mean'`` and ``'std'`` columns; one row per feature.
    oob, oos : float
        Scores shown (rounded to 4 decimals) in the title.
    method : str
        When equal to ``'MDI'`` the x axis is clamped and a dotted line is
        drawn at ``1 / number_of_features``.
    tag :
        Value shown in the title; converted with ``str``.
    simNum :
        Value shown in the title and used in the file name.
    **kargs :
        Accepted for call compatibility; not used.
    """
    # plot mean imp bars with std
    mpl.figure(figsize=(10, imp.shape[0] / 5.))
    imp = imp.sort_values('mean', ascending=True)
    ax = imp['mean'].plot(kind='barh', color='b', alpha=0.25,
                          xerr=imp['std'], error_kw={'ecolor': 'r'})
    if method == 'MDI':
        mpl.xlim([0, imp.sum(axis=1).max()])
        mpl.axvline(1. / imp.shape[0], lw=1., color='r', ls='dotted')
    ax.get_yaxis().set_visible(False)
    # The y axis is hidden, so write each feature name inside its bar.
    for patch, feature in zip(ax.patches, imp.index):
        ax.text(patch.get_width() / 2,
                patch.get_y() + patch.get_height() / 2,
                feature, ha='center', va='center', color='k')
    # str(tag): the default tag=0 is an int and cannot be concatenated
    # to a string directly.
    mpl.title('tag=' + str(tag) + ' | simNUm=' + str(simNum) + ' | oob=' +
              str(round(oob, 4)) + ' | oos=' + str(round(oos, 4)))
    mpl.savefig(pathOut + 'featImportance_' + str(simNum) + '.png', dpi=100)
    mpl.clf()
    mpl.close()
    return
def plotLearning(x, scores, epsilons, filename, lines=None):
    """Plot epsilon and the trailing-mean score on two overlaid axes.

    Epsilon is drawn as a line on the left/bottom axes; the mean of the
    current and up to 20 preceding scores is drawn as a scatter on a
    frameless overlay whose y axis sits on the right. Optional ``lines``
    are x positions of vertical markers. The figure is saved to
    ``filename``.
    """
    fig = plt.figure()
    eps_axis = fig.add_subplot(111, label="1")
    score_axis = fig.add_subplot(111, label="2", frame_on=False)

    # Left-hand axis: exploration rate.
    eps_axis.plot(x, epsilons, color="C0")
    eps_axis.set_xlabel("Game", color="C0")
    eps_axis.set_ylabel("Epsilon", color="C0")
    for which in ('x', 'y'):
        eps_axis.tick_params(axis=which, colors="C0")

    # Trailing mean over the window [t - 20, t].
    trailing_mean = np.array(
        [np.mean(scores[max(0, t - 20):(t + 1)]) for t in range(len(scores))],
        dtype=float)

    # Right-hand axis: smoothed score; its x axis is hidden because it
    # duplicates the one underneath.
    score_axis.scatter(x, trailing_mean, color="C1")
    score_axis.get_xaxis().set_visible(False)
    score_axis.yaxis.tick_right()
    score_axis.set_ylabel('Score', color="C1")
    score_axis.yaxis.set_label_position('right')
    score_axis.tick_params(axis='y', colors="C1")

    if lines is not None:
        for position in lines:
            plt.axvline(x=position)

    plt.savefig(filename)
def silhouette():
    """Draw a silhouette plot for the clustering stored by the third pass.

    Reads ``Stardust_results/visualization_output/3_pass/data.csv`` (the
    columns ``data``, ``x``, ``y`` and ``cluster`` are used), prints the
    average silhouette score and writes ``silhouette.pdf`` and
    ``silhouette.png`` into ``Stardust_results/analysis``.

    Exits the interpreter via ``sys.exit()`` when ``Stardust_results``
    does not exist.
    """
    if not os.path.exists("Stardust_results"):
        print(
            "The directory structure Stardust_results doest not exist. Please run run_stardust first"
        )
        sys.exit()
    if not os.path.exists("Stardust_results/analysis"):
        os.mkdir("Stardust_results/analysis")
    output_path = "Stardust_results/analysis/"

    from sklearn.metrics import silhouette_samples, silhouette_score
    import matplotlib.cm as cm

    data_df = pd.read_csv(
        'Stardust_results/visualization_output/3_pass/data.csv',
        delimiter=",",
        index_col=False)
    data_df.set_index('data', inplace=True)

    coords = data_df[['x', 'y']]
    silhouette_avg = silhouette_score(coords, data_df['cluster'])
    sample_silhouette_values = silhouette_samples(coords, data_df['cluster'])
    print("silhouette score ", silhouette_avg)

    fig = plt.figure(figsize=(4, 7))

    # Iterate over the labels that are actually present. Counting with
    # range(n) would silently skip clusters whenever the labels are not
    # exactly 0..n-1; for labels 0..n-1 the output is unchanged.
    labels = sorted(data_df['cluster'].unique())
    n_clusters = len(labels)

    y_lower = 10
    for rank, label in enumerate(labels):
        # Silhouette values of this cluster's samples, sorted ascending.
        in_cluster = (data_df['cluster'] == label).values
        cluster_values = sample_silhouette_values[in_cluster]
        cluster_values.sort()

        size_cluster = cluster_values.shape[0]
        y_upper = y_lower + size_cluster
        color = cm.nipy_spectral(float(rank) / n_clusters)
        plt.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          cluster_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)
        # Label the silhouette band with its cluster label at the middle.
        plt.text(-0.05, y_lower + 0.5 * size_cluster, str(label))
        # Leave a gap of 10 rows before the next band.
        y_lower = y_upper + 10

    plt.title("The silhouette plot for the various clusters.")
    plt.xlabel("silhouette coefficient", fontsize=20)
    plt.ylabel("Cluster label", fontsize=20)
    plt.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.yticks([])  # Clear the yaxis labels / ticks
    plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    sns.despine(bottom=False, left=False)

    # os.path.join avoids the doubled slash produced by string concatenation.
    for extension in ("pdf", "png"):
        fig.savefig(os.path.join(output_path, "silhouette." + extension),
                    bbox_inches='tight',
                    dpi=600)
def getCommentLengthsDistribution(comments):
    """Show a histogram of ``len(comment)`` for every entry in ``comments``.

    Entries are read by position (``comments[0]`` .. ``comments[n-1]``).
    Bins are 10 wide and cover 0 to 490; a dashed red line marks 200.
    """
    lengths = [len(comments[pos]) for pos in range(len(comments))]

    plt.hist(lengths, bins=np.arange(0, 500, 10))
    plt.xlabel('Number of Words in Comment')
    plt.ylabel('Comment Counts')
    plt.title('Histogram of Word Counts in Comments')
    plt.axvline(x=200, color='r', linestyle='dashed', linewidth=2)
    plt.show()
def Plot3Data(x_data,
              y_data,
              z_data,
              ylabel,
              zlabel,
              plottitle,
              savename,
              LOGFILE,
              participant,
              section,
              savepath,
              verticallineindices=[0],
              grid=1,
              xlabel='Time (in Seconds)'):
    """Plot two series against a shared x axis and save the figure.

    Parameters
    ----------
    x_data, y_data, z_data : sequence
        ``y_data`` is drawn as a thin red line, ``z_data`` as a dashed
        green line, both against ``x_data``.
    ylabel, zlabel :
        Legend entries; also combined into the y-axis label.
    plottitle : str
        Figure title.
    savename, savepath : str
        The figure is written to ``savepath + savename``.
    LOGFILE : str
        CSV file that receives one appended row when plotting fails.
    participant, section :
        Written into that log row.
    verticallineindices : sequence
        When it has more than one element, a vertical line is drawn at
        ``x_data[i]`` for every ``i`` whose entry equals 1. The default
        list is only read, never mutated.
    grid : int
        ``1`` enables a grid.
    xlabel : str
        x-axis label.

    Any exception raised while plotting is printed and logged rather than
    propagated. The figure is closed in every case.
    """
    if DEBUG == 1:
        print("Plotting function called for : ", ylabel)

    fig = None
    try:
        #starting the plot
        fig = plt.figure()
        fig.tight_layout()
        plt.title(plottitle)
        plt.plot(x_data, y_data, 'r-', label=ylabel, linewidth=0.1)
        plt.plot(x_data, z_data, 'g--', label=zlabel)
        if DEBUG == 1:
            print("First few elements of the x,y and z data are : ",
                  x_data[0:3], '\n', y_data[0:3], '\n', z_data[0:3])
        if len(verticallineindices) > 1:
            # A one-element list (the default) means "no markers".
            for i, flag in enumerate(verticallineindices):
                if flag == 1:
                    plt.axvline(x=x_data[i], linewidth='1')
        plt.xlabel(xlabel)
        plt.ylabel(str(ylabel) + ' and ' + str(zlabel))
        plt.legend(loc='upper right')
        if grid == 1:
            plt.grid(color='b', linestyle='-.', linewidth=0.1)
        # NOTE(review): recent Matplotlib releases no longer accept the
        # `quality` keyword; if it is rejected the call raises and is
        # logged below. Left unchanged - confirm the target version.
        plt.savefig(savepath + savename,
                    bbox_inches='tight',
                    dpi=900,
                    quality=100)
    except Exception as e:
        print("Exception at the plotting function in PlottingFunctions.py : ",
              e)
        # The context manager closes the log even if the write fails.
        with open(LOGFILE, 'a') as log_file:
            csv.writer(log_file).writerow([
                ' Exception in the plotting function ', ' Participant: ',
                participant, ' Section : ', section, '  ', ' Exception: ', e
            ])
    finally:
        # Previously the figure stayed open whenever plotting raised.
        if fig is not None:
            plt.close(fig)
def vertical_mean_line(x, **kwargs):
    """Draw a dashed line at the mean of ``x`` and annotate mean and std.

    ``kwargs['color']`` (default ``"r"``) colours both the line and the
    text. The caption is placed just right of the line when the mean is
    below 6 and 1.4 units left of it otherwise; these offsets and the
    fixed text height are data-specific tuning values.
    """
    colour = kwargs.get("color", "r")
    centre = x.mean()
    plt.axvline(centre, linestyle="--", color=colour)

    text_style = dict(size=15, color=colour)
    text_height = 5  # this needs customization based on your data

    if centre < 6:  # this needs customization based on your data
        caption = "mean: {:.2f}\n(std: {:.2f})".format(centre, x.std())
        # this needs customization based on your data
        text_x = centre + 0.08
    else:
        caption = "mean: {:.2f}\n (std: {:.2f})".format(centre, x.std())
        text_x = centre - 1.4

    plt.text(text_x, text_height, caption, **text_style)
def get_graph(n, title):
    """
    Draw a distribution histogram of squared norms for a sample of N
    points from the n-dimensional standard Normal distribution.

    ``N`` and ``get_2_std_estimates`` come from the enclosing module; the
    two values returned by ``get_2_std_estimates`` are drawn as red
    vertical lines.

    Parameters
    ----------
    n : int
        Dimension of each sampled point.
    title : str
        Figure title.
    """
    sample = np.random.normal(size=(N, n))
    dist = np.square(np.linalg.norm(sample, axis=1))
    lower_bound, upper_bound = get_2_std_estimates(dist)
    # `density` is a boolean flag: pass True rather than the string "true".
    # The return value is not needed, so the parameter `n` is no longer
    # overwritten by the histogram counts.
    plt.hist(dist, bins='auto', density=True)
    plt.axvline(x=lower_bound, color='red')
    plt.axvline(x=upper_bound, color='red')
    plt.title(title, fontdict={'fontsize': 20})
    plt.show()
def graph(x, y, xLabel, yLabel, title, figname):
    """Histogram ``x``, mark its mean and the value ``y``, and save it.

    The current figure is cleared first. A dashed line marks the mean of
    ``x``; ``y`` is drawn as a vertical segment from height 0 to 9
    labelled "model accuracy"; the p-value of ``ttest_ind(x, [y])`` is
    added to the legend through an empty, white plot entry. The figure is
    saved to ``figname``.
    """
    plt.clf()
    plt.hist(x, color="c", edgecolor="k", alpha=0.5)

    sample_mean = np.array(x).mean()
    plt.axvline(sample_mean, color="k", linestyle="dashed", linewidth=3,
                label="average")

    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.title(title)

    # Ten identical x values against heights 0..9 give a vertical segment.
    segment_heights = np.arange(0, 10, 1)
    segment_xs = np.array([y] * 10)
    plt.plot(segment_xs, segment_heights, label="model accuracy")

    p_value = ttest_ind(x, [y])[1]
    # Invisible series whose only purpose is a legend line with the p-value.
    plt.plot([], [], label=f"p-value: {np.round(p_value,4)}", color="w")

    plt.legend()
    plt.savefig(figname)
def plot_std(qstack):
    """Plot the stacked flux with a shaded band of +/- one standard deviation.

    The standard deviation is the square root of the diagonal of
    ``qstack.flux_covar``. A dashed red line is drawn at x = 1215.67.
    ``bin_size`` is read from the enclosing module and used only as the
    Axes label.
    """
    plt.figure()

    sigma = np.sqrt(np.diagonal(qstack.flux_covar))
    axis = plt.axes(None, label=str(bin_size))

    wave = qstack.wave_stack
    flux = qstack.flux_stack

    plt.plot(wave, flux, label='Stacked Flux')
    axis.fill_between(wave,
                      flux - sigma,
                      flux + sigma,
                      alpha=0.25,
                      label="1-$\\sigma$ Uncertainty Range")

    plt.xlabel("Wavelength (Angstroms)")
    plt.ylabel("Stacked Continuum Normalized Flux")
    plt.axvline(x=1215.67, color='red', linestyle='--')
    plt.legend()
def plotMultipleNumpylist(plotDict, yLabel, xLable):
    """Plot every series in ``plotDict`` as a line and show the figure.

    Parameters
    ----------
    plotDict : dict
        Maps a legend label to a sequence of values.
    yLabel, xLable : str
        Axis labels.

    A dashed maroon line is drawn at x = 120 with the caption
    " GRAMs Launching" placed at the largest value found in any series
    (0 when ``plotDict`` is empty).
    """
    # this function plots multiple lines using values from different numpy lists
    peaks = []
    for series in plotDict.values():
        plt.plot(series, linewidth=.7)
        peaks.append(max(series))
    plt.ylabel(yLabel)
    plt.xlabel(xLable)

    # max([]) would raise on an empty dict; fall back to 0.
    text_y = max(peaks) if peaks else 0

    # ymin/ymax are fractions of the Axes height (0..1), not data values.
    plt.axvline(120, ymin=0, ymax=1, linestyle='dashed', color='maroon')
    plt.text(120, text_y, "  GRAMs Launching", {'color': 'maroon',
                                                'fontsize': 10})
    plt.legend(list(plotDict), loc='upper left')
    plt.show()
    return
def Gershgorin(self):
    """Plot the Gershgorin circles of the matrix stored in ``self.x``.

    Each circle is centred at a diagonal entry (drawn on the real axis)
    with radius equal to the sum of absolute values of the other entries
    in that row. A red dashed vertical line is drawn through every centre.

    Returns ``[]`` after printing a message when ``is_square(self.x)`` is
    not true; otherwise shows the plot and returns ``None``.

    ``self.x`` is left unmodified.
    """
    if not is_square(self.x):
        print('Please enter a square matrix')
        return []

    matrix = np.array(self.x)
    # Work on a local absolute-value copy. Assigning it back to self.x
    # (as was done before) destroyed the signs of the stored matrix and
    # changed the centres on any later call.
    magnitudes = np.absolute(matrix)

    index = tuple(matrix.diagonal().tolist())
    radi = tuple((magnitudes.sum(axis=1) - magnitudes.diagonal()).tolist())
    circles = [list(pair) for pair in zip(index, radi)]

    # Pad the view by one standard deviation of the centres.
    spread = np.std(index)
    Xupper = max(index) + spread
    Xlower = min(index) - spread
    Ylimit = max(radi) + spread

    fig, ax = plt.subplots()
    ax.set_xlim((Xlower, Xupper))
    ax.set_ylim((-Ylimit, Ylimit))
    plt.xlabel('Real Axis')
    plt.ylabel('Imaginary Axis')
    plt.title('Gershgorin circles')

    for centre, radius in circles:
        ax.add_artist(plt.Circle((centre, 0), radius=radius))

    # Coordinate axes through the origin.
    ax.plot([Xlower, Xupper], [0, 0], 'k--')
    ax.plot([0, 0], [-Ylimit, Ylimit], 'k--')
    ax.yaxis.grid(True, linestyle="--")
    ax.xaxis.grid(True, linestyle="--")

    for centre in index:
        plt.axvline(x=centre, linestyle='--', color='r')  # vertical lines
    plt.show()
def plot_boot(qstack):
    """Plot the first bootstrap samples underneath the stacked flux.

    Up to ``num`` rows of ``qstack.ws_boot`` / ``qstack.fs_boot`` are drawn
    as faint orange lines, then ``qstack.flux_stack`` against
    ``qstack.wave_stack`` on top. A dashed red line is drawn at
    x = 1215.67. ``bin_size`` is read from the enclosing module and used
    only as the Axes label.
    """
    plt.figure()
    plt.axes(None, label=str(bin_size))

    # Number of bootstrap samples to draw; the slices previously
    # hard-coded 100 and left this constant unused.
    num = 100
    ws_boot = qstack.ws_boot[:num]
    fs_boot = qstack.fs_boot[:num]

    plt.plot(ws_boot.T, fs_boot.T, alpha=0.1, color='orange')
    # Re-draw one sample with a label so the legend gets a single entry
    # instead of one per bootstrap line.
    plt.plot(ws_boot[0],
             fs_boot[0],
             alpha=0.1,
             color='orange',
             label='Bootstrap Samples')
    plt.plot(qstack.wave_stack, qstack.flux_stack, label='Stacked Flux')

    plt.xlabel("Wavelength (Angstroms)")
    plt.ylabel("Stacked Continuum Normalized Flux")
    plt.axvline(x=1215.67, color='red', linestyle='--')
    plt.legend()
def PlotParticipantData():
    """Plot driving, physiological and eye-tracking data per participant.

    For every folder name in the module-level ``listoffolders`` this
    changes into ``<folder>/ClippedData/``, reads
    ``StrippediMotionsData.csv`` and ``StrippedEyeTrackingFile.csv``, and
    writes ``iMotionsDrivingData.pdf``, ``iMotionsPhysioData.pdf``,
    ``EyeTrackerData.pdf`` and (conditionally) ``PERCLOS.csv`` into that
    directory, then changes back two levels.

    This is Python 2 code (print statements).
    """
    for chosenfolder in listoffolders:
        os.chdir(chosenfolder +
                 '/ClippedData/')  # Navigate into the participant subfolder.
        # NOTE(review): the working directory is only restored at the end of
        # the loop body; an uncaught exception below leaves the process in
        # the participant folder.

        # Simulator data is currently not read:
        #simfile = open('StrippedSimData.csv','r')
        #simreader = csv.reader(simfile)
        #skiplines(simreader,1)
        #simdata = list(simreader)
        try:
            # ---- iMotions data ------------------------------------------
            # NOTE(review): imofile is never closed.
            imofile = open('StrippediMotionsData.csv', 'r')
            imoreader = csv.reader(imofile)
            skiplines(imoreader, 1)
            imodata = list(imoreader)
            # Column 0 is used as time, 2 as event marker, 3..8 as the
            # individual signals; every value is parsed as float.
            time = [float(imodata[i][0]) for i in range(len(imodata))]
            eventmarker = [float(imodata[i][2]) for i in range(len(imodata))]
            steer = [float(imodata[i][3]) for i in range(len(imodata))]
            throttle = [float(imodata[i][4]) for i in range(len(imodata))]
            brake = [float(imodata[i][5]) for i in range(len(imodata))]
            PPG = [float(imodata[i][6]) for i in range(len(imodata))]
            speed = [float(imodata[i][7]) for i in range(len(imodata))]
            GSR = [float(imodata[i][8]) for i in range(len(imodata))]
            # Locate the first occurrence of each event marker and the
            # matching times; these become the vertical lines in all plots.
            # list.index raises ValueError when a marker is missing, which
            # the bare except below turns into the "bad data" message.
            #
            # Participant number is taken from characters 1..3 of the
            # folder name. Numbers up to 61 use markers 1, 21, 5, 10.
            if int(chosenfolder[1:4]) <= 61:
                xi = [eventmarker.index(1)]
                xi.append(eventmarker.index(21))
                xi.append(eventmarker.index(5))
                xi.append(eventmarker.index(10))
                # xc is seeded with the first marker time and the loop then
                # appends the time of every index in xi, so the first
                # marker time appears twice.
                xc = [time[xi[0]]]
                for i in xi:
                    xc.append(time[i])
                print "x coordinates: ", xc, '\n'
            # Numbers from 61 upwards use markers 1, 5, 10.
            # NOTE(review): both conditions are true for participant 61, so
            # this branch overwrites the result of the one above for that
            # participant - confirm which threshold is intended.
            if int(chosenfolder[1:4]) >= 61:
                xi = [eventmarker.index(1)]
                xi.append(eventmarker.index(5))
                xi.append(eventmarker.index(10))
                xc = [time[xi[0]]]
                for i in xi:
                    xc.append(time[i])
                print "x coordinates: ", xc, '\n'
            # ---- Figure 1: steer / throttle / brake / speed -------------
            imofig1 = plt.figure(1)
            imofig1.tight_layout()
            plt.subplot(411)
            plt.title('Driving Data Plot (Steer/Throttle/Brake)')
            plt.plot(time, steer, 'r-', label='Steer')
            plt.xlabel('Time (sec)')
            plt.ylabel('Steer')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(412)
            plt.plot(time, throttle, 'b-', label='Throttle')
            plt.xlabel('Time (sec)')
            plt.ylabel('Throttle')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(413)
            plt.plot(time, brake, 'g-', label='Brake')
            plt.xlabel('Time (sec)')
            plt.ylabel('Brake')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(414)
            plt.plot(time, speed, 'b-', label='Speed')
            plt.xlabel('Time (sec)')
            plt.ylabel('Speed')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            imofig1.savefig("iMotionsDrivingData.pdf", bbox_inches='tight')
            plt.close()
            # END OF FIGURE 1
            # ---- Figure 2: PPG / GSR ------------------------------------
            # Figure number 1 is reused; the previous one was closed above.
            imofig2 = plt.figure(1)
            imofig2.tight_layout()
            plt.subplot(211)
            plt.title('Physiological Data Plot (PPG/GSR)')
            plt.plot(time, PPG, 'r-', label='PPG')
            plt.legend(loc='upper right')
            plt.xlabel('Time (sec)')
            plt.ylabel('PPG/HR')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(212)
            plt.plot(time, GSR, 'g-', label='GSR')
            plt.xlabel('Time (sec)')
            plt.ylabel('GSR')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            imofig2.savefig("iMotionsPhysioData.pdf", bbox_inches='tight')
            plt.close()
            # END OF FIGURE 2
        # NOTE(review): a bare except hides every failure (missing file,
        # short rows, non-numeric cells, missing markers, plotting errors)
        # behind the same message.
        except:
            print "Participant : ", chosenfolder, " has bad data. Please exclude from analysis."
            # The string below is disabled code kept as a no-op literal.
            '''if os.path.isfile('BadData.txt'): pass else: markerfile = open('BadData.csv','wb') markerwriter = csv.writer(markerfile) markerfile.close()'''
            pass
        # ---- Eye tracker data -------------------------------------------
        # NOTE(review): xc is only assigned in the try block above. If that
        # block failed before assigning it, the loops over xc below use the
        # value left by a previous participant or raise NameError, which
        # `except IOError` does not catch.
        try:
            # NOTE(review): etfile is never closed.
            etfile = open('StrippedEyeTrackingFile.csv', 'r')
            etreader = csv.reader(etfile)
            skiplines(etreader, 1)
            etdata = list(etreader)
            # `time` is rebound here to the eye-tracker time column.
            time = [float(etdata[i][0]) for i in range(len(etdata))]
            catbin = [etdata[i][3] for i in range(len(etdata))]
            pupdia = []
            indexbin = []  # populated in the loop below
            for i in range(len(etdata)):
                # Unparseable pupil diameters are recorded as 0.
                try:
                    pupdia.append(float(etdata[i][2]))
                except ValueError:
                    pupdia.append(0)
                # NOTE(review): indexing a short row raises IndexError, not
                # ValueError, so this handler presumably never fires.
                try:
                    indexbin.append(etdata[i][15])
                except ValueError:
                    indexbin.append('-')
            # PERCLOS is defined elsewhere; its result is indexed below as
            # rows of [value, time].
            perclos_array = PERCLOS(time, catbin)
            # The CSV is only written when the first PERCLOS value is
            # non-zero.
            if perclos_array[0][0] != 0:
                perclos_file = open('PERCLOS.csv', 'wb')
                percloswriter = csv.writer(perclos_file)
                percloswriter.writerow(['Time', 'PERCLOS'])
                percloswriter.writerows(
                    [perclos_array[i][1], perclos_array[i][0]]
                    for i in range(len(perclos_array)))
                perclos_file.close()
            # ---- Figure 3: pupil diameter / PERCLOS ---------------------
            etfig = plt.figure(1)
            etfig.tight_layout()
            plt.subplot(211)
            plt.title('Eye Tracking Data Plot (Pupil Diameter/Blinks)')
            plt.plot(time, pupdia, 'r-', label='Pupil Diameter')
            plt.xlabel('Time (sec)')
            plt.ylabel('Pupil Diameter (mm)')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            plt.subplot(212)
            plt.plot([perclos_array[i][1] for i in range(len(perclos_array))],
                     [perclos_array[i][0] for i in range(len(perclos_array))],
                     'b--',
                     label='PERCLOS')
            plt.xlabel('Time (sec)')
            plt.ylabel('PERCLOS ( 0 - 1 )')
            plt.legend(loc='upper right')
            for j in xc:
                plt.axvline(x=j, linewidth=0.25)
            etfig.savefig("EyeTrackerData.pdf", bbox_inches='tight')
            plt.close()
            # END OF FIGURE 3
        except IOError:
            print "Eye tracker data for: ", chosenfolder, "is not available to plot. This participant has an error with markers or the eye tracker data wasn't recorded."
            # The string below is disabled code kept as a no-op literal.
            '''if os.path.isfile('BadData.txt'): pass else: markerfile = open('BadData.csv','wb') markerwriter = csv.writer(markerfile) markerfile.close()'''
            pass
        os.chdir('../../')  # Navigate back to the main folder.
# Maps a z value to 0.5 * (1 + erf(z / sqrt(2))), i.e. the standard normal
# cumulative probability at z. Used to turn "k sigma" into a quantile level.
cv = lambda z: (1 / 2) * (1 + scipy.special.erf(z / np.sqrt(2)))

# Plot the density over the range between the -4 sigma and +4 sigma
# quantiles. `tq_fine`, `pdf` and `quantile` are defined outside this
# fragment.
plt.figure()
plt.plot(tq_fine, pdf)
plt.xlim(quantile(cv(-4)), quantile(cv(4)))
plt.xlabel("$\\log_{10}(t_Q)$ (years)")
plt.ylabel("Probability Density")

# Median and the +/-1 and +/-2 sigma equivalent quantiles.
tqmed = quantile(0.5)
one_sig_upper = quantile(cv(1))
one_sig_lower = quantile(cv(-1))
two_sig_upper = quantile(cv(2))
two_sig_lower = quantile(cv(-2))

# Red: 1 sigma bounds, blue: 2 sigma bounds, orange: median.
plt.axvline(x=one_sig_upper, color='red', linestyle='--')
plt.axvline(x=one_sig_lower, color='red', linestyle='--')
plt.axvline(x=two_sig_upper, color='blue', linestyle='--')
plt.axvline(x=two_sig_lower, color='blue', linestyle='--')
plt.axvline(x=tqmed, color='orange', linestyle='--')
# `stack` and `bin_size` are defined outside this fragment; the target
# directory model/hist/ must already exist.
plt.savefig("model/hist/{0}_tq_{1}_hist.pdf".format(stack, bin_size))

# Asymmetric 1 sigma widths above and below the median.
u_var = one_sig_upper - tqmed
l_var = tqmed - one_sig_lower
print("{0} stack, bin size = {1}, tq =".format(stack, bin_size), tqmed, '+',
      u_var, '-', l_var)
#tqmed = 5.9
# Interpolator over `mod_covars` along its first axis, keyed by `tqs`
# (both defined outside this fragment).
cov_interp = interpol.interp1d(tqs, mod_covars, axis=0)
def test():
    """Fit several classifiers and plot their probabilities for sample 1.

    For every dataset name in ``datasets`` the train/test files under
    ``./data/`` are read with ``utils.read_data``. Three base classifiers
    and two voting ensembles built from them are fitted, and the predicted
    probabilities of the first two classes for the first test sample are
    shown as grouped bars, one group per classifier.
    """
    # define datasets
    datasets = ['Breast']
    names = ["DecisionTree", "KNeighbors", "GaussianNB"]

    # define classifiers
    classifiers = [
        DecisionTreeClassifier(max_depth=4),
        KNeighborsClassifier(n_neighbors=3),
        GaussianNB(),
    ]
    n_base = len(classifiers)

    clfs = list(zip(names, classifiers))
    eclf_soft = VotingClassifier(estimators=clfs, voting='soft')
    # NOTE(review): this ensemble is named "hard" but is configured with
    # voting='soft'. It is left that way because predict_proba, which the
    # loop below calls on every classifier, is not available for hard
    # voting - confirm what was intended.
    eclf_hard = VotingClassifier(estimators=clfs, voting='soft')
    classifiers.extend([eclf_soft, eclf_hard])
    names.extend(["VotingSoft", "VotingHard"])
    n_voting = len(classifiers) - n_base

    # iterate over datasets
    for dataset in datasets:
        X_train, y_train = utils.read_data('./data/' + dataset + '_train.data')
        X_test, _ = utils.read_data('./data/' + dataset + '_test.data')

        # predict class probabilities with every classifier
        probas = []
        for clf in classifiers:
            clf.fit(X_train, y_train)
            probas.append(clf.predict_proba(X_test))

        # Probabilities of the first two classes for the first sample.
        # Only these two are plotted; indexing further classes here would
        # raise IndexError on data with fewer classes.
        class1_1 = [pr[0, 0] for pr in probas]
        class2_1 = [pr[0, 1] for pr in probas]

        # plotting: one group per fitted classifier, so that positions and
        # heights always have the same length (a fixed group count of 4
        # did not match the five classifiers above).
        ind = np.arange(len(classifiers))  # group positions
        width = 0.35  # bar width
        no_voting = [0] * n_voting
        no_base = [0] * n_base

        fig, ax = plt.subplots()
        # bars for the base classifiers
        p1 = ax.bar(ind, class1_1[:n_base] + no_voting, width,
                    color='green', edgecolor='k')
        p2 = ax.bar(ind + width, class2_1[:n_base] + no_voting, width,
                    color='lightgreen', edgecolor='k')
        # bars for the voting ensembles
        ax.bar(ind, no_base + class1_1[n_base:], width,
               color='blue', edgecolor='k')
        ax.bar(ind + width, no_base + class2_1[n_base:], width,
               color='steelblue', edgecolor='k')

        # plot annotations: the dashed line separates base classifiers
        # from the ensembles (2.8 for three base classifiers, as before).
        plt.axvline(n_base - 0.2, color='k', linestyle='dashed')
        ax.set_xticks(ind + width)
        # Label each group with the classifier that was actually fitted.
        ax.set_xticklabels(names, rotation=40, ha='right')
        plt.ylim([0, 1])
        plt.title('Class probabilities for sample 1 by different classifiers')
        plt.legend([p1[0], p2[0]], ['class 1', 'class 2'], loc='upper left')
        plt.tight_layout()
        plt.show()
# Lower panel of the first figure: cumulative P&L with a zero reference
# line. `fig` and `tradingResultDf` are created earlier in the script.
ax2 = fig.add_subplot(212)
ax2.set_ylabel('Cumulative P&L (USD)')
tradingResultDf['P&L'].cumsum().plot(ax=ax2)
plt.hlines(y=0, xmin=0, xmax=2000, color='b', linestyle='--')
plt.title('AAPL SVM trading on spread crossing using 10,000 events \n(Since openning of 6/21/2012)')
plt.savefig('10k_combined_spread.png', bbox_inches='tight', dpi=400)

# plot 30,000 events backtesting p&l
fig2 = plt.figure()
fig2.set_size_inches(18.5, 10.5)
ax3 = fig2.add_subplot(211)
res['P&L'].plot(ax=ax3)
# Label the new top panel. This used to target ax1, which belongs to the
# first figure that has already been saved above, so the label never
# appeared in either output.
ax3.set_ylabel('P&L (USD)')
ax4 = fig2.add_subplot(212)
ax4.set_ylabel('Cumulative P&L (USD)')
res['P&L'].cumsum().plot(ax=ax4)
# Dashed separators at events 2000 and 4000 on the cumulative panel.
xposition = [2000, 4000]
for xc in xposition:
    plt.axvline(x=xc, color='k', linestyle='--')
plt.hlines(y=0, xmin=0, xmax=6000, color='b', linestyle='--')
plt.title('AAPL SVM trading on mid-price using 30,000 events \n(Since beginning of 6/21/2012)')
plt.savefig('30k_combined.png', bbox_inches='tight', dpi=400)
# ax2.plot(buys.index, results.short_mavg.ix[buys.index],
#          '^', markersize=10, color='m')
# ax2.plot(sells.index, results.short_mavg.ix[sells.index],
#          'v', markersize=10, color='k')
# plt.legend(loc=0)
def prompt_MDP(dataname):
    """Plot BAT/XRT light-curve data from a file and return the MDP (in %).

    The file ``dataname`` is scanned for the header lines containing
    ``batSNR5flux`` / ``batSNR5gamma`` and ``xrtwtflux`` / ``xrtwtgamma``;
    the rows between each pair are parsed as whitespace-separated floats.
    Column 3 (flux) and column 4 (flux error) are converted with
    ``N_count`` and appended as two extra columns. BAT rows earlier than
    the first XRT time are merged with the XRT rows, the rows with time
    strictly between the module-level ``t_start`` and ``t_end`` are
    selected, and the MDP is computed from them as
    ``4.29 / (miu * sqrt(N_total)) * 100`` using the module-level
    ``eff``, ``area`` and ``miu``.

    The figure is saved under ``'../GRB_prompt_MDP(3)_300s/'`` and shown.

    Parameters
    ----------
    dataname : str
        Path of the input file; also used in the title and output name.

    Returns
    -------
    float
        MDP for the selected window, in percent.
    """
    # --------------- read data
    path_png = '../GRB_prompt_MDP(3)_300s/'

    # Read the file once instead of re-opening it for every section.
    with open(dataname, 'r') as f:
        lines = f.readlines()

    # `num` is the 1-based line number, so slicing from *_start begins on
    # the line after the header; *_end stops two lines before the next
    # header.
    bat_start = bat_end = xrt_start = xrt_end = 0
    for num, line in enumerate(lines, start=1):
        if 'batSNR5flux' in line:
            bat_start = num
        if 'batSNR5gamma' in line:
            bat_end = num - 2
        if 'xrtwtflux' in line:
            xrt_start = num
        if 'xrtwtgamma' in line:
            xrt_end = num - 2

    # The last field of every split row is dropped before conversion.
    # `float` replaces the np.float alias, which no longer exists in
    # current NumPy.
    bat = np.array([re.split(r'\s+', row) for row in lines[bat_start:bat_end]])
    has_bat = len(bat) != 0
    if has_bat:
        bat = bat[:, :-1].astype(float)

    xrt = np.array([re.split(r'\s+', row) for row in lines[xrt_start:xrt_end]])
    xrt = xrt[:, :-1].astype(float)

    # ---------------------- convert flux to photon counts
    def N_count(flux, index=xrt_photon_index):
        # flux * (integral of E^-index over 2..10)
        #      / (integral of E * E^-index over 0.3..10) / 1.6e-9
        # NOTE(review): 1.6e-9 is presumably the erg-per-keV factor - confirm.
        N = flux * integrate.quad(lambda E: E ** (-index), 2, 10.0)[0] / \
            integrate.quad(lambda E: E * E ** (-index), 0.3, 10)[0] / 1.6e-9
        return N

    if has_bat:
        bat_count = [N_count(v, index=bat_photon_index) for v in bat[:, 3]]
        bat_count_err = [N_count(v, index=bat_photon_index) for v in bat[:, 4]]
        bat = np.column_stack((bat, bat_count, bat_count_err))

    xrt_count = [N_count(v) for v in xrt[:, 3]]
    xrt_count_err = [N_count(v) for v in xrt[:, 4]]
    xrt = np.column_stack((xrt, xrt_count, xrt_count_err))

    # --------------------------- plotting
    fig, ax = plt.subplots()
    if has_bat:
        plt.errorbar(bat[:, 0],
                     bat[:, -2],
                     yerr=bat[:, -1],
                     xerr=bat[:, 1],
                     fmt='o',
                     label='BAT(flux to count)')
    plt.errorbar(xrt[:, 0],
                 xrt[:, -2],
                 yerr=xrt[:, -1],
                 xerr=xrt[:, 1],
                 fmt='o',
                 color='red',
                 label='XRT(flux to count)')
    plt.xlabel('Time since BAT trigger (s)')
    plt.ylabel(r'2-10 keV (Count/cm$^2$/s)')
    plt.title('Swift BAT-XRT data of %s' % dataname)
    plt.loglog()

    # ----------------------------------- merge BAT and XRT count data:
    # keep only BAT rows earlier than the first XRT point.
    xrt_x0 = xrt[0, 0]
    print(xrt_x0)
    if has_bat:
        bat = bat[bat[:, 0] < xrt_x0, :]
        # np.vstack is the non-deprecated spelling of np.row_stack.
        bat = np.vstack((bat, xrt))
    else:
        bat = xrt

    # ---------------------------- rows observed inside (t_start, t_end)
    bat2 = bat[bat[:, 0] > t_start, :]
    bat2 = bat2[bat2[:, 0] < t_end, :]
    xerr2 = bat2[:, 1]
    xerr2_ = bat2[:, 2]
    # NOTE(review): column 6 is assumed to be the appended count column
    # (index -2 was used for plotting) - confirm the input column count.
    y2 = bat2[:, 6]
    plt.axvline(t_start, label='t=%s s' % t_start, color='green')
    plt.axvline(t_end, label='t=%s s' % t_end, color='green')

    # ---------------------------- MDP of that window
    # Bin width = column 1 minus column 2 of each row.
    t2 = xerr2 - xerr2_
    N_cm2 = sum(y2 * t2)
    N_total2 = N_cm2 * eff * area
    MDP2 = 4.29 / (miu * np.sqrt(N_total2)) * 100

    fig.text(0.2,
             0.2,
             'MDP = %2.2f %%' % MDP2,
             color='red',
             fontsize=12,
             fontweight='bold')
    plt.legend(loc='upper left')
    plt.savefig(path_png + dataname + ' %2.2f%%' % MDP2 + '.png')
    plt.show()
    return MDP2
# Mark the query point X0[0]: circle at the predicted class, cross at the
# predicted probability of the second class. `X0`, `xx` and `model` are
# defined earlier in the script.
plt.scatter(X0[0], model.predict(X0[:1]), marker='o', s=300, c='r', lw=5, alpha=0.5)
# Step curve: prediction over the grid `xx`, thresholded at 0.5.
plt.plot(xx, model.predict(xx[:, np.newaxis]) > 0.5, lw=2)
plt.scatter(X0[0], model.predict_proba(X0[:1])[0][1], marker='x', s=300, c='r', lw=5, alpha=0.5)
plt.axvline(X0[0], c='r', lw=2, alpha=0.5)
plt.xlim(-3, 3)

# Lower panel: predicted probability of each class for the query point.
plt.subplot(212)
plt.bar(model.classes_, model.predict_proba(X0[:1])[0], align="center")
plt.xlim(-1, 2)
plt.gca().xaxis.grid(False)
plt.xticks(model.classes_)
plt.title("conditional probability")
plt.tight_layout()
plt.show()

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
# sklearn.cross_validation no longer exists in current scikit-learn;
# train_test_split lives in sklearn.model_selection. Fall back to the old
# location only for installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

iris = load_iris()
# Keep only feature columns 2 and 3.
X = iris.data[:, [2, 3]]
def compute(inp_dataset, input_path, output_path, de_analysis, n_pass):
    """Cluster one pass of the layout output and write its CSV and figures.

    Reads ``<input_path>/output_normalized_own_cc.csv`` (rows of
    ``name, x, y``) and tags each row ``'cell'`` when the name has 16 or
    more characters, ``'gene'`` otherwise. Clusters come from DBSCAN, or for
    ``n_pass == 4`` from the saved pass-3 results. For every row of
    ``neighbor.h5`` one extra ``'cell'`` row is added at the mean position
    of its neighbours in their majority cluster. The result is written to
    ``data.csv`` together with the cluster, embedding and (pass 4) UMAP
    figures.

    NOTE(review): this function arrived with its line structure flattened;
    the nesting below is reconstructed from statement order. In particular
    the "cell gene marker" section is placed at function level (it reads
    ``3_pass/data_openOrd.csv`` on every pass) and the UMAP section is
    placed under ``if n_pass == 4`` -- confirm against the upstream source.

    Args:
        inp_dataset: Not used by this function.
        input_path: Directory containing ``output_normalized_own_cc.csv``.
        output_path: Prefix for every output file. It is joined by plain
            string concatenation, both as ``output_path + 'data.csv'`` and
            as ``output_path + '/data.csv'``, so it presumably must end
            with a path separator -- TODO confirm with the caller.
        de_analysis: Not used by this function.
        n_pass: Pass number. ``4`` reuses pass-3 clusters instead of running
            DBSCAN; ``3`` additionally writes silhouette plots; every value
            except ``1`` loads ``de_genes_cluster.txt`` from the previous
            pass to pick marker genes.

    Returns:
        None. All results are written to disk.
    """
    print("Current pass ", n_pass)
    import csv
    import operator
    import pickle
    import random

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from scipy.spatial import ConvexHull
    from sklearn.cluster import DBSCAN
    from sklearn.neighbors import KNeighborsRegressor

    header = ['data', 'x', 'y', 'type']

    # NOTE(review): every pickle file read below is produced by earlier
    # pipeline steps; pickle must never be pointed at untrusted files.

    def _dump(obj, path):
        """Pickle obj to path, closing the file handle afterwards."""
        with open(path, 'wb') as fp:
            pickle.dump(obj, fp)

    def _column(values):
        """Return values as an (n, 1) array, the shape the KNN models use."""
        return np.array(list(values)).reshape((-1, 1))

    def _style_axes(ax, xlabel, ylabel):
        """Apply the shared despine / label / tick styling to a figure."""
        sns.despine(bottom=False, left=False)
        plt.xlabel(xlabel, fontsize=20)
        plt.ylabel(ylabel, fontsize=20)
        plt.setp(ax.spines.values(), linewidth=2)
        plt.yticks([], linewidth=20)
        plt.xticks([])

    def _save_figure(stem):
        """Save the current figure as <output_path><stem>.png and .pdf."""
        plt.savefig(output_path + stem + ".png", bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + stem + ".pdf", bbox_inches='tight',
                    dpi=600)

    def _label_clusters(frame, columns, n):
        """Write each cluster number at the mean position of its rows."""
        for cl in range(n):
            plt.annotate(cl,
                         frame.loc[frame['cluster'] == cl, columns].mean(),
                         horizontalalignment='center',
                         verticalalignment='center',
                         size=10,
                         weight='bold',
                         color="black")

    def _annotate_markers(labels, xs, ys):
        """Label marker genes with an offset text and a connector line."""
        for label, x, y in zip(labels, xs, ys):
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(-20, 20),
                         textcoords='offset points',
                         ha='right',
                         va='bottom',
                         arrowprops=dict(arrowstyle='-',
                                         connectionstyle='arc3,rad=0'))

    def _build_hulls(frame, columns, n):
        """Build a convex hull per cluster, as the original code did.

        NOTE(review): the hulls are never drawn (the drawing code was
        already commented out), but ConvexHull still raises for a cluster
        with too few or degenerate points. Kept to preserve behaviour.
        """
        for c in range(n):
            ConvexHull(frame[frame["cluster"] == c][columns].values)

    def _project_genes(frame, reference, n, n_neighbors, prefix,
                       only_known_cells):
        """Map gene coordinates into the reference embedding, per cluster.

        For each cluster that contains genes, one KNN regressor per axis is
        fitted from the cells' coordinates in ``frame`` to their coordinates
        in ``reference`` and then applied to the cluster's genes. Models and
        predicted coordinates are pickled under ``output_path``.

        Returns:
            (genes, xs, ys): gene names and their predicted coordinates, in
            matching order.
        """
        genes = []
        coords = {'x': [], 'y': []}
        known = set(reference.index)
        for i in range(n):
            clust_data = frame[frame['cluster'] == i]
            clust_cells = clust_data[clust_data['type'] == 'cell']
            clust_genes = clust_data[clust_data['type'] == 'gene']
            genes.extend(list(clust_genes.index))
            if len(clust_genes.index) == 0:
                continue
            if only_known_cells:
                kept = [cell for cell in clust_cells.index if cell in known]
                clust_cells = clust_cells.loc[kept]
            for axis in ('x', 'y'):
                model = KNeighborsRegressor(n_neighbors=n_neighbors)
                model.fit(
                    _column(clust_cells[axis]),
                    _column(reference.loc[list(clust_cells.index)][axis]))
                # Overwritten on every cluster, so the file on disk holds
                # the model of the last cluster that had genes.
                _dump(model, output_path + '/' + prefix + '_' + axis +
                      '_KNN_model.sav')
                coords[axis].extend(model.predict(_column(clust_genes[axis])))
        _dump(coords['x'], output_path + "/" + prefix + "_gene_coord_x.txt")
        _dump(coords['y'], output_path + "/" + prefix + "_gene_coord_y.txt")
        return genes, coords['x'], coords['y']

    # ------------------------------------------------------------------
    # Load the layout coordinates
    # ------------------------------------------------------------------
    print("Processing the input data into datafames....")
    filename = input_path + "/output_normalized_own_cc.csv"
    coord_data = pd.read_csv(filename, names=['data', 'x', 'y'])
    coord_data.set_index('data', inplace=True)
    data = []
    data_outlier = []
    with open(filename, 'r') as csvfile:
        for row in csv.reader(csvfile):
            # DBSCAN needs numbers; current scikit-learn refuses string
            # input rather than converting it.
            data_outlier.append([float(row[1]), float(row[2])])
            # Names of 16+ characters are treated as cell barcodes.
            row.append('cell' if len(row[0]) >= 16 else 'gene')
            data.append(row)
    csvData = list(data)

    # ------------------------------------------------------------------
    # DBSCAN (passes 1-3) or reuse of pass-3 clusters (pass 4)
    # ------------------------------------------------------------------
    if n_pass != 4:
        print("Performing clustering....")
        db = DBSCAN(eps=180, min_samples=55).fit_predict(data_outlier)
        # Label -1 is DBSCAN noise; those rows are dropped.
        csvData = [header]
        csvData.extend(row for row, label in zip(data, db) if label != -1)
        n_clusters = len(set(db)) - (1 if -1 in list(db) else 0)
        print("Clustering done. the number of obtained clusters: ",
              n_clusters)
    else:
        prev_df = pd.read_csv(
            "Stardust_results/visualization_output/3_pass/data.csv",
            delimiter=",",
            index_col=False)
        prev_df.set_index('data', inplace=True)
        # Loaded once here; the original re-read it for every gene row.
        with open(
                'Stardust_results/visualization_output/3_pass/'
                'de_genes_cluster.txt', 'rb') as fp:
            de_gene_cluster = pickle.load(fp)
        remove_data = []
        clusters_info = []
        for entry in csvData:
            name = entry[0]
            if entry[3] == 'cell':
                if name in prev_df.index:
                    clusters_info.append(prev_df.loc[name]['cluster'])
                else:
                    remove_data.append(entry)
                continue
            # Gene: use its position inside the first rank list holding it.
            for rank in range(len(de_gene_cluster)):
                if name in de_gene_cluster[rank]:
                    clusters_info.append(de_gene_cluster[rank].index(name))
                    break
            else:
                remove_data.append(entry)
        for r in remove_data:
            csvData.remove(r)
        csvData = [header] + csvData

    # ------------------------------------------------------------------
    # Outlier visualization
    # ------------------------------------------------------------------
    if n_pass != 4:
        print("Starting outlier detection....")
        data_type = []
        gene_outliers = 0
        for i in range(len(coord_data)):
            if db[i] != -1:
                data_type.append("data")
            elif len(coord_data.index[i]) >= 16:
                data_type.append("cell_outliers")
            else:
                gene_outliers += 1
                data_type.append("gene_outliers")
        coord_data["data_type"] = data_type
        data_colors = ["lightblue"]
        noise_colors = ['blue', 'red'] if gene_outliers > 0 else ['blue']
        # alpha doubles as the inlier (0.5) / outlier (1.0) selector.
        coord_data["alpha"] = np.where(coord_data['data_type'] == 'data',
                                       0.5, 1.0)
        plt.figure(figsize=(6, 4.5))
        outlier_markers = {"gene_outliers": "^", "cell_outliers": "^"}
        ax = sns.scatterplot(x="x",
                             y="y",
                             data=coord_data[coord_data['alpha'] == 0.5],
                             hue="data_type",
                             palette=sns.xkcd_palette(data_colors),
                             sizes=(50, 100),
                             size="data_type",
                             linewidth=0.0,
                             s=10,
                             alpha=0.3)
        sns.scatterplot(x="x",
                        y="y",
                        data=coord_data[coord_data['alpha'] == 1.0],
                        hue="data_type",
                        palette=sns.xkcd_palette(noise_colors),
                        sizes=(100, 50),
                        size="data_type",
                        style="data_type",
                        markers=outlier_markers,
                        alpha=1.0,
                        linewidth=0.0,
                        s=10,
                        legend='brief',
                        ax=ax)
        ax.legend(bbox_to_anchor=(1.1, 1.05), frameon=False)
        sns.despine(bottom=False, left=False)
        plt.xlabel("dim1")
        plt.ylabel("dim2")
        plt.savefig(output_path + 'outliers_visualization.png',
                    bbox_inches='tight')
        print("Outliers removed from the dataset....")

    # ------------------------------------------------------------------
    # Post-hoc cluster assignment
    # ------------------------------------------------------------------
    print("Starting post hoc clustering....")
    neighbor_df = pd.read_hdf(
        'Stardust_results/build_output/1_pass/neighbor.h5', 'df')
    if 'Unnamed: 0' in list(neighbor_df.columns):
        neighbor_df.set_index('Unnamed: 0', inplace=True)
    col = list(neighbor_df.columns)
    index = list(neighbor_df.index)
    # Row label -> column labels of its non-zero entries (its neighbours).
    cell_dict = dict()
    for i in range(len(index)):
        nonzero_cols = neighbor_df.iloc[i].to_numpy().nonzero()[0]
        for ind in nonzero_cols:
            cell_dict.setdefault(index[i], []).append(col[ind])

    # Build the label lookups once instead of list-scanning per neighbour.
    if n_pass == 4:
        known_cells = set(prev_df.index)
    else:
        db_label = {}
        for pos, name in enumerate(coord_data.index):
            # First occurrence wins, matching list.index().
            db_label.setdefault(name, db[pos])

    def _cluster_of(cell):
        """Return the cluster label of cell for the current pass."""
        if n_pass == 4:
            if cell in known_cells:
                return prev_df.loc[cell]['cluster']
            return -1
        return db_label[cell]

    cluster_assign = []
    for key_cell, neighbours in cell_dict.items():
        labels = [_cluster_of(cell) for cell in neighbours]
        votes = dict()
        for label in labels:
            votes[label] = votes.get(label, 0) + 1
        max_cluster = max(votes.items(), key=operator.itemgetter(1))[0]
        if max_cluster == -1:
            # Majority of the neighbours are noise/unknown: skip the cell.
            continue
        cluster_assign.append(max_cluster)
        x_total = 0
        y_total = 0
        count = 0
        for cell, label in zip(neighbours, labels):
            if label == max_cluster:
                count += 1
                x_total += coord_data.loc[cell]['x']
                y_total += coord_data.loc[cell]['y']
        csvData.append([key_cell, x_total / count, y_total / count, 'cell'])
    print("Post hoc clustering done....")

    # ------------------------------------------------------------------
    # Write data.csv and plot the clusters
    # ------------------------------------------------------------------
    # newline='' is what the csv module requires of files it writes to.
    with open(output_path + 'data.csv', 'w', newline='') as csvFile:
        csv.writer(csvFile).writerows(csvData)
    data_df = pd.read_csv(output_path + "data.csv",
                          delimiter=",",
                          index_col=False)
    if n_pass != 4:
        clusters_info = [x for x in db if x != -1]
    clusters_info = clusters_info + cluster_assign
    data_df['cluster'] = clusters_info
    data_df.to_csv(output_path + 'data.csv')
    print("cluster saved ....")
    n_clusters = len(data_df['cluster'].unique())
    # seaborn_colors is defined outside this function.
    colors = random.sample(seaborn_colors, n_clusters)
    plt.figure(figsize=(5, 5))
    ax = sns.scatterplot(x="x",
                         y="y",
                         data=data_df,
                         hue="cluster",
                         palette=sns.xkcd_palette(colors),
                         linewidth=0.0,
                         s=2)
    ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
    _label_clusters(data_df, ['x', 'y'], n_clusters)
    _style_axes(ax, "sd1", "sd2")
    _save_figure("cluster_visualization")

    # ------------------------------------------------------------------
    # Silhouette plot (pass 3 only)
    # ------------------------------------------------------------------
    if n_pass == 3:
        import matplotlib.cm as cm
        from sklearn.metrics import silhouette_samples, silhouette_score
        silhouette_avg = silhouette_score(data_df[['x', 'y']],
                                          data_df['cluster'])
        sample_silhouette_values = silhouette_samples(
            data_df[['x', 'y']], data_df['cluster'])
        print(silhouette_avg)
        y_lower = 10
        fig = plt.figure(figsize=(4, 7))
        for i in range(n_clusters):
            # Sorted silhouette scores of the samples in cluster i.
            ith_cluster_silhouette_values = \
                sample_silhouette_values[data_df['cluster'] == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            color = cm.nipy_spectral(float(i) / n_clusters)
            plt.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)
            # Cluster number at the vertical middle of its band.
            plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            # Leave a gap of 10 rows before the next band.
            y_lower = y_upper + 10
        plt.title("The silhouette plot for the various clusters.")
        plt.xlabel("silhouette coefficient", fontsize=20)
        plt.ylabel("Cluster label", fontsize=20)
        plt.axvline(x=silhouette_avg, color="red", linestyle="--")
        plt.yticks([])  # Clear the y-axis labels / ticks
        plt.xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        sns.despine(bottom=False, left=False)
        fig.savefig(output_path + "/silhouette.pdf",
                    bbox_inches='tight',
                    dpi=600)
        fig.savefig(output_path + "/silhouette.png",
                    bbox_inches='tight',
                    dpi=600)

    # ------------------------------------------------------------------
    # Marker finding: first ranked DE gene present in each cluster
    # ------------------------------------------------------------------
    data_df = pd.read_csv(output_path + "data.csv",
                          delimiter=",",
                          index_col=False)
    data_df.set_index('data', inplace=True)
    if n_pass == 2:
        path = 'Stardust_results/visualization_output/1_pass'
    if n_pass == 3:
        path = 'Stardust_results/visualization_output/2_pass'
    if n_pass == 4:
        path = 'Stardust_results/visualization_output/3_pass'
    if n_pass != 1:
        with open(path + '/de_genes_cluster.txt', 'rb') as fp:
            de_gene_cluster = pickle.load(fp)
        marker = []
        for cl in range(n_clusters):
            cls = data_df[data_df['cluster'] == cl]
            cluster_genes = set(cls[cls['type'] == 'gene'].index)
            found = False
            for rank in range(len(de_gene_cluster)):
                if found:
                    break
                for gene in de_gene_cluster[rank]:
                    if gene in cluster_genes:
                        marker.append(gene)
                        found = True
                        break

    # ------------------------------------------------------------------
    # Cell / gene marker embedding in the pass-3 layout
    # ------------------------------------------------------------------
    prev_pass_data = pd.read_csv(
        'Stardust_results/visualization_output/3_pass/data_openOrd.csv')
    prev_pass_data.set_index('data', inplace=True)
    data_df = pd.read_csv(output_path + '/data.csv')
    data_df.set_index('data', inplace=True)
    current_ids = set(data_df.index)
    cell_list = []
    x_coord = []
    y_coord = []
    for i, name in enumerate(prev_pass_data.index):
        if name in current_ids:
            cell_list.append(name)
            x_coord.append(prev_pass_data.iloc[i]['x'])
            y_coord.append(prev_pass_data.iloc[i]['y'])
    prev_df = pd.DataFrame(index=cell_list)
    prev_df['x'] = x_coord
    prev_df['y'] = y_coord

    genes, gene_coord_x, gene_coord_y = _project_genes(
        data_df, prev_df, n_clusters, 4, 'sd', True)

    prev_pass_data = pd.read_csv(
        'Stardust_results/visualization_output/3_pass/data_openOrd.csv')
    # alpha doubles as the gene (1.0) / cell (0.5) selector.
    prev_pass_data["alpha"] = np.where(prev_pass_data['type'] == 'gene',
                                       1.0, 0.5)
    color_gene = ["light blue"]
    color_cell = ["red"]
    plt.figure(figsize=(6, 6))
    ax = sns.scatterplot(x="x",
                         y="y",
                         data=prev_pass_data[prev_pass_data['alpha'] == 0.5],
                         hue="type",
                         palette=sns.xkcd_palette(color_gene),
                         sizes=(10, 5),
                         size="type",
                         alpha=0.3,
                         s=10)
    sns.scatterplot(x=gene_coord_x,
                    y=gene_coord_y,
                    palette=sns.xkcd_palette(color_cell),
                    sizes=(20, 5),
                    marker="^",
                    alpha=1.0,
                    ax=ax,
                    s=10)
    _build_hulls(data_df, ['x', 'y'], n_clusters)
    if n_pass != 1:
        x_list = [gene_coord_x[genes.index(m)] for m in marker]
        y_list = [gene_coord_y[genes.index(m)] for m in marker]
        _annotate_markers(marker, x_list, y_list)
    ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
    _style_axes(ax, "sd1", "sd2")
    _save_figure("sd_embedding")

    # Same layout, cells coloured by cluster.
    prev_pass_data.set_index('data', inplace=True)
    temp_data = prev_pass_data[prev_pass_data['type'] == 'cell']
    # NOTE(review): the original built a gene frame with the predicted
    # coordinates and called temp_data.append(...) without keeping the
    # result, so genes never reached this plot; DataFrame.append is also
    # gone in pandas 2.0. The dead code is removed here. If genes are meant
    # to be plotted, build the frame and combine with pd.concat([...]).
    n_clusters = len(data_df['cluster'].unique())
    colors = random.sample(seaborn_colors, n_clusters)
    plt.figure(figsize=(6, 6))
    ax = sns.scatterplot(x="x",
                         y="y",
                         data=temp_data,
                         hue="cluster",
                         palette=sns.xkcd_palette(colors),
                         s=2,
                         linewidth=0.0)
    _build_hulls(data_df, ['x', 'y'], n_clusters)
    d1 = prev_pass_data[prev_pass_data['alpha'] == 0.5]
    _label_clusters(d1, ['x', 'y'], n_clusters)
    if n_pass != 1:
        x_list = [gene_coord_x[genes.index(m)] for m in marker]
        y_list = [gene_coord_y[genes.index(m)] for m in marker]
        _annotate_markers(marker, x_list, y_list)
    ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
    _style_axes(ax, "sd1", "sd2")
    _save_figure("sd_color_embedding")

    # ------------------------------------------------------------------
    # UMAP cell / gene marker embedding (pass 4 only)
    # ------------------------------------------------------------------
    if n_pass == 4:
        with open('Stardust_results/build_output/1_pass/umap_coord.txt',
                  'rb') as fp:
            umap_coord = pickle.load(fp)
        louvain_df = pd.read_csv(
            'Stardust_results/build_output/1_pass/louvain_cluster_df.csv')
        louvain_df.set_index('Unnamed: 0', inplace=True)
        data_df = pd.read_csv(output_path + '/data.csv')
        data_df.set_index('data', inplace=True)

        # umap_coord is indexed by row position in louvain_df.
        louvain_pos = {}
        for pos, name in enumerate(louvain_df.index):
            # First occurrence wins, matching list.index().
            louvain_pos.setdefault(name, pos)

        current_ids = set(data_df.index)
        cell_list = []
        x_coord = []
        y_coord = []
        for i, name in enumerate(louvain_df.index):
            if name in current_ids:
                cell_list.append(name)
                x_coord.append(umap_coord[i][0])
                y_coord.append(umap_coord[i][1])
        umap_df = pd.DataFrame(index=cell_list)
        umap_df['x'] = x_coord
        umap_df['y'] = y_coord

        genes, gene_coord_x, gene_coord_y = _project_genes(
            data_df, umap_df, n_clusters, 5, 'scanpy', False)

        # Cells take their stored UMAP position; everything else (genes)
        # takes the KNN-predicted one.
        u_map_x = []
        u_map_y = []
        for ind in data_df.index:
            if ind in louvain_pos:
                u_map_x.append(umap_coord[louvain_pos[ind]][0])
                u_map_y.append(umap_coord[louvain_pos[ind]][1])
            else:
                u_map_x.append(gene_coord_x[genes.index(ind)])
                u_map_y.append(gene_coord_y[genes.index(ind)])
        data_df['umap_x'] = u_map_x
        data_df['umap_y'] = u_map_y
        umap_cols = ['umap_x', 'umap_y']

        plt.figure(figsize=(5, 5))
        ax = sns.scatterplot(x="umap_x",
                             y="umap_y",
                             data=data_df,
                             hue="cluster",
                             palette=sns.xkcd_palette(colors),
                             linewidth=0.0,
                             s=2)
        ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
        _label_clusters(data_df, umap_cols, n_clusters)
        _style_axes(ax, "umap1", "umap2")
        plt.savefig(output_path + 'umap_clustering.png',
                    bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + 'umap_clustering.pdf',
                    bbox_inches='tight',
                    dpi=600)

        # UMAP with cells in grey and genes as red triangles.
        data_df["alpha"] = np.where(data_df['type'] == 'gene', 1.0, 0.5)
        color_gene = ["light grey"]
        color_cell = ["red"]
        plt.figure(figsize=(6, 6))
        ax = sns.scatterplot(x="umap_x",
                             y="umap_y",
                             data=data_df[data_df['alpha'] == 0.5],
                             hue="type",
                             palette=sns.xkcd_palette(color_gene),
                             sizes=(10, 5),
                             size="type",
                             alpha=0.3,
                             s=10)
        sns.scatterplot(x="umap_x",
                        y="umap_y",
                        data=data_df[data_df['alpha'] == 1.0],
                        hue="type",
                        palette=sns.xkcd_palette(color_cell),
                        sizes=(20, 5),
                        size="type",
                        marker="^",
                        alpha=1.0,
                        ax=ax,
                        s=10)
        _build_hulls(data_df, umap_cols, n_clusters)
        x_list = [data_df.loc[m]['umap_x'] for m in marker]
        y_list = [data_df.loc[m]['umap_y'] for m in marker]
        _label_clusters(data_df, umap_cols, n_clusters)
        _annotate_markers(marker, x_list, y_list)
        ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
        _style_axes(ax, "umap1", "umap2")
        plt.savefig(output_path + 'umap_embedding.png',
                    bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + 'umap_embedding.pdf',
                    bbox_inches='tight',
                    dpi=600)

        # UMAP with cells coloured by cluster and genes as red triangles.
        data_df["alpha"] = np.where(data_df['type'] == 'gene', 1.0, 0.5)
        plt.figure(figsize=(6, 6))
        ax = sns.scatterplot(x="umap_x",
                             y="umap_y",
                             data=data_df[data_df['alpha'] == 0.5],
                             hue="cluster",
                             linewidth=0.0,
                             sizes=(2, 5),
                             size="type",
                             palette=sns.xkcd_palette(colors),
                             s=2)
        sns.scatterplot(x="umap_x",
                        y="umap_y",
                        data=data_df[data_df['alpha'] == 1.0],
                        hue="type",
                        palette=sns.xkcd_palette(color_cell),
                        linewidth=0.1,
                        marker="^",
                        ax=ax,
                        alpha=1.0,
                        s=10)
        _build_hulls(data_df, umap_cols, n_clusters)
        x_list = [data_df.loc[m]['umap_x'] for m in marker]
        y_list = [data_df.loc[m]['umap_y'] for m in marker]
        _label_clusters(data_df, umap_cols, n_clusters)
        _annotate_markers(marker, x_list, y_list)
        ax.legend(bbox_to_anchor=(1.0, 1.00), frameon=False)
        _style_axes(ax, "umap1", "umap2")
        plt.savefig(output_path + 'umap_color_embedding.png',
                    bbox_inches='tight',
                    dpi=600)
        plt.savefig(output_path + 'umap_color_embedding.pdf',
                    bbox_inches='tight',
                    dpi=600)
# Ideally rows MRI1 & MRI2 would be averaged into standard_recon, and
# MRI1_long & MRI2_long into long_recon, but the subjects don't match yet
# (to be fixed in the bash script). Until then both runs are pooled end to
# end and the missing values are dropped.
standard_recon = np.concatenate([np.array(t['MRI1']), np.array(t['MRI2'])])
standard_recon = standard_recon[np.logical_not(np.isnan(standard_recon))]

long_recon = np.concatenate(
    [np.array(t['MRI1_long']), np.array(t['MRI2_long'])])
long_recon = long_recon[np.logical_not(np.isnan(long_recon))]

# Overlaid, normalised histograms of processing time for the three
# pipelines, plus a short vertical marker at x = 2.8.
sns.set_style("white")
hist_standard = sns.distplot(standard_recon, norm_hist=True)
hist_base = sns.distplot(base, norm_hist=True)
hist_long = sns.distplot(long_recon, norm_hist=True)
# ymin / ymax are fractions of the axes height, not data values.
plt.axvline(x=2.8, ymin=0, ymax=0.17)

# Scatterplot of processing time with TIV