def acquisition_scatter(y_unk_pred, var_unk_pred, acquisition, regress_type):
    """Scatter predicted score against variance, coloured by acquisition.

    Points are coloured by ``-acquisition`` with the 'hot' colormap and the
    figure is written to ``figures/acquisition_unknown_<regress_type>.png``.

    Args:
        y_unk_pred: Predicted scores; must support ``.copy()`` and boolean-mask
            assignment (e.g. a NumPy array). Values above 10000 are clipped
            for display only; the caller's array is left untouched.
        var_unk_pred: Variance for each prediction (plotted on the y axis).
        acquisition: Acquisition value per point; negated for colouring.
        regress_type: Regressor name, used for the title and the file name.
    """
    # Basic slicing (`arr[:]`) of an ndarray returns a *view*, so the masked
    # assignment below would write through to the caller's data. Take a real
    # copy before clipping.
    y_unk_pred = y_unk_pred.copy()
    y_unk_pred[y_unk_pred > 10000] = 10000

    plt.figure()
    plt.scatter(y_unk_pred, var_unk_pred, alpha=0.5,
                c=-acquisition, cmap='hot')
    plt.title(regress_type.title())
    plt.xlabel('Predicted score')
    plt.ylabel('Variance')
    plt.savefig('figures/acquisition_unknown_{}.png'
                .format(regress_type), dpi=200)
    plt.close()
def score_scatter(y_pred, y, var_pred, regress_type, prefix=''):
    """Scatter predicted score against variance, coloured by the true score.

    The figure is written to
    ``figures/variance_vs_pred_<prefix>regressors<regress_type>.png``.

    Args:
        y_pred: Predicted scores; must support ``.copy()`` and boolean-mask
            assignment (e.g. a NumPy array). Clipped to [0, 10000] for display
            only; the caller's array is left untouched.
        y: True scores, min-max scaled to [0, 1] to colour the points.
        var_pred: Variance for each prediction (plotted on the y axis).
        regress_type: Regressor name, used in the output file name.
        prefix: Optional string inserted into the output file name.
    """
    # Basic slicing (`arr[:]`) of an ndarray returns a *view*, so clipping it
    # would silently modify the caller's predictions. Take a real copy.
    y_pred = y_pred.copy()
    y_pred[y_pred < 0] = 0
    y_pred[y_pred > 10000] = 10000

    # Min-max scale the true scores for colouring. When every score is equal
    # the range is zero; skip the division (all-zero colours) rather than
    # producing NaNs.
    y_shifted = y - y.min()
    y_range = y.max() - y.min()
    colors = y_shifted / y_range if y_range else y_shifted

    plt.figure()
    plt.scatter(y_pred, var_pred, alpha=0.3, c=colors)
    plt.viridis()
    plt.xlabel('Predicted score')
    plt.ylabel('Variance')
    plt.savefig('figures/variance_vs_pred_{}regressors{}.png'
                .format(prefix, regress_type), dpi=300)
    plt.close()
# Load-balance plot script fragment (Python 2 `print` statement; original line
# breaks and indentation are lost, and the loop that collects `loads` for each
# input `f` begins before this fragment).
# Visible steps:
#   * an input with fewer than two collected load lines is reported and skipped;
#   * each load line is converted with get_num_dict(); get_50_percent() of the
#     first entry goes to chord_loads and of the second to vserver_loads;
#   * the x value is the first number yielded by get_numbers(f);
#   * a 6.5 x 5 inch figure is set up with an EngFormatter(places=0) x axis,
#     y limits 0-0.5 and x limits 0-1,000,000, targeting "intro_lb_chord.pdf";
#   * prepare() builds the two series dicts; d1 is labelled
#     'Neighbor Replication' and drawn dashed.
# NOTE(review): presumably the first load line is the Chord run and the second
# the virtual-server run - confirm against the input file layout.
loads.append(line) if len(loads) < 2: print "coudln't find loads for " + f continue loads_d = [] for i in loads: loads_d.append(get_num_dict(i)) chord_loads.append(get_50_percent(loads_d[0])) vserver_loads.append(get_50_percent(loads_d[1])) x_values.append(next(get_numbers(f))) plt.figure().set_size_inches(6.5,5) plt.xlabel("#Nodes") plt.ylabel("% of nodes storing 50% of data") from matplotlib.ticker import EngFormatter formatter = EngFormatter(places=0) plt.gca().xaxis.set_major_formatter(formatter) plt.ylim(0,0.5) plt.xlim(0,1000000) out_file = "intro_lb_chord.pdf" d1 = prepare(x_values,chord_loads) d2 = prepare(x_values,vserver_loads) d1['label'] = 'Neighbor Replication' d1['linestyle'] = 'dashed'
# Binarise predictions and ground truth at a 0.5 threshold.
# NOTE(review): the ROC / precision-recall curves below are computed from these
# thresholded scores, so they contain a single operating point - confirm that
# this is intended rather than using the raw scores.
y_scores = np.where(y_scores > 0.5, 1, 0)
y_true = np.where(y_true > 0.5, 1, 0)

import os

output_folder = 'output/'
# os.mkdir() raises if the directory already exists, i.e. on every run after
# the first one; only create the folder when it is missing.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

# Area under the ROC curve
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
AUC_ROC = roc_auc_score(y_true, y_scores)
print("\nArea under the ROC curve: " + str(AUC_ROC))
# Keep the figure handle under its own name: binding it to `roc_curve` would
# overwrite the roc_curve() function called just above.
roc_fig = plt.figure()
plt.plot(fpr, tpr, '-', label='Area Under the Curve (AUC = %0.4f)' % AUC_ROC)
plt.title('ROC curve')
plt.xlabel("FPR (False Positive Rate)")
plt.ylabel("TPR (True Positive Rate)")
plt.legend(loc="lower right")
plt.savefig(output_folder + "ROC.png")

# Precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
# Reverse both arrays before integrating precision over recall.
precision = precision[::-1]
recall = recall[::-1]
AUC_prec_rec = np.trapz(precision, recall)
print("\nArea under Precision-Recall curve: " + str(AUC_prec_rec))
prec_rec_curve = plt.figure()
plt.plot(recall, precision, '-',
         label='Area Under the Curve (AUC = %0.4f)' % AUC_prec_rec)
plt.title('Precision - Recall curve')
# Load the named datasets and run correct() on them.
datasets, genes_list, n_cells = load_names(data_names)
# Earlier pipeline, left disabled:
#datasets, genes = merge_datasets(datasets, genes_list)
#datasets_dimred, genes = process_data(datasets, genes, hvg=hvg)
datasets, genes = correct(datasets, genes_list)

# Stack all datasets and clamp negative values to zero.
X = np.concatenate(datasets)
X[X < 0] = 0

# Whitespace-separated labels; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open('data/cell_labels/pancreas_cluster.txt') as labels_file:
    cell_labels = labels_file.read().rstrip().split()

er_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta_er']
beta_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta']

# Resolve both gene columns up front so a missing gene fails before any
# figure is written. list(genes) is built once rather than per lookup.
gene_names = list(genes)
gadd_idx = gene_names.index('GADD45A')
herp_idx = gene_names.index('HERPUD1')

# One box plot per gene comparing the 'beta_er' and 'beta' groups; the title
# carries the p-value of an independent t-test between the two groups.
for gene, gene_idx in (('GADD45A', gadd_idx), ('HERPUD1', herp_idx)):
    er_expr = X[er_idx, gene_idx]
    beta_expr = X[beta_idx, gene_idx]

    plt.figure()
    plt.boxplot([er_expr, beta_expr], showmeans=True)
    plt.title('{} (p < {})'.format(gene, ttest_ind(er_expr, beta_expr)[1]))
    plt.xticks([1, 2], ['beta_er', 'beta'])
    plt.ylabel('Scaled gene expression')
    plt.savefig('er_stress_{}.svg'.format(gene))
# Replication-maintenance-cost plot script fragment (original line breaks and
# indentation are lost; `f` and `count` are defined before this fragment).
# Visible steps:
#   * for every line of <f>/control.messagecost.weighted, the last number
#     yielded by get_numbers(line) is added to `count`;
#   * `nsize` is the first number yielded by get_numbers(f);
#   * count/nsize is appended to chord_values (x in x_values1) when "chord"
#     occurs in `f`, otherwise to best_values (x in x_values2);
#   * a 6.5 x 5 inch figure is set up with an EngFormatter(places=0) x axis,
#     a log-scaled y axis and x limits 0-1,000,000, targeting
#     "intro_rep_ma.pdf";
#   * prepare() builds the two series dicts; d1 is labelled
#     'Neighbor Replication' and drawn dashed.
# NOTE(review): if this runs under Python 2 and both operands are ints,
# count/nsize truncates - confirm the numeric types get_numbers() yields.
# NOTE(review): presumably `count` is reset for each `f` in an enclosing loop
# that is not visible here - verify.
for line in fileinput.input(f+"/control.messagecost.weighted"): ns = get_numbers(line) count += list(ns)[-1] nsize = next(get_numbers(f)) if "chord" in f: chord_values.append(count/nsize) x_values1.append(nsize) else: best_values.append(count/nsize) x_values2.append(nsize) plt.figure().set_size_inches(6.5,5) plt.xlabel("#Nodes") plt.ylabel("Per-node Replication Maintenance Cost (KB)") from matplotlib.ticker import EngFormatter formatter = EngFormatter(places=0) plt.gca().xaxis.set_major_formatter(formatter) plt.yscale('log') plt.xlim(0,1000000) out_file = "intro_rep_ma.pdf" d1 = prepare(x_values1,chord_values) d2 = prepare(x_values2,best_values) d1['label'] = 'Neighbor Replication' d1['linestyle'] = 'dashed'
if __name__ == '__main__':
    # Compare per-sample silhouette coefficients of three integration methods
    # on a random subsample of cells, then test and plot the distributions.

    # Whitespace-separated labels; the context manager closes the file handle.
    with open('data/cell_labels/all.txt') as labels_file:
        labels = np.array(labels_file.read().rstrip().split())

    embedding_files = [
        'data/panorama_embedding.txt',  # Scanorama
        'data/mnn_embedding.txt',       # scran MNN
        'data/cca_embedding.txt',       # Seurat CCA
    ]
    max_cells = 20000

    silhouettes = []
    for embedding_file in embedding_files:
        X = np.loadtxt(embedding_file)
        # Sampling without replacement raises when more items are requested
        # than exist, so cap the sample at the number of rows available.
        n_sample = min(max_cells, X.shape[0])
        idx = np.random.choice(X.shape[0], size=n_sample, replace=False)
        # NOTE(review): assumes labels has one entry per row of X - confirm.
        scores = sil(X[idx, :], labels[idx])
        print(np.median(scores))
        silhouettes.append(scores)

    sil_pan, sil_mnn, sil_cca = silhouettes

    # Scanorama against each of the other two methods.
    print(ttest_ind(sil_pan, sil_mnn))
    print(ttest_ind(sil_pan, sil_cca))

    plt.figure()
    plt.boxplot([sil_pan, sil_mnn, sil_cca], showmeans=True)
    plt.title('Distributions of Silhouette Coefficients')
    plt.xticks([1, 2, 3], ['Scanorama', 'scran MNN', 'Seurat CCA'])
    plt.ylabel('Silhouette Coefficient')
    plt.savefig('silhouette.svg')
# Normalise each dataset along axis 1 before embedding.
normalized = []
for ds in datasets:
    normalized.append(normalize(ds, axis=1))
datasets = normalized

tsne = TSNE(n_iter=400, perplexity=100, verbose=2, random_state=69)

# Joint embedding of datasets 1..N (dataset 0 is left out) before assembly.
stacked_data = np.concatenate(datasets[1:])
stacked_clusters = np.concatenate(clusters[1:])
tsne.fit(stacked_data)
plot_clusters(tsne.embedding_, stacked_clusters, s=500)
plt.title('Uncorrected data')
plt.savefig('simulation_uncorrected.svg')

# Assemble datasets.
assembled = assemble(datasets[1:], verbose=1, sigma=1, knn=10, approx=True)

# Individual embeddings: (dataset index, figure title, output file).
per_dataset_figures = (
    (1, 'Dataset 1', 'simulation_ds1.svg'),
    (2, 'Dataset 2', 'simulation_ds2.svg'),
    (3, 'Dataset 3', 'simulation_ds3.svg'),
)
for ds_index, ds_title, ds_figure in per_dataset_figures:
    tsne.fit(datasets[ds_index])
    plot_clusters(tsne.embedding_, clusters[ds_index], s=500)
    plt.title(ds_title)
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.savefig(ds_figure)
# Monitoring plot script fragment (Python 2 `print` statement; original line
# breaks and indentation are lost, and the loop/condition that this `print`
# and `continue` belong to begin before this fragment).
# Visible steps:
#   * each entry of `monitor` is converted with get_num_dict();
#   * the first entry is expanded with expand_num_dict() and its mean is
#     appended to chord_monitor; the second entry likewise feeds
#     vserver_monitor;
#   * the x value is the first number yielded by get_numbers(f);
#   * a 6.5 x 5 inch figure is set up with an EngFormatter(places=0) x axis
#     and x limits 0-1,000,000, targeting "intro_mon_chord.pdf";
#   * prepare() builds the two series dicts; d1 is labelled
#     'Neighbor Replication' (dashed) and d2 "Virtual Servers".
# NOTE(review): the skip message mentions "loads" although this section works
# on `monitor` data - looks copied from a sibling script; confirm the wording.
# NOTE(review): the mean divides by len(t_d); an empty expansion would raise
# ZeroDivisionError - confirm expand_num_dict() never returns an empty result.
print "coudln't find loads for " + f continue monitor_d = [] for i in monitor: monitor_d.append(get_num_dict(i)) t_d = expand_num_dict(monitor_d[0]) chord_monitor.append(float(sum(t_d))/len(t_d)) t_d = expand_num_dict(monitor_d[1]) vserver_monitor.append(float(sum(t_d))/len(t_d)) x_values.append(next(get_numbers(f))) plt.figure().set_size_inches(6.5,5) plt.xlabel("#Nodes") plt.ylabel("#Monitored Nodes") from matplotlib.ticker import EngFormatter formatter = EngFormatter(places=0) plt.gca().xaxis.set_major_formatter(formatter) plt.xlim(0,1000000) out_file = "intro_mon_chord.pdf" d1 = prepare(x_values,chord_monitor) d2 = prepare(x_values,vserver_monitor) d1['label'] = 'Neighbor Replication' d1['linestyle'] = 'dashed' d2['label'] = "Virtual Servers"