def test_pearsonr(self): # Tests some computations of Pearson's r x = ma.arange(10) with warnings.catch_warnings(): # The tests in this context are edge cases, with perfect # correlation or anticorrelation, or totally masked data. # None of these should trigger a RuntimeWarning. warnings.simplefilter("error", RuntimeWarning) assert_almost_equal(mstats.pearsonr(x, x)[0], 1.0) assert_almost_equal(mstats.pearsonr(x, x[::-1])[0], -1.0) x = ma.array(x, mask=True) pr = mstats.pearsonr(x, x) assert_(pr[0] is masked) assert_(pr[1] is masked) x1 = ma.array([-1.0, 0.0, 1.0]) y1 = ma.array([0, 0, 3]) r, p = mstats.pearsonr(x1, y1) assert_almost_equal(r, np.sqrt(3)/2) assert_almost_equal(p, 1.0/3) # (x2, y2) have the same unmasked data as (x1, y1). mask = [False, False, False, True] x2 = ma.array([-1.0, 0.0, 1.0, 99.0], mask=mask) y2 = ma.array([0, 0, 3, -1], mask=mask) r, p = mstats.pearsonr(x2, y2) assert_almost_equal(r, np.sqrt(3)/2) assert_almost_equal(p, 1.0/3)
def R2(obs, mod, axis=None):
    """Coefficient of Determination (unit squared).

    The value is the square of Pearson's r between ``obs`` and ``mod`` as
    computed by ``scipy.stats.mstats.pearsonr``.  With ``axis=None`` the two
    inputs are correlated as a whole; otherwise the per-slice computation is
    handed to ``apply_along_axis_2v`` together with ``axis``.
    """
    from scipy.stats.mstats import pearsonr

    def _squared_r(a, b):
        return pearsonr(a, b)[0]**2

    if axis is not None:
        return apply_along_axis_2v(_squared_r, axis, obs, mod)
    return _squared_r(obs, mod)
def test_pearsonr(self): "Tests some computations of Pearson's r" x = ma.arange(10) assert_almost_equal(mstats.pearsonr(x,x)[0], 1.0) assert_almost_equal(mstats.pearsonr(x,x[::-1])[0], -1.0) # x = ma.array(x, mask=True) pr = mstats.pearsonr(x,x) assert(pr[0] is masked) assert(pr[1] is masked)
def test_pearsonr(self): "Tests some computations of Pearson's r" x = ma.arange(10) assert_almost_equal(mstats.pearsonr(x, x)[0], 1.0) assert_almost_equal(mstats.pearsonr(x, x[::-1])[0], -1.0) # x = ma.array(x, mask=True) pr = mstats.pearsonr(x, x) assert (pr[0] is masked) assert (pr[1] is masked)
def test_pearsonr(self): "Tests some computations of Pearson's r" x = ma.arange(10) olderr = np.seterr(all='ignore') try: assert_almost_equal(mstats.pearsonr(x,x)[0], 1.0) assert_almost_equal(mstats.pearsonr(x,x[::-1])[0], -1.0) x = ma.array(x, mask=True) pr = mstats.pearsonr(x,x) finally: np.seterr(**olderr) assert_(pr[0] is masked) assert_(pr[1] is masked)
def performance_indicators(y, y_true, modelname, verbose=False, plot_scatter=False):
    """Compute summary accuracy scores for predictions ``y`` against ``y_true``.

    Parameters
    ----------
    y : array-like
        Predicted values.
    y_true : array-like
        Reference values.  When ``plot_scatter`` is set, both inputs must
        support ``.reshape(-1)``.
    modelname : str
        Label used in the verbose report.
    verbose : bool
        Print the four scores when true.
    plot_scatter : bool
        Draw a seaborn joint scatter plot of predictions against true values,
        with a red identity line, when true.

    Returns
    -------
    dict
        Keys ``"rmse"``, ``"r2"``, ``"pearson"`` and ``"spearman"``.
    """
    # calculate different accuracy scores
    # NOTE(review): `r2` is defined elsewhere; its argument order is taken
    # unchanged from the original call -- confirm it expects (y, y_true).
    r2_score = r2(y, y_true)
    spearman_corr = spearmanr(y, y_true)[0]
    rms_error = np.sqrt(mean_squared_error(y, y_true))
    pearson_corr = pearsonr(y, y_true)[0]

    if verbose:
        print(f"prediction accuracy for {modelname}")
        print(f"R^2 score: \t {r2_score}")
        print(f"RMS error: \t {rms_error}")
        print(f"Pearson: \t {pearson_corr}")
        print(f"Spearman: \t {spearman_corr}")

    if plot_scatter:
        data = pd.DataFrame({'true_values': y_true.reshape(-1),
                             'predictions': y.reshape(-1)})
        # Both axes share the range of the true values so that the identity
        # line runs corner to corner.
        lower, upper = min(y_true), max(y_true)
        # x/y are passed by keyword: current seaborn releases no longer
        # accept them positionally, while older releases accept both forms.
        joint_grid = sns.jointplot(x="true_values", y="predictions",
                                   data=data, kind="scatter",
                                   xlim=(lower, upper), ylim=(lower, upper),
                                   height=7)
        joint_grid.ax_joint.plot([lower, upper], [lower, upper], 'r')

    summary_dict = {"rmse": rms_error,
                    "r2": r2_score,
                    "pearson": pearson_corr,
                    "spearman": spearman_corr}
    return summary_dict
def compute(self, x, y):
    """Return Pearson's r and its p-value for two equally sized inputs.

    The result is a dict with keys ``'PEARSON'`` (the coefficient as returned
    by ``mstats.pearsonr``) and ``'PEARSON_PV'`` (the p-value converted to a
    plain float).
    """
    assert np.size(x) == np.size(y)
    coefficient, p_value = mstats.pearsonr(x, y)
    try:
        p_as_float = float(p_value)
    except ValueError:
        # Direct conversion failed: fall back on the first element of the
        # object's underlying ``data`` buffer.
        p_as_float = float(p_value.data[0])
    return {'PEARSON': coefficient, 'PEARSON_PV': p_as_float}
def all_pairs_pearson(M):
    """Compute Pearson's r for every pair of rows of ``M``.

    Returns a ``len(M) x len(M)`` array in which only the strict upper
    triangle (``i < j``) is filled; the diagonal and the lower triangle stay
    zero.

    NOTE(review): the original note says this "should return a squareform
    matrix" and that it is about 15% faster than "correlate all", which
    calculates twice the extra values -- neither claim can be checked from
    this function alone.
    """
    n_rows = len(M)
    C = np.zeros((n_rows, n_rows))
    # `range` instead of the Python-2-only `xrange`, so the function runs
    # unchanged on both interpreter lines.
    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            C[i, j] = mstats.pearsonr(M[i], M[j])[0]
    return C
def getMetrics(self):
    """Compute the comparison metrics for ``self.segment`` vs ``self.mask``.

    Each value is stored in ``self.metrics`` and its 3-decimal rounding is
    written to the matching ``LB_*`` label via ``setText``.  ``hd``, ``dc``
    and ``jc`` are defined outside this block (presumably Hausdorff distance,
    Dice and Jaccard coefficients, going by the keys -- confirm).
    """
    # Hausdorff
    self.metrics['Hausdorff'] = hd(self.segment, self.mask)
    self.LB_HausdorffValue.setText(
        str(round(self.metrics['Hausdorff'], 3)) + " Pixels")

    # Dice
    self.metrics['Dice'] = 100 * dc(self.segment, self.mask)
    self.LB_DiceValue.setText(str(round(self.metrics['Dice'], 3)) + " %")

    # Jaccard
    self.metrics['Jaccard'] = 100 * jc(self.segment, self.mask)
    self.LB_JaccardValue.setText(
        str(round(self.metrics['Jaccard'], 3)) + " %")

    # The correlation is computed once on the flattened images; the same
    # result supplies both the p-value and the coefficient.
    pearson_result = pearsonr(self.segment.ravel(), self.mask.ravel())

    # P
    self.metrics['P_Value'] = pearson_result[1]
    self.LB_P_Value.setText(str(round(self.metrics['P_Value'], 3)))

    # Pearson Correlation Coefficient
    self.metrics['Pearson'] = pearson_result[0]
    self.LB_PearsonValue.setText(str(round(self.metrics['Pearson'], 3)))
def compute(self, x, y):
    """Correlate ``x`` with ``y`` and report the coefficient and p-value.

    Both inputs must hold the same number of elements.  Returns a dict with
    ``'PEARSON'`` (coefficient from ``mstats.pearsonr``) and ``'PEARSON_PV'``
    (p-value as a built-in float).
    """
    assert np.size(x) == np.size(y)
    outcome = mstats.pearsonr(x, y)
    r, pv = outcome
    try:
        n_pv = float(pv)
    except ValueError:
        # Not directly convertible: use the first entry of its data buffer.
        n_pv = float(pv.data[0])
    return dict(PEARSON=r, PEARSON_PV=n_pv)
def calc_correlation(target_array, reference_array):
    '''Calculate the correlation coefficient between two arrays.

    Both arrays are flattened and passed to ``mstats.pearsonr`` (reference
    first, target second); only the coefficient is returned.

    :param target_array: an array to be evaluated, as model output
    :type target_array: :class:'numpy.ma.core.MaskedArray'
    :param reference_array: an array of reference dataset
    :type reference_array: :class:'numpy.ma.core.MaskedArray'
    :returns: pearson's correlation coefficient between the two input arrays
    '''
    flat_reference = reference_array.flatten()
    flat_target = target_array.flatten()
    outcome = mstats.pearsonr(flat_reference, flat_target)
    return outcome[0]
def calc_correlation(target_array, reference_array):
    """Return Pearson's r between a target and a reference array.

    The inputs are flattened before being correlated with
    ``mstats.pearsonr``; the p-value it also produces is discarded.

    :param target_array: an array to be evaluated, as model output
    :type target_array: :class:'numpy.ma.core.MaskedArray'
    :param reference_array: an array of reference dataset
    :type reference_array: :class:'numpy.ma.core.MaskedArray'
    :returns: pearson's correlation coefficient between the two input arrays
    """
    flattened = (reference_array.flatten(), target_array.flatten())
    return mstats.pearsonr(*flattened)[0]
def pearson_correlation(target: np.ndarray, source: np.ndarray, map: np.ndarray) -> float:
    """
    Compute pearson correlation index after alignment

    The source image is padded to the target's shape, warped with the
    inverse of ``map`` and scaled by 255; pixels that the warped footprint of
    the source does not cover are masked out in both images before they are
    flattened and handed to ``pearsonr``.

    Parameters
    ----------
    target: np.array
        target image in gray scale
    source: np.array
        source image in gray scale; must fit inside ``target``
    map:
        computed transformation; only its ``inverse`` attribute is used
        (passed to ``transform.warp`` as ``inverse_map``)

    Returns
    -------
    corr_coef:
        The value returned by ``pearsonr`` for the two masked, flattened
        images.  NOTE(review): the signature says ``float``, but the result
        of ``pearsonr`` is returned unmodified -- confirm which ``pearsonr``
        is imported and whether it honours the masks.

    Example
    -------
    >>> from skimage import transform                         # doctest: +SKIP
    >>> source = np.ones((1000,1000))                         # doctest: +SKIP
    >>> target = np.ones((1000,1000))                         # doctest: +SKIP
    >>> source_points = np.array([[1.0,1.0],[500,500],[700,500]])  # doctest: +SKIP
    >>> target_points = source_points                         # doctest: +SKIP
    >>> M = transform.estimate_transform("affine",source_points,target_points)  # doctest: +SKIP
    >>> pearson_correlation(target, source, M)                # doctest: +SKIP
    """
    rows, cols = source.shape[0], source.shape[1]

    # Footprint of the source inside a target-sized canvas.
    mask = np.zeros_like(target)
    mask[0:rows, 0:cols] = 1
    source_extended = np.zeros_like(target)
    source_extended[0:rows, 0:cols] = source

    # The `np.bool` alias no longer exists in current NumPy; the builtin
    # `bool` names the same dtype on every NumPy version.
    binary_mask = transform.warp(mask, inverse_map=map.inverse).astype(bool)
    source_warped = transform.warp(source_extended, inverse_map=map.inverse) * 255

    masked_source = np.ma.array(data=source_warped, mask=np.logical_not(binary_mask))
    masked_target = np.ma.array(data=target, mask=np.logical_not(binary_mask))
    corr_coef = pearsonr(masked_source.flatten(), masked_target.flatten())
    print("image:", corr_coef)
    # masked correlation only compare visible parts
    return corr_coef
def plot_matrix(aa_list, par_child_mat, par_gen_mat):
    """Show the parent-child and parent-gen transition matrices as heat maps.

    Prints the ``pearsonr`` result for the two matrices, then draws them in a
    two-row figure with ``aa_list`` as tick labels on both axes, a shared
    colour bar taken from the first panel and a title that reads the
    module-level ``clade_parent`` and ``clade_children``.
    """
    correlation = pearsonr(par_child_mat, par_gen_mat)
    print(
        "Pearson correlation between true par-child mut and true par-gen mut: {}"
        .format(str(correlation)))

    # generate plots
    plt.rcParams.update({'font.size': 14})
    fig, axs = plt.subplots(2)
    tick_positions = list(np.arange(0, len(aa_list)))

    panels = (
        (par_child_mat, "(A) Parent-child AA transition frequency"),
        (par_gen_mat, "(B) Parent-gen AA transition frequency"),
    )
    images = []
    for axis, (matrix, title) in zip(axs, panels):
        images.append(axis.imshow(matrix, cmap="Blues", interpolation="none",
                                  aspect='auto'))
        axis.set_title(title)
        axis.set_ylabel("From")
        axis.set_xlabel("To")
        axis.set_xticks(tick_positions)
        axis.set_xticklabels(aa_list, rotation='horizontal')
        axis.set_yticks(tick_positions)
        axis.set_yticklabels(aa_list, rotation='horizontal')

    # One colour bar for the whole figure, driven by the first panel.
    colorbar_axis = fig.add_axes([0.92, 0.15, 0.03, 0.7])
    fig.colorbar(images[0], cax=colorbar_axis)
    plt.suptitle(
        "AA transition frequency in true and generated datasets. Parent: {}, children: {}. Pearson correlation of A & B: {}"
        .format(clade_parent, ",".join(clade_children),
                str(np.round(correlation[0], 2))))
    plt.show()
def cross_vect_score(vect_a, vect_b, scoring='euclidean', inv_noise_cov=None):
    """
    Use the scoring function to compute a value between two vectors

    Parameters
    ----------
    vect_a, vect_b: vector
        Data vectors.
    scoring:
        Scoring function in euclidean / mahalanobis / crossnobis /
        spearmanr / spearmanr_dist / pearsonr.
        If "spearmanr_dist", return 1 - spearmanr correlation.
    inv_noise_cov: 2D array
        Inverse of the noise covariance matrix needed for mahalanobis and
        crossnobis scorings.

    Returns
    -------
    score: float
        Score value.

    Raises
    ------
    NotImplementedError
        For the "crossnobis" scoring, which is not available yet.
    ValueError
        For any other unrecognised scoring name.
    """
    if scoring == 'euclidean':
        score = euclidean(vect_a, vect_b)
    elif scoring == "mahalanobis":
        score = mahalanobis(vect_a, vect_b, inv_noise_cov)
    elif scoring == "crossnobis":
        # `NotImplemented` is a comparison sentinel, not an exception class;
        # calling it fails with a TypeError instead of the intended error.
        raise NotImplementedError("Cross validated Mahalanobis distance is "
                                  "not yet available")
    elif scoring in ("spearmanr", "spearmanr_dist"):
        # Warning: ranking takes time, it's faster to input ranked vectors and
        # use pearsonr distance when doing multiple test on same vectors
        score, _ = spearmanr(vect_a, vect_b)
    elif scoring == "pearsonr":
        score, _ = pearsonr(vect_a, vect_b)
    else:
        raise ValueError("Unknown scoring function")

    # "*_dist" variants turn the correlation into a distance.
    if scoring.endswith("_dist"):
        return 1 - score
    return score
def correlacaoPearson(vetA, vetB):
    """Return Pearson's r between two users' rating vectors.

    A rating equal to ``"?"`` marks an item the user has not rated.  The
    Pearson correlation must compare only items rated by both users, so every
    position where either vector holds ``"?"`` is dropped from both vectors
    before the remaining ratings are converted to ``int`` and correlated.

    Parameters
    ----------
    vetA, vetB : sequence
        Rating vectors of equal length.

    Returns
    -------
    The correlation coefficient (first element of the ``pearsonr`` result).
    """
    usuarioA = np.asarray(vetA)
    usuarioB = np.asarray(vetB)

    # Element-wise comparison, one flag per position in each vector.
    naoAvaliadoA = np.array([nota == "?" for nota in vetA], dtype=bool)
    naoAvaliadoB = np.array([nota == "?" for nota in vetB], dtype=bool)

    # A single keep-mask removes each unrated position exactly once, even
    # when both users left the same item unrated; deleting by a list of
    # indices would hit such a position twice and discard a valid rating.
    manter = ~(naoAvaliadoA | naoAvaliadoB)

    usuarioA = usuarioA[manter].astype(int)
    usuarioB = usuarioB[manter].astype(int)

    # hand both filtered vectors to the library's pearsonr
    return pearsonr(usuarioA, usuarioB)[0]
def LyungBoxTest(ts, tested_lag, significance=0.95):
    """Print the outcome of a Ljung-Box style autocorrelation test.

    ts: a time series exposing ``mean()``, ``shape`` and ``shift(i)``
        (e.g. a pandas Series).
    tested_lag: is the lag being tested, but must be an int.
    significance: probability passed to the chi-squared ``ppf`` to obtain
        the threshold.

    The statistic is ``Q = n * (n + 2) * sum(r_i**2 / (n - i))`` for
    ``i = 1 .. tested_lag``, where ``r_i`` is Pearson's r between the
    mean-centred series and itself shifted by ``i`` (the NaNs introduced by
    the shift are masked).  ``Q`` is compared with the chi-squared quantile
    for ``tested_lag`` degrees of freedom and the verdict is printed;
    nothing is returned.
    """
    tested_lag = int(tested_lag)
    f_ts = ts - ts.mean()
    n = f_ts.shape[0]
    Q = 0
    for i in range(1, tested_lag + 1):
        lagged_f_ts = f_ts.shift(i)
        # Shifting leaves NaNs at the start; mask them out of the correlation.
        m_f_ts = ma.masked_array(lagged_f_ts, mask=np.isnan(lagged_f_ts))
        Q += mstats.pearsonr(f_ts, m_f_ts)[0] ** 2 / (n - i)
    Q = Q * n * (n + 2)
    t = stats.chi2(tested_lag).ppf(significance)
    # Single-argument print(...) gives identical output on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.
    if Q < t:
        print("%d | Not enough evidence to reject Null: Q = %.4f < %.4f" % (tested_lag, Q, t))
    else:
        print("%d | Reject Null: Q = %.4f > %.4f" % (tested_lag, Q, t))
def LyungBoxTest(ts, tested_lag, significance=0.95):
    """Run a Ljung-Box style test on ``ts`` and print the verdict.

    ts: a time series exposing ``mean()``, ``shape`` and ``shift(i)``
        (e.g. a pandas Series).
    tested_lag: is the lag being tested, but must be an int.
    significance: probability at which the chi-squared quantile (with
        ``tested_lag`` degrees of freedom) is evaluated.

    For each lag ``i`` up to ``tested_lag`` the squared Pearson correlation
    between the mean-centred series and its ``i``-shifted copy is divided by
    ``n - i``; the sum is scaled by ``n * (n + 2)`` to give ``Q``.  The
    function only prints; it returns ``None``.
    """
    tested_lag = int(tested_lag)
    f_ts = ts - ts.mean()
    n = f_ts.shape[0]
    Q = 0
    for i in range(1, tested_lag + 1):
        lagged_f_ts = f_ts.shift(i)
        # The shift pads with NaN; those samples are masked so that only
        # overlapping observations enter the correlation.
        m_f_ts = ma.masked_array(lagged_f_ts, mask=np.isnan(lagged_f_ts))
        Q += mstats.pearsonr(f_ts, m_f_ts)[0]**2 / (n - i)
    Q = Q * n * (n + 2)
    t = stats.chi2(tested_lag).ppf(significance)
    # print(...) with one pre-formatted argument behaves the same on
    # Python 2 and Python 3; the statement form only parses on Python 2.
    if Q < t:
        print("%d | Not enough evidence to reject Null: Q = %.4f < %.4f" % (
            tested_lag, Q, t))
    else:
        print("%d | Reject Null: Q = %.4f > %.4f" % (tested_lag, Q, t))
def compute(self, x, y, i):
    """Store Pearson's r and its p-value for ``x`` vs ``y`` at slot ``i``.

    ``x`` and ``y`` must have the same size and ``i`` must be non-negative.
    The coefficient goes to ``self.Matrices["PEARSON"][i]`` and the p-value
    to ``self.Matrices["PEARSON_PV"][i]``.
    """
    assert np.size(x) == np.size(y) and i >= 0
    coefficient, p_value = mstats.pearsonr(x, y)
    self.Matrices["PEARSON"][i] = coefficient
    self.Matrices["PEARSON_PV"][i] = p_value
# all_comp_gppl = [] # Do diffs correlate with sum(- worse_item_rank + better_item_rank)? for idx in range(len(diffs)): #print('Item: %i' % ids[idx]) #print('Diff: %f; BWS rank=%i, GPPL rank=%i' % (diffs[idx], rank_bws[idx], rank_gppl[idx])) otherids = pairs[pairs[:, 0] == ids[idx], 1] otheridxs = [ np.argwhere(ids == otherid).flatten()[0] for otherid in otherids ] tot_rank_gppl = 0 for otheridx in otheridxs: tot_rank_gppl -= rank_gppl[otheridx] otherids = pairs[pairs[:, 1] == ids[idx], 0] otheridxs = [ np.argwhere(ids == otherid).flatten()[0] for otherid in otherids ] for otheridx in otheridxs: tot_rank_gppl += rank_gppl[otheridx] #print('Total rank differences: BWS=%i, GPPL=%i' % (tot_rank_gppl, tot_rank_bws)) all_comp_gppl.append(tot_rank_gppl) print('Correlation between rank diff and total ranks of compared items: %f' % spearmanr(all_comp_gppl, diffs)[0]) print(pearsonr(all_comp_gppl, diffs))
def pears_corr(X, Y):
    """Return the full ``mstats.pearsonr`` result for ``X`` and ``Y``."""
    outcome = mstats.pearsonr(X, Y)
    return outcome
#get the log prob scores model = kenlm.LanguageModel(os.path.join(args.model_dir, "model.klm")) logprobs = [] mean_logprobs = [] norm_logprobs = [] slors = [] for s in test_sentences: uni = 0.0 for w in s.split() + ["</s>"]: #for w in s.split(): uni += unigram_logprob[w] fs = model.full_scores(s) n = 0 logprob = 0.0 for p, l in fs: logprob += p n += 1 logprobs.append(logprob) mean_logprobs.append(logprob / n) norm_logprobs.append(logprob / uni * -1.0) slors.append((logprob - uni) / n) #calculate correlation print "logprob =", pearsonr(logprobs, test_ratings)[0] print "mean logprob =", pearsonr(mean_logprobs, test_ratings)[0] print "norm logprob =", pearsonr(norm_logprobs, test_ratings)[0] print "slor =", pearsonr(slors, test_ratings)[0]
norm_lp_div.append((-1.0 * lp) / unigram_lp[i]) norm_lp_sub.append(lp - unigram_lp[i]) slor.append( (lp - unigram_lp[i]) / sent_lens[i] ) #bottom 5 lowest word logprobs, mean, m1q and m2q wordlp = wordlps[i] wordlp_min5 = sorted(wordlp)[:5] wlp_min1.append(wordlp_min5[0]) wlp_min2.append(wordlp_min5[1]) wlp_min3.append(wordlp_min5[2]) wlp_min4.append(wordlp_min5[3]) wlp_min5.append(wordlp_min5[4]) wlp_mean.append(numpy.mean(wordlp)) wlp_m1q.append(mean_of_percentile(wordlp, 25.0)) wlp_m2q.append(mean_of_percentile(wordlp, 50.0)) if (args.test_csv_output): test_out.write(str(i) + ",," + str(sent_lens[i]) + "," + str(lp) + "," + str(unigram_lp[i])) test_out.write("," + str(mean_lp[-1]) + "," + str(norm_lp_div[-1]) + ",") test_out.write(str(norm_lp_sub[-1]) + "," + str(slor[-1]) + ",") test_out.write(",".join([str(item) for item in wordlp_min5]) + ",") test_out.write(str(wlp_mean[-1]) + "," + str(wlp_m1q[-1]) + "," + str(wlp_m2q[-1]) + "\n") metrics_list = header.split(",")[3:] results = [lps, unigram_lp, mean_lp, norm_lp_div, norm_lp_sub, slor, wlp_min1, wlp_min2, wlp_min3, wlp_min4, wlp_min5, wlp_mean, wlp_m1q, wlp_m2q] #print the results print "METRICS\tCORRELATION" for i, m in enumerate(metrics_list): print m + "\t" + str(pearsonr(results[i], gold)[0])
def run(self):
    """Report how well marker-gene order correlates between Archaea genomes.

    For every pair of 'Final' Archaea genomes, each marker gene is placed at
    its position divided by the genome size (missing genes are flagged in a
    mask).  The second genome's ordering is rotated through every
    translation; for each one Spearman's and Pearson's r are computed over
    the genes present in both genomes, and the largest absolute value of
    each is kept per pair.  The mean and standard deviation over all pairs
    are printed at the end.
    """
    img = IMG()
    markerset = MarkerSet()

    # print(...) with a single argument and range() behave the same on
    # Python 2 and Python 3, unlike the print statement and xrange.
    print('Reading metadata.')
    metadata = img.genomeMetadata('Final')

    print('Getting marker genes.')
    pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
    markerGenes = pfamMarkers.union(tigrMarkers)
    print(' Marker genes: ' + str(len(markerGenes)))

    print('Getting genomes of interest.')
    genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
    print(' Genomes: ' + str(len(genomeIds)))

    print('Getting position of each marker gene.')
    geneDistTable = img.geneDistTable(genomeIds, markerGenes)

    def geneOrderAndMask(genomeId):
        # Relative position of every marker gene in one genome, with a
        # parallel mask: 0 = gene present, 1 = gene missing (position -1).
        geneOrder = []
        mask = []
        for markerGenesId in markerGenes:
            if markerGenesId in geneDistTable[genomeId]:
                geneOrder.append(float(geneDistTable[genomeId][markerGenesId][0][0]) / metadata[genomeId]['genome size'])
                mask.append(0)
            else:
                geneOrder.append(-1)
                mask.append(1)
        return geneOrder, mask

    spearmanValues = []
    pearsonValues = []
    genomeIds = list(genomeIds)
    for i in range(0, len(genomeIds)):
        print(str(i+1) + ' of ' + str(len(genomeIds)))

        geneOrderI, maskI = geneOrderAndMask(genomeIds[i])

        for j in range(i+1, len(genomeIds)):
            geneOrderJ, maskJ = geneOrderAndMask(genomeIds[j])

            # test all translations
            bestSpearman = 0
            bestPearson = 0
            for _ in range(0, len(markerGenes)):
                # Keep only the genes present in both genomes.
                maskedI = []
                maskedJ = []
                for k in range(0, len(maskI)):
                    if maskI[k] == 0 and maskJ[k] == 0:
                        maskedI.append(geneOrderI[k])
                        maskedJ.append(geneOrderJ[k])

                r, _ = spearmanr(maskedI, maskedJ)
                if abs(r) > bestSpearman:
                    bestSpearman = abs(r)

                r, _ = pearsonr(maskedI, maskedJ)
                if abs(r) > bestPearson:
                    bestPearson = abs(r)

                # Rotate genome J by one position for the next translation.
                geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                maskJ = maskJ[1:] + [maskJ[0]]

            spearmanValues.append(bestSpearman)
            pearsonValues.append(bestPearson)

    print('Spearman: %.2f +/- %.2f: ' % (mean(spearmanValues), std(spearmanValues)))
    print('Pearson: %.2f +/- %.2f: ' % (mean(pearsonValues), std(pearsonValues)))
wlp_min2.append(wordlp_min5[1]) wlp_min3.append(wordlp_min5[2]) wlp_min4.append(wordlp_min5[3]) wlp_min5.append(wordlp_min5[4]) wlp_mean.append(numpy.mean(wordlp)) wlp_m1q.append(mean_of_percentile(wordlp, 25.0)) wlp_m2q.append(mean_of_percentile(wordlp, 50.0)) if (args.test_csv_output): test_out.write( str(i) + ",," + str(sent_lens[i]) + "," + str(lp) + "," + str(unigram_lp[i])) test_out.write("," + str(mean_lp[-1]) + "," + str(norm_lp_div[-1]) + ",") test_out.write(str(norm_lp_sub[-1]) + "," + str(slor[-1]) + ",") test_out.write(",".join([str(item) for item in wordlp_min5]) + ",") test_out.write( str(wlp_mean[-1]) + "," + str(wlp_m1q[-1]) + "," + str(wlp_m2q[-1]) + "\n") metrics_list = header.split(",")[3:] results = [ lps, unigram_lp, mean_lp, norm_lp_div, norm_lp_sub, slor, wlp_min1, wlp_min2, wlp_min3, wlp_min4, wlp_min5, wlp_mean, wlp_m1q, wlp_m2q ] #print the results print "METRICS\tCORRELATION" for i, m in enumerate(metrics_list): print m + "\t" + str(pearsonr(results[i], gold)[0])
def pears_corr(X, Y):
    """Correlate ``X`` with ``Y`` via ``mstats.pearsonr``.

    The library result is passed through unchanged.
    """
    correlation = mstats.pearsonr(X, Y)
    return correlation
import xarray as xr
# Observation file; each variable below is read from it separately.
fname = '/home/ecougnon/ana/SSTa_daily_Aus_20032016Dec.nc'
lat_obs = xr.open_dataset(fname)['lat']
# lat_obs = lat_obs.sel(lat=slice(lat_px_min,lat_px_max))
# nearest observation latitudes to the model latitudes
lat_obs = lat_obs.sel(lat=lat_mdl, method='nearest')
lon_obs = xr.open_dataset(fname)['lon']
# lon_obs = lon_obs.sel(lon=slice(lon_px_min,lon_px_max))
# model longitudes are offset by 360 before the nearest-neighbour lookup
lon_obs = lon_obs.sel(lon=lon_mdl+360, method='nearest')
tim_obs = xr.open_dataset(fname)['time']
tim_obs = tim_obs.sel(time=slice('2003-01-01','2016-12-31'))
sst_obs = xr.open_dataset(fname)['SSTa']
sst_obs = sst_obs.sel(time=tim_obs, lat=lat_obs,lon=lon_obs)
# NaN-aware mean over the first axis, applied twice
# NOTE(review): presumably this leaves a time series -- confirm the
# dimension order of 'SSTa'.
sst_obs = np.nanmean(np.nanmean(sst_obs,0),0)
# pearson correlation
# (the second return value is kept in tmp and not used below)
PearC, tmp = st.pearsonr(sst_mdl, sst_obs)
## plotting
plt.figure(figsize=(13,7))
ax = plt.subplot(111)
#plt.plot(tim_mdl,sst_mdl)
plt.plot(tim_vec,sst_obs)
#plt.legend(['mdl','obs'])
#plt.title('SSTa TAS area time series -- 37-45S 147-155E')
plt.title('SSTa TAS area time series -- 42-44S 144-146E')
plt.grid()
#plt.text(0.3, 0.1, 'Pearson Correlation coefficient:' + str(round(PearC,3)), \
#         ha='center', va='center', transform=ax.transAxes, \
#         fontsize=14)
#plt.savefig(figfile, bbox_inches='tight', dpi=300)
plot_path = os.path.join(outdir, "loghist" + "_".join(name_tuple) + ".png") print "Saving figure as %s..." % plot_path plt.savefig(plot_path, dpi=600) # compute all-pairs for x, y in itertools.combinations(R.keys(), 2): name_tuple = (x, y, enrich_name, D["gse_id"]) print "Scatter Plotting %s versus %s for %s from %s" % (name_tuple) X_Q = R[x]["Q"] Y_Q = R[y]["Q"] if E_Mask is not None: X_Q = X_Q[E_Mask] Y_Q = Y_Q[E_Mask] pcc = mstats.pearsonr(X_Q, Y_Q) print "PCC of %s and %s:" % (x, y), pcc plt.clf() plt.cla() plt.title(" ".join(name_tuple)) plt.xlabel(x) plt.ylabel(y) plot_path = os.path.join(outdir, "scatter" + "_".join(name_tuple) + ".png") plt.plot(X_Q, Y_Q, "b.") print "Saving figure as %s..." % plot_path plt.savefig(plot_path, dpi=600) # if not no enrichment, compute enrichment # output stats # plot stats
import pwd
import shutil

# Template for the log header line; filled with a mapping that provides
# npy_fname, function, start, end, m and date.
LOG_MSG = "#npy_fname=%(npy_fname)s, function=%(function)s, start=%(start)d, end=%(end)d, m=%(m)d, date=%(date)s"
REPORT_N = 1000
# get username
# Per-user scratch directory: /tmp/<login name of the current uid>.
TMP_DIR = "/tmp/%s" % pwd.getpwuid(os.getuid()).pw_name

def euclidean(x,y):
    # Square root (masked-array aware) of the summed products of the
    # difference x - y with its own transpose.
    q=x-y
    return ma.sqrt((q*q.T).sum())

# this should be in a separate file
# Dependency measures selectable by name; each takes two vectors.  The
# mstats-based entries keep only the first element of the returned pair.
FUNCTIONS = {
  'pearson': lambda x, y: mstats.pearsonr(x,y)[0],
  'spearman': lambda x, y: mstats.spearmanr(x,y)[0],
  'euclidean': euclidean,
  'kendalltau': lambda x,y: mstats.kendalltau(x,y)[0],
  'dcor': dcor,
  }

def main(npy_fname=None, function=None, batchname=None, outdir=None, start=None, end=None, m=None):
  """Compute pairs of dependency"""
  # NOTE(review): as written, only `npy_fname` is checked here; `function`
  # is the assertion *message*, not a second condition.  The membership test
  # on the next line does validate `function`.
  assert npy_fname, function
  assert function in FUNCTIONS
  assert os.path.exists(outdir)
  assert os.path.isdir(outdir)
  m = int(m)
  assert m > 0
ratings.append(float(line.strip())) if debug: print "Ratings", len(ratings), "=", ratings[:10] #process the test.csv file metrics = [] probs = [] for line_id, line in enumerate(open(args.test_csv)): data = line.strip().split(",") if line_id == 0: metrics = data[3:] if debug: print "\nmetrics =", metrics else: for i, score in enumerate(data[3:]): if len(probs) == i: probs.append([]) if score == "": score = 0 probs[i].append(float(score)) #print "\n".join(metrics), "\n" print "METRICS\tCORRELATION" for i, prob in enumerate(probs): if debug: print "\nmetric =", metrics[i] print "\tprob", len(prob), "=", prob[:5] corr = pearsonr(ratings, prob)[0] print metrics[i] + "\t" + str(corr)
# Flatten the two spatial dimensions so that every column is one grid point.
mode4_mdl = mode4_mdl.reshape((t,Y*X))
mode1_obs = mode1_obs.reshape((t,Y*X))
mode2_obs = mode2_obs.reshape((t,Y*X))
mode3_obs = mode3_obs.reshape((t,Y*X))
mode4_obs = mode4_obs.reshape((t,Y*X))
# One correlation value per grid point and mode, initialised to NaN.
corr_map_mode1 = np.empty(X*Y)
corr_map_mode2 = np.empty(X*Y)
corr_map_mode3 = np.empty(X*Y)
corr_map_mode4 = np.empty(X*Y)
corr_map_mode1.fill(np.nan)
corr_map_mode2.fill(np.nan)
corr_map_mode3.fill(np.nan)
corr_map_mode4.fill(np.nan)
# Pearson's r between the model and observation columns of each grid point;
# the second return value is stored in tmp and overwritten each time.
for ii in range(0,(X*Y)):
    corr_map_mode1[ii], tmp = st.pearsonr(mode1_mdl[:,ii], mode1_obs[:,ii])
    corr_map_mode2[ii], tmp = st.pearsonr(mode2_mdl[:,ii], mode2_obs[:,ii])
    corr_map_mode3[ii], tmp = st.pearsonr(mode3_mdl[:,ii], mode3_obs[:,ii])
    corr_map_mode4[ii], tmp = st.pearsonr(mode4_mdl[:,ii], mode4_obs[:,ii])
# change shape back to lat/lon
corr_map_mode1 = np.reshape(corr_map_mode1,(Y,X))
corr_map_mode2 = np.reshape(corr_map_mode2,(Y,X))
corr_map_mode3 = np.reshape(corr_map_mode3,(Y,X))
corr_map_mode4 = np.reshape(corr_map_mode4,(Y,X))
## plotting
# NOTE(review): the meaning and order of the four domain bounds is not
# visible here -- confirm against the plotting code that consumes them.
domain = [-55, 90, 10, 180] #[-55, -270, 10, -180] #[-55, 90, 10, 180]
domain_draw = [-55, 90, 10, 180] #[-55, -270, 10, -180] #[-55, 90, 10, 180]
dlat = 10 #30 #10
dlon = 30 #90 #30
llon_obs, llat_obs = np.meshgrid(lon, lat)
def run(self):
    """Report how well marker-gene order correlates between Archaea genomes.

    Marker-gene positions (divided by genome size) are compared for every
    pair of 'Final' Archaea genomes.  For each pair the second genome's
    ordering is rotated through every translation, and the largest absolute
    Spearman and Pearson correlations over the genes present in both genomes
    are recorded.  Mean and standard deviation over all pairs are printed.
    """
    img = IMG()
    markerset = MarkerSet()

    print('Reading metadata.')
    metadata = img.genomeMetadata('Final')

    print('Getting marker genes.')
    pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
    markerGenes = pfamMarkers.union(tigrMarkers)
    print(' Marker genes: ' + str(len(markerGenes)))

    print('Getting genomes of interest.')
    genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
    print(' Genomes: ' + str(len(genomeIds)))

    print('Getting position of each marker gene.')
    geneDistTable = img.geneDistTable(genomeIds,
                                      markerGenes,
                                      spacingBetweenContigs=1e6)

    def orderAndMask(genomeId):
        # Relative position of each marker gene in one genome plus a parallel
        # mask (0 = present, 1 = missing, position recorded as -1).
        positions = []
        missing = []
        for markerGenesId in markerGenes:
            if markerGenesId in geneDistTable[genomeId]:
                positions.append(
                    float(geneDistTable[genomeId][markerGenesId][0][0]) /
                    metadata[genomeId]['genome size'])
                missing.append(0)
            else:
                positions.append(-1)
                missing.append(1)
        return positions, missing

    spearmanValues = []
    pearsonValues = []
    genomeIds = list(genomeIds)
    genomeCount = len(genomeIds)
    for i in range(0, genomeCount):
        print(str(i + 1) + ' of ' + str(len(genomeIds)))

        geneOrderI, maskI = orderAndMask(genomeIds[i])

        for j in range(i + 1, genomeCount):
            geneOrderJ, maskJ = orderAndMask(genomeIds[j])

            # test all translations
            bestSpearman = 0
            bestPearson = 0
            for _shift in range(0, len(markerGenes)):
                # Restrict both orderings to genes found in both genomes.
                shared = [k for k in range(0, len(maskI))
                          if maskI[k] == 0 and maskJ[k] == 0]
                maskedI = [geneOrderI[k] for k in shared]
                maskedJ = [geneOrderJ[k] for k in shared]

                r, _ = spearmanr(maskedI, maskedJ)
                if abs(r) > bestSpearman:
                    bestSpearman = abs(r)

                r, _ = pearsonr(maskedI, maskedJ)
                if abs(r) > bestPearson:
                    bestPearson = abs(r)

                # Rotate genome J by one gene for the next translation.
                geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                maskJ = maskJ[1:] + [maskJ[0]]

            spearmanValues.append(bestSpearman)
            pearsonValues.append(bestPearson)

    print('Spearman: %.2f +/- %.2f: ' %
          (mean(spearmanValues), std(spearmanValues)))
    print('Pearson: %.2f +/- %.2f: ' %
          (mean(pearsonValues), std(pearsonValues)))
def plot_mutation_counts():
    """Count, report and plot amino-acid mutations for the test predictions.

    Reads the CSV at ``results_path + file_name_mut_ct``; the first three
    columns hold comma-separated token sequences (parent, true child,
    generated child).  For every position the tokens are translated through
    the forward dictionary and, where they differ, the parent->child and
    parent->generated mutations are counted, both position-independent
    ("A>B") and position-specific ("A>pos>B").  The counts are written to
    JSON files, summarised on stdout, compared with the training matrix from
    ``get_train_mat()`` via ``pearsonr``, and shown as two three-panel
    figures (frequencies, then pairwise differences).

    Relies on module-level names defined outside this function:
    ``results_path``, ``file_name_mut_ct``, ``WUHAN_SEQ``, ``clade_parent``,
    ``clade_child``, ``read_json``, ``write_dict``, ``get_mat``,
    ``get_train_mat`` and ``utils``.
    """
    df_true_pred = pd.read_csv(results_path + file_name_mut_ct, sep=",")
    print(df_true_pred)
    cols = list(df_true_pred.columns)
    parent_child = dict()
    parent_gen = dict()
    parent_child_pos = dict()
    parent_gen_pos = dict()

    f_dict = read_json(results_path + "f_word_dictionaries.json")
    rev_dict = read_json(results_path + "r_word_dictionaries.json")
    # The result is not used below; the call is kept in case the helper has
    # side effects.
    encoded_wuhan_seq = utils.read_wuhan_seq(WUHAN_SEQ, rev_dict)

    # compare differences at positions
    space = 1
    for _, row in df_true_pred.iterrows():
        true_x = row[cols[0]].split(",")
        true_y = row[cols[1]].split(",")
        pred_y = row[cols[2]].split(",")
        for i in range(len(true_x)):
            first = true_x[i:i + space]
            sec = true_y[i:i + space]
            third = pred_y[i:i + space]
            first_aa = [f_dict[j] for j in first]
            sec_aa = [f_dict[j] for j in sec]
            third_aa = [f_dict[j] for j in third]
            first_mut = first_aa[0]
            second_mut = sec_aa[0]
            third_mut = third_aa[0]
            if first_mut != second_mut:
                key = "{}>{}".format(first_mut, second_mut)
                key_pos_par_child = "{}>{}>{}".format(first_mut, str(i + 1),
                                                      second_mut)
                parent_child_pos[key_pos_par_child] = \
                    parent_child_pos.get(key_pos_par_child, 0) + 1
                parent_child[key] = parent_child.get(key, 0) + 1
            if first_mut != third_mut:
                key = "{}>{}".format(first_mut, third_mut)
                key_pos_par_gen = "{}>{}>{}".format(first_mut, str(i + 1),
                                                    third_mut)
                parent_gen_pos[key_pos_par_gen] = \
                    parent_gen_pos.get(key_pos_par_gen, 0) + 1
                parent_gen[key] = parent_gen.get(key, 0) + 1

    # The position-independent counts are written before they are sorted.
    write_dict(
        results_path +
        "te_parent_child_{}_{}.json".format(clade_parent, clade_child),
        parent_child)
    write_dict(
        results_path +
        "te_parent_gen_{}_{}.json".format(clade_parent, clade_child),
        parent_gen)

    aa_list = list('QNKWFPYLMTEIARGHSDVC')

    def by_count_desc(counts):
        # Same mapping, re-ordered from the most to the least frequent key.
        return dict(
            sorted(counts.items(), key=lambda item: item[1], reverse=True))

    print("---------------------")
    print("Parent child mutations with POS")
    parent_child_pos = by_count_desc(parent_child_pos)
    print(len(parent_child_pos), parent_child_pos)
    print()
    print("Parent gen mutations with POS")
    parent_gen_pos = by_count_desc(parent_gen_pos)
    print(len(parent_gen_pos), parent_gen_pos)
    print()
    write_dict(
        results_path +
        "te_parent_child_pos_{}_{}.json".format(clade_parent, clade_child),
        parent_child_pos)
    write_dict(
        results_path +
        "te_parent_gen_pos_{}_{}.json".format(clade_parent, clade_child),
        parent_gen_pos)

    keys1 = list(parent_child_pos.keys())
    keys2 = list(parent_gen_pos.keys())

    inter = list(set(keys1).intersection(set(keys2)))
    print(len(inter), inter)
    print()
    print("---------------------")
    test_size = df_true_pred.shape[0]
    parent_child = by_count_desc(parent_child)
    print("Test: Mutation freq between parent-child: {}".format(parent_child))
    print("Test: # Mutations between parent-child: {}".format(
        str(len(parent_child))))
    print()
    parent_gen = by_count_desc(parent_gen)
    print("Test: Mutation freq between parent-gen: {}".format(parent_gen))
    # This count is for the parent-gen mutations; the label previously
    # repeated "parent-child" from the block above.
    print("Test: # Mutations between parent-gen: {}".format(
        str(len(parent_gen))))
    print()
    par_child_mat = get_mat(aa_list, parent_child, test_size)
    print()
    par_gen_mat = get_mat(aa_list, parent_gen, test_size)

    print("Preparing train data...")
    tr_par_child_mat, tr_parent_child = get_train_mat()

    pearson_corr_tr_par_child_mut = pearsonr(tr_par_child_mat, par_child_mat)
    pearson_corr_tr_par_child_par_gen_mut = pearsonr(tr_par_child_mat,
                                                     par_gen_mat)
    pearson_corr_te_par_child_par_gen_mut = pearsonr(par_child_mat,
                                                     par_gen_mat)

    print(
        "Pearson correlation between train and test par-child mut: {}".format(
            str(pearson_corr_tr_par_child_mut)))
    print(
        "Pearson correlation between train par-child mut and test par-gen mut: {}"
        .format(str(pearson_corr_tr_par_child_par_gen_mut)))
    print("Pearson correlation between test par-child mut and par-gen mut: {}".
          format(str(pearson_corr_te_par_child_par_gen_mut)))

    tr_par_child_keys = list(tr_parent_child.keys())
    te_par_child_keys = list(parent_child.keys())
    te_par_gen_keys = list(parent_gen.keys())

    print("Size of mutations - tr par-child, te par-child, te par-gen")
    print(len(tr_parent_child), len(parent_child), len(parent_gen))

    # Fractions of shared mutation keys; the first two are relative to the
    # training set, the last one to the test parent-child set.
    intersection_tr_par_child_te_par_child = len(
        list(set(tr_par_child_keys).intersection(
            set(te_par_child_keys)))) / float(len(tr_parent_child))
    print("% intersection between tr par-child and te par-child: {}".format(
        str(np.round(intersection_tr_par_child_te_par_child, 2))))

    intersection_tr_par_child_te_par_gen = len(
        list(set(tr_par_child_keys).intersection(
            set(te_par_gen_keys)))) / float(len(tr_parent_child))
    print("% intersection between tr par-child and te par-gen: {}".format(
        str(np.round(intersection_tr_par_child_te_par_gen, 2))))

    intersection_te_par_child_te_par_gen = len(
        list(set(te_par_child_keys).intersection(
            set(te_par_gen_keys)))) / float(len(te_par_child_keys))
    print("% intersection between te par-child and te par-gen: {}".format(
        str(np.round(intersection_te_par_child_te_par_gen, 2))))
    print()
    print("Common mutations in tr, test and gen for {}>{} branch".format(
        clade_parent, clade_child))
    for mut in tr_parent_child:
        if mut in parent_child and mut in parent_gen:
            print(mut, tr_parent_child[mut], parent_child[mut],
                  parent_gen[mut])

    pos_ticks = list(np.arange(0, len(aa_list)))
    pos_labels = aa_list
    interpolation = "none"

    def show_matrix(ax, matrix, title, cmap, **imshow_kwargs):
        # Draw one heat map and apply the shared "From"/"To" decoration with
        # the amino-acid letters as tick labels; returns the image.
        image = ax.imshow(matrix,
                          cmap=cmap,
                          interpolation=interpolation,
                          aspect='auto',
                          **imshow_kwargs)
        ax.set_title(title)
        ax.set_ylabel("From")
        ax.set_xlabel("To")
        ax.set_xticks(pos_ticks)
        ax.set_xticklabels(pos_labels, rotation='horizontal')
        ax.set_yticks(pos_ticks)
        ax.set_yticklabels(pos_labels, rotation='horizontal')
        return image

    # generate plots
    cmap = "Blues"  #"RdYlBu" Spectral
    plt.rcParams.update({'font.size': 10})

    fig, axs = plt.subplots(3)

    ax0 = show_matrix(axs[0], tr_par_child_mat,
                      "(A) Train parent-child mutation frequency", cmap)
    show_matrix(axs[1], par_child_mat,
                "(B) Test parent-child mutation frequency", cmap)
    show_matrix(axs[2], par_gen_mat,
                "(C) Test parent-generated mutation frequency", cmap)

    # Colour bar scaled by the first (training) panel.
    cbar_ax = fig.add_axes([0.92, 0.15, 0.03, 0.7])
    fig.colorbar(ax0, cax=cbar_ax)
    plt.suptitle(
        "Mutation frequency in test, train and generated datasets. Pearson correlation of A & B: {}, A & C: {}, B & C: {}"
        .format(str(np.round(pearson_corr_tr_par_child_mut[0], 2)),
                str(np.round(pearson_corr_tr_par_child_par_gen_mut[0], 2)),
                str(np.round(pearson_corr_te_par_child_par_gen_mut[0], 2))))
    plt.show()

    # plot differences
    diff_tr_par_child_te_par_child = par_child_mat - tr_par_child_mat
    diff_te_par_gen_te_par_child = par_gen_mat - par_child_mat
    diff_tr_par_child_te_par_gen = par_gen_mat - tr_par_child_mat

    cmap = "RdBu"
    fig, axs = plt.subplots(3)

    # Fixed symmetric colour range shared by the three difference panels.
    vmin = -0.08
    vmax = 0.08

    ax0 = show_matrix(axs[0], diff_tr_par_child_te_par_child,
                      "Test vs training", cmap, vmin=vmin, vmax=vmax)
    show_matrix(axs[1], diff_te_par_gen_te_par_child,
                "Generated vs test", cmap, vmin=vmin, vmax=vmax)
    show_matrix(axs[2], diff_tr_par_child_te_par_gen,
                "Generated vs training", cmap, vmin=vmin, vmax=vmax)

    cbar_ax = fig.add_axes([0.92, 0.15, 0.03, 0.7])
    fig.colorbar(ax0, cax=cbar_ax)
    plt.suptitle("Delta of mutation frequency plots")
    plt.show()
def calcPearsonCC(self, pred, gt):
    """Return Pearson's correlation coefficient between ``pred`` and ``gt``.

    Both arguments are handed to ``pearsonr`` unchanged; only the first
    element of its result (the coefficient) is returned, the remaining
    element(s) are discarded.
    """
    return pearsonr(pred, gt)[0]
# Build the user-by-user Pearson similarity matrix, starting from zeros.
# The builtin ``float`` is used as dtype because the ``numpy.float`` alias is
# no longer available in current NumPy releases.
pearsonMatrix = numpy.zeros((rows, rows), dtype=float)
totalrows = rows
# ``enumerate`` supplies integer row/column positions.  The previous manual
# bookkeeping reset the column counter to the float ``0.``, which is not a
# valid array index.
for i, userA in enumerate(userList):
    userARatings = userRatings[userMap[userA]]
    for j, userB in enumerate(userList):
        # Cells pairing a user with itself are never computed and keep
        # their initial value of 0.
        if userA != userB:
            # A non-zero cell was already filled in when the mirrored
            # (j, i) pair was visited, so the correlation is not recomputed.
            if pearsonMatrix[i, j] == 0.:
                userBRatings = userRatings[userMap[userB]]
                pearsonMatrix[i, j] = pearsonr(userARatings, userBRatings)[0]
                pearsonMatrix[j, i] = pearsonMatrix[i, j]
    # Percentage of rows finished so far, rounded to three decimals.
    progress = round(((i + 1) * 100.0) / totalrows, 3)
    print("Progress: " + str(progress) + "%")
print("Saving Pearson Similarity matrix, please wait")
numpy.savetxt("pearsonMatrix.csv", pearsonMatrix, delimiter=',')
print("Pearson matrix saved")
# Figure layout: a large axes for the map, a slim axes for the colorbar and
# a narrow axes for the per-season histogram.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_axes([.025, .025, .675, .875])
cax = fig.add_axes([0.725, .025, .025, .875])
pax = fig.add_axes([0.85, .025, .1, .875])

# Draw the map outlines once on the map axes.
for draw_outline in (bmap.drawcoastlines, bmap.drawcountries, bmap.drawstates):
    draw_outline(ax=ax)

for season, months in seasons.items():
    print(season)
    # Start each season with an empty histogram axes.
    pax.cla()

    # Positions of the entries of ``times`` whose month is in this season.
    tidx = np.array(
        [pos for pos, stamp in enumerate(times) if stamp.month in months])
    tmO3 = mO3[tidx]
    toO3 = oO3[tidx]

    # Difference of the axis-0 means (mO3 selection minus oO3 selection).
    tbO3 = tmO3[:].mean(0) - toO3[:].mean(0)

    # One Pearson r per column pair; invalid (NaN/inf) values are masked.
    column_corrs = []
    for m_col, o_col in zip(tmO3.T, toO3.T):
        column_corrs.append(mstats.pearsonr(m_col, o_col)[0])
    rs = np.ma.masked_invalid(column_corrs)

    print(tbO3.min(), tbO3.max())

    # Scatter of the differences on the map, with its colorbar.
    s = bmap.scatter(
        lon, lat, c=tbO3, norm=bnorm, cmap=bcmap, ax=ax, edgecolors='k')
    cbar = plt.colorbar(s, cax=cax, label='ppb')

    # Histogram of the differences over ``bedges``, drawn as a step outline:
    # every count is repeated twice and paired with the doubled bin edges
    # (minus the outermost two entries).
    hist, edgesdummy = np.histogram(tbO3, bins=bedges)
    step_percent = hist.repeat(2, 0) / hist.sum() * 100
    step_edges = bedges.repeat(2, 0)[1:-1]
    pax.plot(step_percent, step_edges)
    pax.set_ylim(bedges[0], bedges[-1])
    pax.xaxis.tick_top()
    pax.yaxis.set_major_formatter(plt.NullFormatter())
    pax.set_xlabel('% Sites')
# NOTE: every print below is a single-argument call built with %-formatting,
# so the emitted text is the same whether this runs under Python 2 or 3.
if debug:
    print("Ratings %s = %s" % (len(ratings), ratings[:10]))

# Process the test.csv file.  The first line is the header: its columns from
# the fourth onward are the metric names.  Every later line contributes one
# score per metric, taken from the same columns.
metrics = []
probs = []
# ``with`` guarantees the file is closed once parsing is done (or fails).
with open(args.test_csv) as test_file:
    for line_id, line in enumerate(test_file):
        data = line.strip().split(",")
        if line_id == 0:
            metrics = data[3:]
            if debug:
                print("\nmetrics = %s" % (metrics,))
        else:
            for i, score in enumerate(data[3:]):
                # Grow the per-metric score lists on demand.
                if len(probs) == i:
                    probs.append([])
                # An empty cell is counted as a score of 0.
                if score == "":
                    score = 0
                probs[i].append(float(score))

# Report Pearson's r between ``ratings`` and each metric's score column.
print("METRICS\tCORRELATION")
for i, prob in enumerate(probs):
    if debug:
        print("\nmetric = %s" % (metrics[i],))
        print("\tprob %s = %s" % (len(prob), prob[:5]))
    corr = pearsonr(ratings, prob)[0]
    print(metrics[i] + "\t" + str(corr))