def makeRelDiffPlot(truthCol, predCol, df, outBase, method, measure):
    """Save a histogram of the relative differences between two columns.

    The relative differences come from ``AnalysisUtils.relDiff(truthCol,
    predCol, df)``; the ``"relDiff"`` column of its first return value is
    histogrammed (100 bins) and annotated with its mean and mean absolute
    value.  The figure is written to ``<outBase>_<measure>_rel_diff.pdf``.

    Args:
        truthCol: Name of the ground-truth column, forwarded to ``relDiff``.
        predCol: Name of the predicted column, forwarded to ``relDiff``.
        df: Data frame forwarded to ``relDiff``.
        outBase: Prefix of the output file name.
        method: Unused; kept so existing callers keep working.
        measure: Either ``"tpm"`` or ``"num_reads"``.

    Raises:
        ValueError: If ``measure`` is not one of the two supported values.
    """
    # Validate up front: previously an unknown measure left ``outFile``
    # unbound and the function died with UnboundLocalError inside the
    # error handler below.
    if measure not in ("tpm", "num_reads"):
        raise ValueError(
            "measure must be 'tpm' or 'num_reads', got {!r}".format(measure))
    outFile = "{}_{}_rel_diff.pdf".format(outBase, measure)

    plt.cla()
    plt.clf()

    # Get the relative difference numbers
    rdV, _ = AnalysisUtils.relDiff(truthCol, predCol, df)
    try:
        relDiffs = rdV["relDiff"]
        plt.hist(relDiffs, bins=100, histtype="stepfilled")
        mdText = "mean rel diff = {0:.2f}\nmean |rel diff| = {1:.2f}".format(
            relDiffs.mean(), relDiffs.abs().mean())
        plt.figtext(0.15, 0.85, mdText)
        # gca() returns the axes the histogram was drawn on; a bare
        # plt.axes() adds a new axes on current matplotlib.
        ax = plt.gca()
        ax.set_ylabel("frequency")
        ax.set_xlabel("relative difference")
        # Get rid of axis spines
        sns.despine()
        plt.savefig(outFile)
    except Exception as err:
        # Best effort: leave an empty file at the output path
        # (presumably so later steps that expect it still find it).
        with open(outFile, 'w'):
            pass
        print("Error making relative difference plot {}: {}".format(
            outFile, err))
def rsemRelDiffPlots(D, measure, methods, simIDs, outdir, plotname):
    """Plot, per method, the distribution of mean absolute relative diffs.

    For every method ``m`` and simulation id ``i`` the columns
    ``<measure>_truth<i>`` and ``<measure>_<m><i>`` of ``D`` are passed to
    ``AnalysisUtils.relDiff`` and the mean absolute value of the resulting
    ``"relDiff"`` column is recorded.  One histogram (with rug) per method
    is drawn on a shared axes and saved to ``<outdir>/<plotname>.pdf``.

    Args:
        D: Data frame forwarded to ``AnalysisUtils.relDiff``.
        measure: Column-name prefix (e.g. the quantity being compared).
        methods: Iterable of method names used in the column names.
        simIDs: Iterable of simulation ids appended to the column names.
        outdir: Directory the PDF is written into.
        plotname: File name (without extension) of the PDF.
    """
    sns.set_style('white')
    sns.set_context('poster')
    plt.clf()
    plt.cla()

    # method name -> one mean |relDiff| value per simulation
    md = {}
    for m in methods:
        md[m] = []
        for i in simIDs:
            rd, _ = AnalysisUtils.relDiff(
                "{}_truth{}".format(measure, str(i)),
                "{}_{}{}".format(measure, m, str(i)),
                D)
            md[m].append(rd['relDiff'].abs().mean())

    # Feed the axes returned by the first call back in so that every
    # method is drawn on the same axes.
    ax = None
    for mn, mv in md.items():
        ax = sns.distplot(mv, hist_kws={"histtype": "stepfilled"},
                          kde=False, rug=True, label=mn, ax=ax,
                          norm_hist=False)
    sns.despine()

    # Label the axes that were actually drawn on.  A bare plt.axes() adds
    # a new axes on current matplotlib; gca() is only the fallback for an
    # empty ``methods``.
    if ax is None:
        ax = plt.gca()
    ax.set_xlabel('mean absolute relative difference')
    ax.set_ylabel('frequency')
    plt.legend()
    plt.savefig('{}/{}.pdf'.format(outdir, plotname))
def makeTable(methodDict, outpath, outfile, measure, annotPath):
    """Compute per-method accuracy metrics and write them as CSV and LaTeX.

    Every entry of ``methodDict`` is loaded with the matching
    ``ParsingUtils`` reader (chosen from the upper-cased method name) and
    all frames are joined into one.  eXpress values below the smallest
    positive value of the other methods are filtered via
    ``AnalysisUtils.filterValues``.  For every method except the truth the
    following rows are filled in:

    * ``'Spearman corr.'`` - ``scipy.stats.spearmanr`` of truth vs. method.
    * ``'abs. mean rel. diff.'`` - mean absolute ``"relDiff"`` from
      ``AnalysisUtils.relDiff``.
    * ``'Proportionality corr.'`` -
      ``AnalysisUtils.proportionalityCorrelation``.
    * ``'TP error fraction'`` - among rows whose truth value is >= 1, the
      fraction whose absolute percent error exceeds 10.
    * ``'TP median per. error'`` - median of ``(pred - truth) / truth``
      over the same rows.

    The table is written to ``outfile + ".csv"`` and, as LaTeX, to
    ``outfile``.  If the part of the ``outfile`` base name before the first
    ``'_'`` is ``'human'``, ``plotStratifiedDiffs`` is also invoked.

    Args:
        methodDict: Maps method name to the value handed to its reader.
        outpath: Forwarded to ``plotStratifiedDiffs``.
        outfile: Path of the LaTeX table; the CSV goes to ``outfile.csv``.
        measure: Column-name prefix of the quantity to evaluate.
        annotPath: Forwarded to ``plotStratifiedDiffs``.

    Raises:
        ValueError: If ``methodDict`` is empty or contains a method name
            that no reader is known for.
    """
    import pandas as pd
    import scipy.stats
    import ParsingUtils
    import AnalysisUtils

    dframes = []
    for k, v in methodDict.items():
        suffix = '_{}'.format(k)
        upperName = k.upper()
        if upperName.startswith('SALMON'):
            d = ParsingUtils.readSalmon(v, suffix)
        elif upperName.startswith('KALLISTO'):
            d = ParsingUtils.readKallisto(v, suffix)
        elif upperName.startswith('EXPRESS'):
            d = ParsingUtils.readExpress(v, suffix)
        elif upperName == 'SAILFISH':
            d = ParsingUtils.readSailfish(v, suffix)
        elif upperName == 'SAILFISH (QUASI)':
            d = ParsingUtils.readSalmon(v, suffix)
        elif upperName.startswith('TRUTH'):
            d = ParsingUtils.readProFile(v, suffix)
            expFrac = d["ExpFrac{}".format(suffix)]
            d["TPM{}".format(suffix)] = 1000000.0 * (expFrac / expFrac.sum())
            # Flux sim thinks paired-end = 2 reads, so halve the count.
            d["NumReads{}".format(suffix)] = d["SeqNum{}".format(suffix)] * 0.5
        else:
            # Previously an unknown name silently re-appended the frame of
            # the preceding method (or raised NameError on the first key).
            raise ValueError("unrecognized method name: {!r}".format(k))
        # Add this dataframe to the list
        dframes.append(d)

    if not dframes:
        raise ValueError("methodDict must contain at least one method")
    M = dframes[0].join(dframes[1:])

    truthCol = "{}_Truth".format(measure)

    # Filter eXpress results
    minVal = np.inf
    for mn in set(methodDict.keys()) - set(["Truth", "eXpress"]):
        col = "{}_{}".format(measure, mn)
        newMin = M.loc[M[col] > 0, col].min()
        minVal = min(minVal, newMin)
    print("filtering eXpress results < {} {}".format(minVal, measure))
    AnalysisUtils.filterValues("{}_{}".format(measure, "eXpress"), M, minVal)

    org = outfile.split('/')[-1].split('_')[0]
    print("org = {}".format(org))
    if org == 'human':
        plotStratifiedDiffs(M, methodDict, annotPath, outpath, measure)

    mrdName = 'abs. mean rel. diff.'
    corrName = 'Spearman corr.'
    propName = 'Proportionality corr.'
    tpefName = 'TP error fraction'
    tpMedErrorName = 'TP median per. error'

    # One column per non-truth method.  The original subtracted
    # set('Truth'), i.e. the characters {'T','r','u','t','h'}, which never
    # removed the truth column and relied on a later drop('Truth').
    scoredMethods = [m for m in methodDict if m.upper() != "TRUTH"]
    res = pd.DataFrame(
        np.nan,
        index=[tpMedErrorName, tpefName, mrdName, corrName, propName],
        columns=scoredMethods)

    ef = 10.0  # percent-error threshold for 'TP error fraction'
    for k in scoredMethods:
        predCol = "{}_{}".format(measure, k)

        # .loc writes into ``res`` itself; res[k][row] = value is chained
        # assignment and may only update a temporary copy.
        res.loc[corrName, k] = scipy.stats.spearmanr(M[truthCol],
                                                     M[predCol])[0]

        mrd, _ = AnalysisUtils.relDiff(truthCol, predCol, M)
        res.loc[mrdName, k] = mrd["relDiff"].abs().mean()

        res.loc[propName, k] = AnalysisUtils.proportionalityCorrelation(
            truthCol, predCol, M)

        # Restrict to rows whose true value is at least 1.
        tpind = M[M[truthCol] >= 1]
        y = tpind[predCol]
        x = tpind[truthCol]
        relErr = (y - x) / x
        absPctErr = 100.0 * (y - x).abs() / x
        # With no such rows the fraction is undefined; leave it NaN
        # instead of dividing by zero.
        if len(absPctErr) > 0:
            res.loc[tpefName, k] = (
                len(absPctErr[absPctErr > ef]) / float(len(absPctErr)))
        # NOTE(review): stored as a fraction (not multiplied by 100) even
        # though the row label says "per." - confirm intent.
        res.loc[tpMedErrorName, k] = relErr.median()

    print(res)
    res.to_csv(outfile + ".csv")
    with open(outfile, 'w') as ofile:
        ofile.write(res.to_latex(float_format=lambda x: "{0:.2f}".format(x)))
    # Report the file that was actually written.
    print("wrote {}".format(outfile))
def plotStratifiedDiffs(M, methodDict, annotPath, outpath, measure):
    """Box-plot mean absolute relative differences by transcripts per gene.

    Rows of ``M`` are joined to the gene names read from
    ``<annotPath>/tgmap.txt`` and grouped by gene.  Genes whose
    ``NumReads_Truth`` sums to zero are skipped.  For each remaining gene
    and each retained method (Sailfish, Salmon, eXpress, when present in
    ``methodDict``) the mean absolute relative difference over the gene's
    rows is recorded under the gene's row count.  Row counts greater than
    one are binned into classes of roughly 1000 genes, and one box per
    class and method is drawn and saved to ``<outpath>/GeneStratRD.pdf``.

    Args:
        M: Data frame holding ``<measure>_<method>`` columns and
            ``NumReads_Truth``; its index is joined against the names in
            ``tgmap.txt``.
        methodDict: Mapping whose keys are the method names.
        annotPath: Directory containing ``tgmap.txt``.
        outpath: Directory the PDF is written into.
        measure: Column-name prefix of the quantity to evaluate.
    """
    import matplotlib
    import seaborn as sns
    from pprint import pprint

    tgmap = _readTgmap(annotPath)
    genes = pd.DataFrame(list(tgmap.items()), columns=['Name', 'Gene'])
    genes.set_index('Name', inplace=True)
    M = M.join(genes)

    truthCol = '{}_Truth'.format(measure)
    for mn in methodDict:
        rdiffs, _ = AnalysisUtils.relDiff(
            truthCol, '{}_{}'.format(measure, mn), M)
        M['RelDiff_{}'.format(mn)] = rdiffs

    mByGenes = M.groupby('Gene')

    methods = ["Sailfish", "Salmon", "eXpress"]
    retained = [mn for mn in methodDict if mn in methods]
    sns.set_palette(sns.color_palette([methodcolor[m] for m in methods]))

    # ntmap: rows-per-gene -> method -> list of per-gene mean |relDiff|
    # genesPerCount: rows-per-gene -> number of expressed genes seen
    ntmap = {}
    genesPerCount = {}
    numExpressedGenes = 0
    numGroups = len(mByGenes)
    for i, (_, group) in enumerate(mByGenes):
        if i % 10000 == 0:
            print("processing group {} of {}".format(i, numGroups))
        # Skip genes with no true reads at all.
        if group['NumReads_Truth'].sum() == 0:
            continue
        numExpressedGenes += 1
        numTran = len(group)
        if numTran not in ntmap:
            ntmap[numTran] = {mn: [] for mn in retained}
        # Count genes directly rather than reading the length of the
        # 'Sailfish' list, which fails when Sailfish was not run.
        genesPerCount[numTran] = genesPerCount.get(numTran, 0) + 1
        for mn in retained:
            ntmap[numTran][mn].append(
                group['RelDiff_{}'.format(mn)].abs().mean())
    print("There were {} expressed genes (genes with at least 1 expressed isoform)".format(numExpressedGenes))

    nt = _binTranscriptCounts(genesPerCount, classSize=1000)
    print(nt)

    # Map each row count to the label of the class it fell into.
    ntClasses = {}
    for c in nt:
        minVal = min(c)
        maxVal = max(c)
        if minVal == maxVal:
            name = str(minVal)
        else:
            name = str(minVal) + "-" + str(maxVal)
        for x in c:
            ntClasses[x] = name
    pprint(ntClasses)

    data = [(ntClasses[k], mn, x)
            for k, v in ntmap.items() if k > 1
            for mn, vals in v.items()
            for x in vals]
    ntxp = '# txp. per gene'
    D = pd.DataFrame(data, columns=[ntxp, 'Method', 'MARDs'])

    sns.set_style('white')
    plt.clf()
    plt.cla()
    plt.figure(figsize=(12, 8))
    cnames = sorted(set(ntClasses.values()))
    print("Classes = ")
    pprint(cnames)
    # Order the classes numerically by their lower bound.
    x_order = sorted(cnames, key=lambda x: int(x.split('-')[0]))
    matplotlib.rc("lines", markersize=0, markeredgewidth=0)
    # NOTE(review): factorplot / x_order look like an old seaborn API;
    # confirm against the pinned seaborn version before upgrading.
    g = sns.factorplot(ntxp, "MARDs", "Method", D, kind='box', markers='',
                       x_order=x_order)
    g.set_xticklabels(rotation=30)
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.savefig('{}/GeneStratRD.pdf'.format(outpath))


def _readTgmap(annotPath):
    """Read ``<annotPath>/tgmap.txt`` into a dict: first token -> second."""
    tgmap = {}
    with open('{}/tgmap.txt'.format(annotPath), 'r') as ifile:
        for line in ifile:
            toks = line.split()
            # Ignore blank or single-token lines instead of raising
            # IndexError.
            if len(toks) < 2:
                continue
            tgmap[toks[0]] = toks[1]
    return tgmap


def _binTranscriptCounts(genesPerCount, classSize=1000):
    """Group counts > 1 into consecutive classes of more than classSize genes.

    Counts are visited in ascending order and added to the current class;
    a class is closed once its accumulated gene total exceeds
    ``classSize``.  Only non-empty classes are returned, so the caller
    never takes ``min``/``max`` of an empty set.
    """
    classes = []
    current = set()
    totInClass = 0
    for count in sorted(genesPerCount):
        if count <= 1:
            continue
        current.add(count)
        totInClass += genesPerCount[count]
        if totInClass > classSize:
            classes.append(current)
            current = set()
            totInClass = 0
    if current:
        classes.append(current)
    return classes