Ejemplo n.º 1
0
def makeRelDiffPlot(truthCol, predCol, df, outBase, method, measure):
    """Save a histogram of the relative differences between two columns.

    The relative differences come from ``AnalysisUtils.relDiff(truthCol,
    predCol, df)``; the ``"relDiff"`` column of its first return value is
    plotted with 100 bins and annotated with its mean and mean absolute value.

    Args:
        truthCol: Column name passed to ``AnalysisUtils.relDiff`` as the
            reference column.
        predCol: Column name passed to ``AnalysisUtils.relDiff`` as the
            predicted column.
        df: Data frame passed to ``AnalysisUtils.relDiff``.
        outBase: Output path prefix; the plot is written to
            ``"<outBase>_<measure>_rel_diff.pdf"``.
        method: Unused; kept so existing callers keep working.
        measure: Either ``"tpm"`` or ``"num_reads"``.

    Raises:
        ValueError: If ``measure`` is not one of the supported values.
    """
    # Validate first: previously an unknown measure left the output file name
    # unbound and the function died later with an unrelated NameError.
    if measure not in ("tpm", "num_reads"):
        raise ValueError(
            "unsupported measure {!r}; expected 'tpm' or 'num_reads'".format(measure))
    outFile = "{}_{}_rel_diff.pdf".format(outBase, measure)

    plt.cla()
    plt.clf()

    # Get the relative difference numbers (second return value is not needed)
    rdV, _ = AnalysisUtils.relDiff(truthCol, predCol, df)
    try:
        plt.hist(rdV["relDiff"], bins=100, histtype="stepfilled")

        mdText = "mean rel diff = {0:.2f}\nmean |rel diff| = {1:.2f}".format(
            rdV["relDiff"].mean(), rdV["relDiff"].abs().mean())
        plt.figtext(0.15, 0.85, mdText)

        # Label the axes the histogram was drawn on.  plt.axes() would add a
        # new, empty Axes on current matplotlib instead of returning this one.
        ax = plt.gca()
        ax.set_ylabel("frequency")
        ax.set_xlabel("relative difference")

        # Get rid of axis spines
        sns.despine()
        plt.savefig(outFile)
    except Exception as err:
        # Best effort: leave an empty placeholder at the output path so the
        # file still exists, and report why the plot could not be produced.
        with open(outFile, 'w'):
            pass
        print("Error making relative difference plot {}: {}".format(outFile, err))
Ejemplo n.º 2
0
def rsemRelDiffPlots(D, measure, methods, simIDs, outdir, plotname):
    """Plot, per method, the distribution of mean absolute relative differences.

    For every method and every simulation id, the relative difference between
    the columns ``"<measure>_truth<id>"`` and ``"<measure>_<method><id>"`` of
    ``D`` is obtained from ``AnalysisUtils.relDiff`` and reduced to the mean
    of its absolute ``"relDiff"`` values.  One histogram (with rug marks) per
    method is drawn on a shared axes and saved to
    ``"<outdir>/<plotname>.pdf"``.

    Args:
        D: Data frame holding the truth and per-method columns.
        measure: Column-name prefix (e.g. the quantity being compared).
        methods: Iterable of method names used in the column names.
        simIDs: Iterable of simulation identifiers used in the column names.
        outdir: Directory the PDF is written into.
        plotname: File name of the PDF, without extension.
    """
    sns.set_style('white')
    sns.set_context('poster')
    plt.clf()
    plt.cla()

    # method name -> list of mean |relative difference|, one entry per simulation
    md = {}
    for m in methods:
        md[m] = []
        for i in simIDs:
            rd, _ = AnalysisUtils.relDiff("{}_truth{}".format(measure, i),
                                          "{}_{}{}".format(measure, m, i), D)
            md[m].append(rd['relDiff'].abs().mean())

    # Passing the returned axes back in keeps every method on the same plot.
    ax = None
    for mn, mv in md.items():
        ax = sns.distplot(mv, hist_kws={"histtype": "stepfilled"}, kde=False,
                          rug=True, label=mn, ax=ax, norm_hist=False)
    sns.despine()

    # Label the axes that was drawn on.  plt.axes() would add a new, empty
    # Axes on current matplotlib instead of returning the existing one.
    ax = plt.gca()
    ax.set_xlabel('mean absolute relative difference')
    ax.set_ylabel('frequency')
    plt.legend()
    plt.savefig('{}/{}.pdf'.format(outdir, plotname))
Ejemplo n.º 3
0
def makeTable(methodDict, outpath, outfile, measure, annotPath):
    """Score each method against the truth and write the table as CSV and LaTeX.

    Every entry of ``methodDict`` is loaded with the ``ParsingUtils`` reader
    selected by its (case-insensitive) name, the frames are joined, and for
    each non-truth method the following are computed against the
    ``"<measure>_Truth"`` column: Spearman correlation, mean absolute relative
    difference, proportionality correlation, the fraction of rows with
    truth >= 1 whose absolute percentage error exceeds 10, and the median
    relative error over those rows.

    Args:
        methodDict: Maps method name -> path handed to the matching reader.
            Names must start with SALMON, KALLISTO, EXPRESS or TRUTH, or be
            exactly SAILFISH / SAILFISH (QUASI), ignoring case.
        outpath: Directory passed to ``plotStratifiedDiffs``.
        outfile: Path of the LaTeX table; the CSV goes to ``outfile + ".csv"``.
            If its base name starts with ``"human_"`` the stratified plot is
            produced as well.
        measure: Column-name prefix of the quantity to evaluate.
        annotPath: Annotation directory passed to ``plotStratifiedDiffs``.

    Raises:
        ValueError: If a key of ``methodDict`` matches no known reader.
    """
    import pandas as pd
    import scipy.stats
    # Not referenced below; retained so importing behaviour is unchanged.
    import seaborn as sns
    import ParsingUtils
    import AnalysisUtils

    dframes = []
    for k, v in methodDict.items():
        suffix = '_{}'.format(k)
        name = k.upper()
        if name.startswith('SALMON') or name == 'SAILFISH (QUASI)':
            d = ParsingUtils.readSalmon(v, suffix)
        elif name.startswith('KALLISTO'):
            d = ParsingUtils.readKallisto(v, suffix)
        elif name.startswith('EXPRESS'):
            d = ParsingUtils.readExpress(v, suffix)
        elif name == 'SAILFISH':
            d = ParsingUtils.readSailfish(v, suffix)
        elif name.startswith('TRUTH'):
            d = ParsingUtils.readProFile(v, suffix)
            expFrac = d["ExpFrac{}".format(suffix)]
            d["TPM{}".format(suffix)] = 1000000.0 * (expFrac / expFrac.sum())
            # Flux sim thinks paired-end = 2 reads . . . sigh
            d["NumReads{}".format(suffix)] = d["SeqNum{}".format(suffix)] * 0.5
        else:
            # Previously an unknown name silently re-appended the previous
            # method's frame (or failed on an unbound variable).
            raise ValueError("unrecognized method name {!r}".format(k))

        # Add this dataframe to the list
        dframes.append(d)

    M = dframes[0].join(dframes[1:])

    # Filter eXpress results: use the smallest positive value reported by any
    # other method as the cutoff.  Skipped when eXpress was not supplied.
    if "eXpress" in methodDict:
        minVal = np.inf
        for mn in set(methodDict) - {"Truth", "eXpress"}:
            col = "{}_{}".format(measure, mn)
            newMin = M.loc[M[col] > 0, col].min()
            minVal = min(minVal, newMin)
        print("filtering eXpress results < {} {}".format(minVal, measure))
        AnalysisUtils.filterValues("{}_{}".format(measure, "eXpress"), M, minVal)

    # The organism is the part of the output file's base name before the first "_"
    org = outfile.split('/')[-1].split('_')[0]
    print("org = {}".format(org))
    if org == 'human':
        plotStratifiedDiffs(M, methodDict, annotPath, outpath, measure)

    mrdName = 'abs. mean rel. diff.'
    corrName = 'Spearman corr.'
    propName = 'Proportionality corr.'
    tpefName = 'TP error fraction'
    tpMedErrorName = 'TP median per. error'

    # One column per evaluated method.  The truth is left out here; the old
    # `keys() - set('Truth')` subtracted the *characters* of 'Truth' and so
    # excluded nothing, which then required dropping the column afterwards.
    estimators = [m for m in methodDict if m.upper() != "TRUTH"]
    res = pd.DataFrame(data={m: {tpMedErrorName: np.nan, tpefName: np.nan,
                                 mrdName: np.nan, corrName: np.nan,
                                 propName: np.nan} for m in estimators})

    truthCol = "{}_Truth".format(measure)
    ef = 10.0  # absolute percentage error above which a row counts as an error
    for k in estimators:
        predCol = "{}_{}".format(measure, k)

        # .loc[row, col] writes into `res` itself; chained `res[k][row] = v`
        # may write into a temporary copy and be lost.
        res.loc[corrName, k] = scipy.stats.spearmanr(M[truthCol], M[predCol])[0]

        mrd, _ = AnalysisUtils.relDiff(truthCol, predCol, M)
        res.loc[mrdName, k] = mrd["relDiff"].abs().mean()

        res.loc[propName, k] = AnalysisUtils.proportionalityCorrelation(
            truthCol, predCol, M)

        # Restrict the error statistics to rows whose true value is at least 1
        tpind = M[M[truthCol] >= 1]
        y = tpind[predCol]
        x = tpind[truthCol]
        relErr = (y - x) / x
        absPctErr = 100.0 * (y - x).abs() / x
        if len(absPctErr) > 0:
            tpef = len(absPctErr[absPctErr > ef]) / float(len(absPctErr))
        else:
            # No row qualifies; avoid dividing by zero
            tpef = np.nan
        res.loc[tpefName, k] = tpef
        res.loc[tpMedErrorName, k] = relErr.median()

    print(res)
    res.to_csv(outfile + ".csv")

    with open(outfile, 'w') as ofile:
        ofile.write(res.to_latex(float_format=lambda x: "{0:.2f}".format(x)))
    # NOTE(review): this reports `outpath`, although the table files above are
    # written to `outfile` and `outfile + ".csv"` - confirm which is intended.
    print("wrote {}".format(outpath))
Ejemplo n.º 4
0
def plotStratifiedDiffs(M, methodDict, annotPath, outpath, measure):
    """Box-plot per-gene mean absolute relative differences by isoform count.

    Transcripts are mapped to genes via ``<annotPath>/tgmap.txt`` (whitespace
    separated: transcript name, gene name).  For every gene with a non-zero
    ``NumReads_Truth`` total, the mean absolute relative difference of its
    transcripts is computed for each of the methods Sailfish, Salmon and
    eXpress that is present in ``methodDict``.  Genes with more than one
    transcript are grouped into classes by transcript count and the result is
    saved to ``<outpath>/GeneStratRD.pdf``.

    Relies on the module-level names ``pd``, ``plt``, ``AnalysisUtils`` and
    ``methodcolor``.

    Args:
        M: Data frame indexed by transcript name, holding
            ``"<measure>_Truth"``, ``"<measure>_<method>"`` and
            ``"NumReads_Truth"`` columns.
        methodDict: Mapping whose keys are the method names (including the
            truth); the values are not used here.
        annotPath: Directory containing ``tgmap.txt``.
        outpath: Directory the PDF is written into.
        measure: Column-name prefix of the quantity to compare.
    """
    import seaborn as sns
    import matplotlib
    from pprint import pprint

    # transcript name -> gene name
    tgmap = {}
    with open('{}/tgmap.txt'.format(annotPath), 'r') as ifile:
        for line in ifile:
            toks = line.split()
            if len(toks) < 2:
                # Blank or malformed line; nothing to map
                continue
            tgmap[toks[0]] = toks[1]

    genes = pd.DataFrame(list(tgmap.items()), columns=['Name', 'Gene'])
    genes.set_index('Name', inplace=True)
    M = M.join(genes)

    for mn in methodDict:
        rdiffs, _ = AnalysisUtils.relDiff('{}_Truth'.format(measure),
                                          '{}_{}'.format(measure, mn), M)
        # Store the "relDiff" column itself, as every other caller of relDiff
        # does, rather than assigning the whole returned object to one column.
        M['RelDiff_{}'.format(mn)] = rdiffs['relDiff']

    methods = ["Sailfish", "Salmon", "eXpress"]
    # Methods that are both plotted and actually supplied by the caller
    retained = [mn for mn in methodDict if mn in methods]

    sns.set_palette(sns.color_palette([methodcolor[m] for m in methods]))

    mByGenes = M.groupby('Gene')
    numGroups = mByGenes.ngroups

    # ntmap: transcripts-per-gene -> method -> list of per-gene mean |rel diff|
    # genesPerCount: transcripts-per-gene -> number of expressed genes
    ntmap = {}
    genesPerCount = {}
    numExpressedGenes = 0
    for i, (_, group) in enumerate(mByGenes):
        if i % 10000 == 0:
            print("processing group {} of {}".format(i, numGroups))
        # Skip genes without any true reads
        if group['NumReads_Truth'].sum() == 0:
            continue
        numExpressedGenes += 1
        numTran = len(group)
        if numTran not in ntmap:
            ntmap[numTran] = {mn: [] for mn in retained}
            genesPerCount[numTran] = 0
        genesPerCount[numTran] += 1
        for mn in retained:
            ntmap[numTran][mn].append(
                group['RelDiff_{}'.format(mn)].abs().mean())

    print("There were {} expressed genes (genes with at least 1 expressed isoform)".format(numExpressedGenes))

    # Only multi-transcript genes are stratified.  Counting genes directly
    # (instead of len(vd['Sailfish'])) works whichever methods were supplied.
    multiTxp = {k: n for k, n in genesPerCount.items() if k > 1}
    nt, ntClasses = _nameTranscriptClasses(multiTxp, 1000)
    print(nt)
    pprint(ntClasses)

    data = []
    for k, v in ntmap.items():
        if k > 1:
            cname = ntClasses[k]
            for mn, vals in v.items():
                for x in vals:
                    data.append((cname, mn, x))

    ntxp = '# txp. per gene'
    D = pd.DataFrame(data, columns=[ntxp, 'Method', 'MARDs'])

    sns.set_style('white')
    plt.clf()
    plt.cla()

    plt.figure(figsize=(12, 8))
    cnames = sorted(set(ntClasses.values()))
    print("Classes = ")
    pprint(cnames)
    # Order classes numerically by their lower bound ("10-12" sorts after "9")
    x_order = sorted(cnames, key=lambda x: int(x.split('-')[0]))
    matplotlib.rc("lines", markersize=0, markeredgewidth=0)
    # NOTE(review): factorplot / x_order belong to an old seaborn API; newer
    # releases expose this as catplot(..., order=...).  Left unchanged so the
    # seaborn version this file targets keeps working.
    g = sns.factorplot(ntxp, "MARDs", "Method", D, kind='box', markers='', x_order=x_order)

    g.set_xticklabels(rotation=30)
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.savefig('{}/GeneStratRD.pdf'.format(outpath))


def _nameTranscriptClasses(genesPerCount, classSize):
    """Group transcript counts into classes and give each class a label.

    Counts are visited in ascending order and added to the current class; once
    the genes accumulated in a class exceed ``classSize`` a new class is
    started.  A class with a single member is labelled with that number, any
    other with ``"<min>-<max>"``.

    Args:
        genesPerCount: Maps transcripts-per-gene -> number of genes.
        classSize: Number of genes a class must exceed before it is closed.

    Returns:
        A pair ``(classes, labels)``: the list of non-empty sets of counts,
        and a dict mapping every count to the label of its class.
    """
    classes = [set()]
    totInClass = 0
    for numTran in sorted(genesPerCount):
        classes[-1].add(numTran)
        totInClass += genesPerCount[numTran]
        if totInClass > classSize:
            totInClass = 0
            classes.append(set())

    # A class opened after the last count (or when there were no counts at
    # all) stays empty; drop it, since min()/max() of an empty set would raise.
    classes = [c for c in classes if c]

    labels = {}
    for members in classes:
        lo = min(members)
        hi = max(members)
        name = str(lo) if lo == hi else str(lo) + "-" + str(hi)
        for numTran in members:
            labels[numTran] = name
    return classes, labels