Example #1
0
    def write_feature_summary(self, f, Yi, i):
        """Append one formatted summary row for feature *f* to self.gene_summary.

        f  : feature object; only f.name is read.
        Yi : sequence of observed values for this feature.
        i  : index into the fitted-model output tables in self.M.out.

        Side effects: sets self.f_name, self.yLen, self.yObs, self.yRate,
        self.yR, self.yMean, self.rMean and writes one line to
        self.gene_summary.
        """
        self.f_name = f.name

        # Observation stats: length, positive count, positive rate, and the
        # covariate residuals for this feature.
        self.yLen = len(Yi)
        self.yObs = len([y for y in Yi if y > 0])
        self.yRate = self.yObs / float(self.yLen)
        self.yR = self.M.out['covariate_resids'][i]

        # Dispersion: coefficient of variation of the raw values, plus
        # standard deviations of the covariate / full-model residuals.
        # (The original also ran coVar() on both residual vectors and an
        # unused np.std(Yi); those results were immediately overwritten or
        # never read, so the dead computations are removed here.)
        cvY = coVar(Yi)
        cvR = np.std(self.yR)
        cvTR = np.std(self.M.out['resids'][i])

        self.yMean, self.rMean = np.mean(Yi), np.mean(self.yR)

        # Best (lowest p-value) non-intercept predictor for this feature.
        pv, bw, topN, PRED = sorted(
            [x for x in self.M.out['params'][i] if x[2] != 'intercept'])[0]

        self.gene_summary.write(
            '%-50s %8s %8s %8.2f %8.2f %8.2f %8.2f %8.2f %8.2f ' %
            (f.name, self.yLen, self.yObs, self.yRate, self.yMean, self.rMean,
             cvY, cvR, cvTR))
        self.gene_summary.write('%6.2f %7.3f %7.3f %30s %10.2e\n' %
                                (self.M.out['bic'][i], self.M.out['rsq'][i],
                                 self.M.out['rsa'][i], topN, pv))
    def set_stat(self, n, n_all):
        """Summarize measurement *n* across recordings and store the results.

        n     : measurement name (e.g. 'SPIKES'); used to prefix output keys.
        n_all : per-recording values, with 'NA' marking missing entries.

        Reads self.amps (per-recording stimulus amplitudes — presumably
        aligned index-for-index with n_all; confirm against caller),
        self.len, and self.eval_change().  Fills self.key and
        self.categorical and returns self for chaining.
        """
        key, self.key, self.categorical = {}, {}, {}
        # Drop 'NA' entries, keeping the amplitude list aligned with the data.
        n_data, n_amps = [x for x in n_all if x != 'NA'], [
            self.amps[j] for j in range(len(n_all)) if n_all[j] != 'NA'
        ]
        # first_idx: position of the first real, non-zero value in n_all;
        # the appended 'NA' acts as a sentinel when no such value exists.
        nLen, first_idx, min_idx, max_idx = len(n_data), list(
            [i for i in range(len(n_all)) if n_all[i] not in ['NA', 0.0]] +
            ['NA'])[0], [], []

        self.categorical[n + '-trend'] = self.eval_change(n, n_data)[0]

        if len(n_data) > 0:
            # Location/scale statistics over the observed values.
            nMin, nMax, nMed, nMean, nTotal = min(n_data), max(n_data), round(
                np.median(n_data), 3), round(np.mean(n_data),
                                             3), round(sum(n_data), 2)
            # Every position achieving the max / min value (ties included).
            max_idxs, min_idxs = [
                i for i in range(len(n_data)) if n_data[i] == nMax
            ], [i for i in range(len(n_data)) if n_data[i] == nMin]
            for a, b in zip([nMin, nMax, nMed, nMean, nTotal],
                            ['min', 'max', 'med', 'avg', 'tot']):
                key[b] = a

            if first_idx != 'NA':
                # Amplitude and relative progress ((1-based position)/length)
                # of the first, minimal, and maximal responses.  'rheo'
                # naming suggests rheobase semantics — confirm.
                first_amp, first_p = self.amps[first_idx], (first_idx +
                                                            1.0) / (self.len)
                max_amp, max_p = np.median(
                    [n_amps[j]
                     for j in max_idxs]), (np.median(max_idxs) + 1.0) / nLen
                min_amp, min_p = np.median(
                    [n_amps[j]
                     for j in min_idxs]), (np.median(min_idxs) + 1.0) / nLen
                for a, b in zip(
                    [first_amp, min_amp, max_amp, first_p, min_p, max_p], [
                        'rheoAmp', 'minAmp', 'maxAmp', 'rheoProg', 'minProg',
                        'maxProg'
                    ]):
                    key[b] = a

        # Dispersion and trend need at least 3 points with non-zero spread.
        if len(n_data) > 2 and nMin != nMax:
            key['cv'] = round(coVar(n_data), 2)
            nR, nPV = stats.pearsonr(n_data, range(len(n_data)))
            # Correlation stored as an integer percentage, zeroed when the
            # Pearson p-value is not significant at 0.05.
            if nPV < 0.05: key['corr'] = int(100 * round(nR, 3))
            else: key['corr'] = 0

        # Publish a fixed set of fields (fuller set for SPIKES), using 'NA'
        # for statistics that could not be computed above.
        if n == 'SPIKES':
            for x in [
                    'min', 'max', 'med', 'avg', 'corr', 'cv', 'tot', 'rheoAmp',
                    'minAmp', 'maxAmp', 'rheoProg', 'minProg', 'maxProg'
            ]:
                if x in key: self.key[n + '-' + x] = key[x]
                else: self.key[n + '-' + x] = 'NA'
        else:
            for x in [
                    'min', 'max', 'med', 'avg', 'tot', 'corr'
            ]:  #'rheoAmp','minAmp','maxAmp','rheoProg','minProg','maxProg']:
                if x in key: self.key[n + '-' + x] = key[x]
                else: self.key[n + '-' + x] = 'NA'

        return self
Example #3
0
def list_stats(L):
    """Return (n, n_positive, mean, positive_rate, std, cv) for sequence *L*.

    n is the length as an int, n_positive the count of values > 0; the
    mean, positive rate, standard deviation, and coefficient of variation
    are each rounded to 3 decimal places.
    """
    total = sum(L)
    count = float(len(L))
    n_pos = len([v for v in L if v > 0])
    mean_val = total / count
    pos_rate = n_pos / count
    return (int(count), n_pos, round(mean_val, 3), round(pos_rate, 3),
            round(np.std(L), 3), round(coVar(L), 3))
    def summarize_sample_stats(self):
        """Compute per-sample summary statistics, plot them as a 3x2 grid of
        histograms ('sample_summary.png'), and write a column-stats table
        ('sample_stats.out').

        Reads self.input.samples / self.input.features; each sample exposes
        .cnts (a dict of counts) and .cnt_total.  Progress is reported via
        self.progress.
        """

        seaborn.set(rc={
            'axes.facecolor': 'pink',
            'figure.facecolor': 'lightgray'
        })
        self.progress.start_subtopic('Calculating Summary Stats', '',
                                     self.sLen)
        res = dd(lambda: {})
        subplot = rage_subplots.subplot(3, 2, self.args)
        for s in self.input.samples:
            self.progress.mark_subtopic()
            # Log-transformed counts, largest first.
            ordered_logs = sorted([log(1.0 + c) for c in s.cnts.values()],
                                  reverse=True)
            res['#Reads'][s] = s.cnt_total
            # halfE: half the total log-space depth; k walks down the sorted
            # log counts until that half is reached.
            halfE, iX, k = sum(ordered_logs) * 0.5, 0, -1
            res['#Observed_Genes'][s] = len(ordered_logs)
            # Fraction of genes above the sample's mean log count.
            # NOTE(review): np.mean(ordered_logs) is recomputed per element.
            res['#Genes_Above_Mean'][s] = len([
                x for x in ordered_logs if x > np.mean(ordered_logs)
            ]) / float(len(ordered_logs))
            while iX < halfE:
                k += 1
                iX += ordered_logs[k]
            res['%Genes_Required_For_HalfDepth'][s] = k / float(
                len(ordered_logs))
            res['CoeffVar'][s] = coVar(ordered_logs)
            res['#topVals'][s] = 0

        # Per sample, count how often it holds one of the five largest
        # counts of any feature (f.cnts presumably maps sample index ->
        # count — confirm against the data model).
        for f in self.input.features:
            for a, b in sorted([(b, a) for (a, b) in f.cnts.items()])[-5::]:
                res['#topVals'][self.input.samples[b]] += 1

        subplot.add_hist(res['#Reads'].values()).update({
            'xlab': 'reads per sample',
            'ylab': 'occurences',
            'title': 'Depth'
        })
        subplot.add_hist(res['#Observed_Genes'].values()).update({
            'xlab':
            'genes per sample',
            'ylab':
            'occurences',
            'title':
            'Library Complexity'
        })
        subplot.add_hist(res['#Genes_Above_Mean'].values()).update({
            'xlab':
            '%',
            'ylab':
            'occurences',
            'title':
            '% genes above mean'
        })
        subplot.add_hist(res['%Genes_Required_For_HalfDepth'].values()).update(
            {
                'xlab': '%Obs Genes',
                'ylab': 'occurences',
                'title': '% Genes Required For 50% Read Depth (Log Space)'
            })
        subplot.add_hist(res['CoeffVar'].values()).update({
            'xlab':
            'CV',
            'ylab':
            'occurences',
            'title':
            'Coefficient of Variation Across Genes (Log Space)'
        })
        subplot.add_hist(res['#topVals'].values()).update({
            'xlab':
            'TopVals',
            'ylab':
            'occurences',
            'title':
            'Number of maximal values (top5)'
        })

        plt.subplots_adjust(left=0.05,
                            bottom=0.05,
                            right=0.95,
                            top=0.90,
                            wspace=0.1,
                            hspace=0.40)
        subplot.save('sample_summary.png',
                     {'title': 'Sample Summary Histograms'})

        rage_outputs.column_stats(self.args).write(res, self.input.samples, {
            'suffix': 'sample_stats.out',
            'width': 20
        })

        self.progress.finish_subtopic()
Example #5
0
    def predict_known_ratio_values(self):
        """Re-estimate every count from housekeeping-gene ratios and compare
        the predictions against the observed totals.

        For each sample, each observed feature count is inferred from the
        housekeeping features' counts via the pairwise ratios in self.r_key
        (keyed on ordered index pairs).  Per-feature and per-sample
        comparison tables are written to
        <prefix>_summarize_featureRatios.out and
        <prefix>_summarize_sampleRatios.out.

        Returns self for chaining.
        """
        f_pred, s_pred = dd(lambda: dd(float)), dd(lambda: dd(float))
        f_totals = [sum(f.cnts.values()) for f in self.D.features]
        f_log_totals = [log(ft) for ft in f_totals]
        s_totals = [sum(s.cnts.values()) for s in self.D.samples]
        s_log_totals = [log(st) for st in s_totals]
        for si, s in enumerate(self.D.samples):
            s_contained = [i for i in s.cnts]
            s_housekeeping = [i for i in s.cnts if i in self.HOUSEKEEPING]
            for m in s_contained:
                # Infer m from every housekeeping feature whose ratio with m
                # is known; r_key pairs are stored with the smaller index
                # first, hence the i < m / i > m cases.
                m_infer = []
                for i in s_housekeeping:
                    if i == m: continue
                    try:
                        if i < m:
                            m_infer.append(s.cnts[i] / self.r_key[(i, m)])
                        if i > m:
                            m_infer.append(s.cnts[i] * self.r_key[(m, i)])
                    except KeyError:
                        continue
                # Fall back to 0 when no ratio was available.
                if len(m_infer) == 0: infer_val = 0
                else: infer_val = np.mean(m_infer)
                f_pred[m][s.idx] = infer_val
                s_pred[s.idx][m] = infer_val

        # ---- per-feature comparison table --------------------------------
        wf = open(self.args.prefix + '_summarize_featureRatios.out', 'w')
        wf.write('%-40s %15s %15s %6s %18s %10s %10s %10s %10s\n' %
                 ('---', 'total_reads', 'total_obs', 'cv', 'predicted_total',
                  'perc_diff', 'R-depth', 'R-pred', 'R-log-pred'))
        for fi, f in enumerate(self.D.features):

            f_key, f_name = f.cnts.keys(), f.name
            f_true, f_log_true = [f.cnts[k] for k in f_key
                                  ], [log(f.cnts[k]) for k in f_key]
            f_predicted = [f_pred[fi][k] for k in f_key]
            f_predicted_total = sum(f_predicted)
            f_cv = coVar(f_true)
            p_diff = perc_diff(f_predicted_total, f_totals[fi])
            fs_log_totals = [s_log_totals[k] for k in f_key]
            fs_totals = [s_totals[k] for k in f_key]
            # Correlations: sample depth vs truth, prediction vs truth, and
            # log-prediction vs log-truth.
            fRT = stats.pearsonr(fs_log_totals, f_log_true)[0]
            fRP = stats.pearsonr(f_predicted, f_true)[0]
            # NOTE(review): log(x) raises ValueError when a predicted value
            # is 0 (the no-ratio fallback above) — confirm inputs.
            fRLP = stats.pearsonr([log(x) for x in f_predicted], f_log_true)[0]
            wf.write(
                '%-40s %15d %15d %6.2f %18.1f %10.2f %10.2f %10.2f %10.2f\n' %
                (f.name, f_totals[fi], len(f.cnts), coVar(f_true),
                 f_predicted_total, p_diff, fRT, fRP, fRLP))
        wf.close()  # BUGFIX: handle was previously left open

        # ---- per-sample comparison table ---------------------------------
        ws = open(self.args.prefix + '_summarize_sampleRatios.out', 'w')
        ws.write('%-40s %15s %15s %6s %18s %10s %10s %10s %10s\n' %
                 ('---', 'total_reads', 'total_obs', 'cv', 'predicted_total',
                  'perc_diff', 'R-depth', 'R-pred', 'R-log-pred'))
        for si, s in enumerate(self.D.samples):

            s_key, s_name = s.cnts.keys(), s.name
            s_true, s_log_true = [s.cnts[k] for k in s_key
                                  ], [log(s.cnts[k]) for k in s_key]
            s_predicted = [s_pred[si][k] for k in s_key]
            s_predicted_total = sum(s_predicted)
            s_cv = coVar(s_true)
            p_diff = perc_diff(s_predicted_total, s_totals[si])
            fs_log_totals = [f_log_totals[k] for k in s_key]
            fs_totals = [f_totals[k] for k in s_key]
            fRT, fRP, fRLP = stats.pearsonr(
                fs_log_totals, s_log_true)[0], stats.pearsonr(
                    s_predicted,
                    s_true)[0], stats.pearsonr([log(x) for x in s_predicted],
                                               s_log_true)[0]
            ws.write(
                '%-40s %15d %15d %6.2f %18.1f %10.2f %10.2f %10.2f %10.2f\n' %
                (s.name, s_totals[si], len(s.cnts), coVar(s_true),
                 s_predicted_total, p_diff, fRT, fRP, fRLP))
        ws.close()  # BUGFIX: handle was previously left open
        self.progress.end()
        return self
Example #6
0
def summary_hists(X, Y, options, progress, X_NAME='SAMPLES'):
    """Plot a 3x2 grid of summary histograms for the members of X.

    X : iterable of samples or features (X.label picks the output filename);
        each member exposes .cnts (a dict of counts) and .cnt_total.
    Y : the complementary axis (features when X is samples, and vice
        versa); each member exposes .len.
    options  : run options; options.prefix names the output file.
    progress : progress reporter; progress.mark() is called per member.
    X_NAME   : unused; kept for interface compatibility.

    Returns the per-member statistics dict `res`.
    """
    seaborn.set(rc={
        'axes.facecolor': 'lightcyan',
        'figure.facecolor': 'whitesmoke'
    })
    # (The unused `p_res` accumulator from the original was removed.)
    res = dd(lambda: {})
    subplot = rage_subplots.subplot(3, 2, options)
    cMax = float(sum([y.len for y in Y]))
    for x in X:
        progress.mark()
        # Zero-pad for the Y-members that x has no counts for.
        xMissed = [0 for i in range(len(Y) - len(x.cnts))]
        # BUGFIX: iterate the count *values*, not the dict keys, so the
        # percentile statistics below are over counts rather than indices.
        x_raw = [c for c in x.cnts.values()] + xMissed
        res['#CompIndex'][x] = sum([Y[y].len for y in x.cnts.keys()]) / cMax
        ordered_logs = sorted([log(1.0 + c) for c in x.cnts.values()],
                              reverse=True)

        # Fraction of observations above the log-space mean; members with
        # no observations at all are skipped entirely.  (The original had a
        # second, unreachable copy of this block that would have clobbered
        # the whole res['#Obs_gtAvg'] dict; it was removed.)
        try:
            log_mean = np.mean(ordered_logs)
            res['#Obs_gtAvg'][x] = len(
                [l for l in ordered_logs if l > log_mean]) / float(
                    len(ordered_logs))
        except ZeroDivisionError:
            continue

        # halfE: half the total log depth; k walks the sorted log counts
        # until that half is reached.
        halfE, iX, k = sum(ordered_logs) * 0.5, 0, -1
        res['total'][x] = x.cnt_total
        res['observations'][x] = len(x.cnts)
        res['Qrt-75'][x] = np.percentile(x_raw, 75)
        res['Perc-90'][x] = round(np.percentile(x_raw, 90), 4)
        res['Perc-95'][x] = np.percentile(x_raw, 95)
        res['Perc-99'][x] = np.percentile(x_raw, 99)

        while iX < halfE:
            k += 1
            iX += ordered_logs[k]
        res['%Obs_HDepth'][x] = k / float(len(x.cnts) + len(xMissed))
        res['CoeffVar'][x] = coVar(ordered_logs)

    subplot.add_hist(res['total']).update({
        'xlab': 'log(reads)',
        'ylab': 'occurences',
        'title': 'Total Depth'
    })

    if X.label == 'samples':
        subplot.add_hist(res['observations']).update({
            'xlab': 'observations',
            'ylab': 'occurences',
            'title': 'Library Diversity (Genes)'
        })
    else:
        subplot.add_hist(res['observations']).update({
            'xlab': 'observations',
            'ylab': 'occurences',
            'title': 'Library Diversity (Samples)'
        })
    subplot.add_hist(res['Qrt-75']).update({
        'xlab': 'cnts',
        'ylab': 'occurences',
        'title': 'Upper Quartile'
    })

    subplot.add_hist(res['CoeffVar']).update({
        'xlab': 'CV',
        'ylab': 'occurences',
        'title': 'Coefficient of Variation (Log Space)'
    })
    subplot.add_hist(res['%Obs_HDepth']).update({
        'xlab': '%Obs',
        'ylab': 'occurences',
        'title': '% Obs For 50% Read Depth (Log Space)'
    })
    subplot.add_hist(res['#CompIndex']).update({
        'xlab': '%Comparisons',
        'ylab': 'occurences',
        'title': 'Comparison Index'
    })
    plt.subplots_adjust(left=0.05,
                        bottom=0.05,
                        right=0.95,
                        top=0.90,
                        wspace=0.1,
                        hspace=0.40)
    if X.label == 'samples':
        subplot.save(options.prefix + '_sample_summary.png',
                     {'title': 'Sample Summary Histograms'})
    elif X.label == 'features':
        subplot.save(options.prefix + '_feature_summary.png',
                     {'title': 'Feature Summary Histograms'})
    return res
Example #7
0
def summary_trends(X, Y, options, progress, X_NAME='SAMPLES'):
    """Scatter-plot pairwise trends (with Pearson R) between per-member
    statistics of X, saved as a 3x3 grid of panels.

    X : iterable of samples or features (X.label picks the output name);
        each member exposes .cnts, a dict of counts.
    Y : the complementary axis; only its length is used for zero-padding.
    options  : run options; options.prefix names the output file.
    progress : unused here; kept for interface compatibility.
    X_NAME   : unused; kept for interface compatibility.
    """
    seaborn.set(rc={
        'axes.facecolor': 'lightcyan',
        'figure.facecolor': 'whitesmoke'
    })
    # (Unused locals from the original — res, zC, stds, and the never-read
    # trends[('max', 'log_total')] entry — were removed.)
    subplot = rage_subplots.subplot(3, 3, options, {'titlepos': [0.0, 1.05]})
    qts, maxV, obsR, means = [], [], [], []
    totals, log_totals, cnt_means = [], [], []
    trends = {}
    cvs = []
    vsx = []
    for x in X:
        # Zero-pad for the Y-members x has no counts for.
        # NOTE(review): dict.values() + list requires Python 2 (the file
        # also uses the Python 2 print statement elsewhere).
        x_all = x.cnts.values() + [0 for s in range(len(Y) - len(x.cnts))]

        x_mean = np.mean(x.cnts.values())

        # NOTE(review): log(..., 2) raises ValueError if a member has no
        # counts at all (max/mean of zero) — confirm inputs are non-empty.
        qts.append(np.percentile(x_all, 95))
        maxV.append(log(max(x_all), 2))
        means.append(log(np.mean(x_all), 2))
        obsR.append(len(x.cnts) / float(len(Y)))

        totals.append(sum(x_all))
        log_totals.append(log(sum(x_all) + 1.0, 2))
        cnt_means.append(log(x_mean, 2))

        vsx.append(log(np.var(x_all), 2))
        cvs.append(coVar(x_all))

    # Observation rate against total depth, mean count, and max count.
    trends[('observations', 'log_total')] = stats.pearsonr(log_totals, obsR)
    trends[('observations', 'cnt_mean')] = stats.pearsonr(cnt_means, obsR)
    trends[('observations', 'max')] = stats.pearsonr(maxV, obsR)
    subplot.add_scatter_trend(
        obsR, log_totals,
        R=trends[('observations', 'log_total')][0]).update({
            'title': 'Observations vs Total',
            'xlab': 'observation rate',
            'ylab': 'total (logs)'
        })
    subplot.add_scatter_trend(
        obsR, cnt_means,
        R=trends[('observations', 'cnt_mean')][0]).update({
            'title': 'Observations vs Cnt Mean',
            'xlab': 'observation rate',
            'ylab': 'Cnt Mean (logs)'
        })
    subplot.add_scatter_trend(obsR, maxV,
                              R=trends[('observations', 'max')][0]).update({
                                  'title': 'Observations vs Max',
                                  'xlab': 'observation rate',
                                  'ylab': 'Max (logs)'
                              })

    # Classic mean-variance and mean-CV relationships.
    trends[('mean', 'var')] = stats.pearsonr(means, vsx)
    subplot.add_scatter_trend(means, vsx,
                              R=trends[('mean', 'var')][0]).update({
                                  'title': 'Mean vs Variance',
                                  'xlab': 'Mean (log)',
                                  'ylab': 'Variance'
                              })

    trends[('mean', 'cv')] = stats.pearsonr(means, cvs)
    subplot.add_scatter_trend(means, cvs, R=trends[('mean', 'cv')][0]).update({
        'title': 'Mean vs CV',
        'xlab': 'Mean (log)',
        'ylab': 'CV'
    })

    subplot.add_scatter_trend(qts, totals).update({
        'title': 'Upper Quartile vs Total',
        'xlab': 'Upper Quartile',
        'ylab': 'total (logs)'
    })
    subplot.add_scatter_trend(means, maxV).update({
        'title': 'Mean vs Max',
        'xlab': 'Mean',
        'ylab': 'Max (logs)'
    })

    plt.subplots_adjust(left=0.05,
                        bottom=0.04,
                        right=0.95,
                        top=0.90,
                        wspace=0.25,
                        hspace=0.5)
    if X.label == 'samples':
        subplot.save(options.prefix + '_sample_trends.png',
                     {'title': 'Sample Trends'})
    elif X.label == 'features':
        subplot.save(options.prefix + '_feature_trends.png',
                     {'title': 'Feature Trends'})

    return
Example #8
0
    def write(self, D, M, Mp=None, suffix='dex'):
        """Write a differential-expression results table for model M (with a
        parallel model Mp) over dataset D.

        D  : dataset exposing .features and .samples.
        M  : fitted model collection — M.reg holds per-feature fits,
             M.X the design matrix metadata.
        Mp : second model collection, indexed in parallel with M
             (presumably a permuted/alternate fit — confirm with caller).
        suffix : tag used in the output filename.

        NOTE(review): the output handle `w` is never closed in this span;
        confirm whether the method continues past this view.
        """

        w = open(
            self.options.prefix + '_' + suffix + '_' +
            "_".join(self.options.predictors) + '_covar' +
            "-".join(self.options.covariates) + '.out', 'w')

        # Sanity check: exactly one regression per feature.
        # NOTE(review): Python 2 print statement — file-wide port needed
        # before this runs under Python 3.
        if len(M.reg) != len(D.features):
            print 'bad'
            sys.exit()

        # Predictor column names in both models.
        Mpreds = [M.X.names[i] for i in M.X.predictor_idx]
        Ppreds = [Mp.X.names[i] for i in Mp.X.predictor_idx]

        # Top-level predictor variables (excluding intercept) and their
        # expanded child columns (e.g. dummy-coded levels).
        parents = [
            p for p in M.X.parents if M.X.PREDICTOR[p] and p != 'intercept'
        ]
        children = [M.X.names[i] for i in M.X.predictor_idx]

        self.seg,self.fracs,self.segMM,self.segLens,self.mKey,self.pKey = {} ,{}, {} ,{},{},{}
        for p in parents:
            # Map each child column name to its design-matrix index, for
            # both M and Mp.
            self.mKey[p] = {
                M.X.names[i]: i
                for i in M.X.predictor_idx if M.X.names[i] in M.X.children[p]
            }
            self.pKey[p] = {
                Mp.X.names[i]: i
                for i in Mp.X.predictor_idx
                if Mp.X.names[i] in Mp.X.children[p]
            }

            # Sample indices grouped by their value of p, plus each group's
            # fraction of all samples.
            self.seg[p], self.segLens[p] = D.samples.segregate(p)
            self.fracs[p] = {
                g: float(v) / sum(self.segLens[p].values())
                for g, v in self.segLens[p].items()
            }
#			self.segMM[p] = min(seg_lens.values()),max(seg_lens.values())

        w.write(
            '--- RS CV obs len parent maxG maxMeans maxObs maxChi maxP | params\n'
        )
        for zp, zm, f in zip(Mp.reg, M.reg, D.features):

            fObs = len(f.cnts)
            cnts = [f.cnts[i] for i in range(len(D.samples))]

            # Row prefix: feature name, R-squared, coefficient of variation.
            LS = [f.name, round(zm.model.rsquared, 3), round(coVar(cnts), 3)]

            for p in parents:

                # Group with the highest mean count for this feature, and
                # its fraction of nonzero observations.
                maxC, maxP = sorted([(np.mean([f.cnts[i] for i in grp]), k)
                                     for k, grp in self.seg[p].items()])[-1]
                maxObs = len([
                    x for x in [f.cnts[i] for i in self.seg[p][maxP]] if x > 0
                ]) / float(self.segLens[p][maxP])
                # All (count, group) pairs across groups, largest first.
                p_srt = sorted([
                    a for b in [[(f.cnts[i], k) for i in grp]
                                for k, grp in self.seg[p].items()] for a in b
                ],
                               reverse=True)

                # Need at least 5 nonzero observations to run the test.
                if len([x for x in p_srt if x[0] > 0]) < 5:
                    continue

                else:
                    # Take the top-|max group| counts (or just the nonzero
                    # ones if the cutoff lands inside the zeros) and
                    # chi-square test whether the max group is
                    # over-represented among them.
                    if p_srt[self.segLens[p][maxP] - 1][0] == 0:
                        p_split = [ps for ps in p_srt if ps[0] > 0]
                    else:
                        p_split = p_srt[0:self.segLens[p][maxP]]
                    maxL, maxN, maxF, maxFn = len([
                        ps[1] for ps in p_split if ps[1] == maxP
                    ]), len([ps[1] for ps in p_split if ps[1] != maxP
                             ]), self.fracs[p][maxP], 1 - self.fracs[p][maxP]
                    chiVal = chisquare(
                        [maxL, maxN],
                        f_exp=[len(p_split) * maxF,
                               len(p_split) * maxFn])[1]

                # Model p-value for the max group's coefficient; when that
                # group has no coefficient of its own (KeyError — e.g. the
                # reference level), fall back to the mean p-value across the
                # parent's coefficients (note the narrower number formats).
                try:
                    line_data = LS + [
                        fObs,
                        len(cnts), p, maxP,
                        round(maxC, 3),
                        round(maxObs, 3),
                        '%2.2e' % chiVal,
                        '%2.2e' % zm.model.pvalues[self.mKey[p][maxP]]
                    ]
                except KeyError:
                    line_data = LS + [
                        fObs,
                        len(cnts), p, maxP,
                        round(maxC, 2),
                        round(maxObs, 3),
                        '%1.1e' % chiVal,
                        '%1.1e' % np.mean([
                            zm.model.pvalues[zz]
                            for zz in self.mKey[p].values()
                        ])
                    ]

                # Per-child effect sign and p-values from both models.
                for x in self.pKey[p].keys():
                    line_data.extend([
                        '|', x, (zm.model.params[self.mKey[p][x]] > 0),
                        '%2.2e' % zm.model.pvalues[self.mKey[p][x]],
                        '%2.2e' % zp.model.pvalues[self.pKey[p][x]]
                    ])
                w.write(" ".join([str(xx) for xx in line_data]) + '\n')