def write_feature_summary(self, f, Yi, i):
    """Write one per-feature summary row to self.gene_summary.

    f  : feature object (only f.name is used)
    Yi : counts for this feature across samples
    i  : feature index into the fitted model outputs in self.M.out
    """
    self.f_name = f.name
    # Number of samples, number with a positive count, and observation rate.
    n_obs = len([y for y in Yi if y > 0])
    self.yLen = len(Yi)
    self.yObs = n_obs
    self.yRate = n_obs / float(len(Yi))
    self.yR = self.M.out['covariate_resids'][i]
    # cvY is the coefficient of variation of the raw counts; the two residual
    # spreads are plain standard deviations.  (The original also computed
    # coVar() for the residuals and np.std(Yi), but those values were
    # immediately overwritten / never used — removed.)
    cvY = coVar(Yi)
    cvR = np.std(self.yR)
    cvTR = np.std(self.M.out['resids'][i])
    self.yMean, self.rMean = np.mean(Yi), np.mean(self.yR)
    # Most significant non-intercept model parameter: tuples sort on their
    # first element (the p-value), so [0] is the smallest p.
    pv, bw, topN, PRED = sorted(
        [x for x in self.M.out['params'][i] if x[2] != 'intercept'])[0]
    self.gene_summary.write(
        '%-50s %8s %8s %8.2f %8.2f %8.2f %8.2f %8.2f %8.2f ' %
        (f.name, self.yLen, self.yObs, self.yRate, self.yMean, self.rMean,
         cvY, cvR, cvTR))
    self.gene_summary.write('%6.2f %7.3f %7.3f %30s %10.2e\n' %
                            (self.M.out['bic'][i], self.M.out['rsq'][i],
                             self.M.out['rsa'][i], topN, pv))
def set_stat(self, n, n_all):
    """Compute summary statistics for measurement series `n`.

    n     : series name (e.g. 'SPIKES'); used as the key prefix
    n_all : values for the series, with 'NA' marking missing entries

    Populates self.key with '<n>-<stat>' entries and self.categorical with
    a '<n>-trend' entry; stats that could not be computed are set to 'NA'.
    Returns self.
    """
    # `key` holds the raw computed stats; self.key receives the prefixed
    # subset at the end (both are reset on every call).
    key, self.key, self.categorical = {}, {}, {}
    # Drop 'NA' entries, keeping self.amps aligned with the kept values.
    n_data, n_amps = [x for x in n_all if x != 'NA'], [
        self.amps[j] for j in range(len(n_all)) if n_all[j] != 'NA'
    ]
    # first_idx: first index holding a real, non-zero value; the appended
    # 'NA' acts as a default when no such index exists.
    nLen, first_idx, min_idx, max_idx = len(n_data), list(
        [i for i in range(len(n_all)) if n_all[i] not in ['NA', 0.0]] +
        ['NA'])[0], [], []
    self.categorical[n + '-trend'] = self.eval_change(n, n_data)[0]
    if len(n_data) > 0:
        nMin, nMax, nMed, nMean, nTotal = min(n_data), max(n_data), round(
            np.median(n_data), 3), round(np.mean(n_data),
                                         3), round(sum(n_data), 2)
        # All positions attaining the extreme values.
        max_idxs, min_idxs = [
            i for i in range(len(n_data)) if n_data[i] == nMax
        ], [i for i in range(len(n_data)) if n_data[i] == nMin]
        for a, b in zip([nMin, nMax, nMed, nMean, nTotal],
                        ['min', 'max', 'med', 'avg', 'tot']):
            key[b] = a
        if first_idx != 'NA':
            # Amplitude at the first response ('rheo') plus its relative
            # position; likewise for the median min/max positions.
            first_amp, first_p = self.amps[first_idx], (first_idx +
                                                        1.0) / (self.len)
            max_amp, max_p = np.median(
                [n_amps[j] for j in max_idxs]), (np.median(max_idxs) +
                                                 1.0) / nLen
            min_amp, min_p = np.median(
                [n_amps[j] for j in min_idxs]), (np.median(min_idxs) +
                                                 1.0) / nLen
            for a, b in zip(
                [first_amp, min_amp, max_amp, first_p, min_p, max_p], [
                    'rheoAmp', 'minAmp', 'maxAmp', 'rheoProg', 'minProg',
                    'maxProg'
                ]):
                key[b] = a
        if len(n_data) > 2 and nMin != nMax:
            key['cv'] = round(coVar(n_data), 2)
            # Trend against position; only report the correlation when it is
            # significant (p < 0.05), otherwise record 0.
            nR, nPV = stats.pearsonr(n_data, range(len(n_data)))
            if nPV < 0.05:
                key['corr'] = int(100 * round(nR, 3))
            else:
                key['corr'] = 0
    if n == 'SPIKES':
        # Spike series report the full stat set, including the
        # amplitude/progression stats.
        for x in [
                'min', 'max', 'med', 'avg', 'corr', 'cv', 'tot', 'rheoAmp',
                'minAmp', 'maxAmp', 'rheoProg', 'minProg', 'maxProg'
        ]:
            if x in key:
                self.key[n + '-' + x] = key[x]
            else:
                self.key[n + '-' + x] = 'NA'
    else:
        for x in [
                'min', 'max', 'med', 'avg', 'tot', 'corr'
        ]:  #'rheoAmp','minAmp','maxAmp','rheoProg','minProg','maxProg']:
            if x in key:
                self.key[n + '-' + x] = key[x]
            else:
                self.key[n + '-' + x] = 'NA'
    return self
def list_stats(L):
    """Summary statistics for a numeric list.

    Returns a 6-tuple: (length, #positive, mean, positive-rate, std, CV),
    with the four float values rounded to 3 decimal places.
    """
    count = float(len(L))
    positives = len([v for v in L if v > 0])
    mean_val = sum(L) / count
    obs_rate = positives / count
    return (int(count), positives, round(mean_val, 3), round(obs_rate, 3),
            round(np.std(L), 3), round(coVar(L), 3))
def summarize_sample_stats(self):
    """Compute per-sample summary statistics, plot them, and write a table.

    Produces 'sample_summary.png' (a 3x2 histogram grid) and, via
    rage_outputs.column_stats, a table with suffix 'sample_stats.out'.
    """
    seaborn.set(rc={
        'axes.facecolor': 'pink',
        'figure.facecolor': 'lightgray'
    })
    self.progress.start_subtopic('Calculating Summary Stats', '', self.sLen)
    # res: stat-name -> {sample: value}
    res = dd(lambda: {})
    subplot = rage_subplots.subplot(3, 2, self.args)
    for s in self.input.samples:
        self.progress.mark_subtopic()
        # Per-sample log counts, largest first.
        ordered_logs = sorted([log(1.0 + c) for c in s.cnts.values()],
                              reverse=True)
        res['#Reads'][s] = s.cnt_total
        halfE, iX, k = sum(ordered_logs) * 0.5, 0, -1
        res['#Observed_Genes'][s] = len(ordered_logs)
        res['#Genes_Above_Mean'][s] = len([
            x for x in ordered_logs if x > np.mean(ordered_logs)
        ]) / float(len(ordered_logs))
        # Walk down the sorted logs until half the total (log) depth is
        # covered; k is then the number of genes needed.
        while iX < halfE:
            k += 1
            iX += ordered_logs[k]
        res['%Genes_Required_For_HalfDepth'][s] = k / float(
            len(ordered_logs))
        res['CoeffVar'][s] = coVar(ordered_logs)
        res['#topVals'][s] = 0
    # For each feature, credit the (up to) 5 samples holding its highest
    # counts — presumably f.cnts maps sample index -> count; verify caller.
    for f in self.input.features:
        for a, b in sorted([(b, a) for (a, b) in f.cnts.items()])[-5::]:
            res['#topVals'][self.input.samples[b]] += 1
    subplot.add_hist(res['#Reads'].values()).update({
        'xlab': 'reads per sample',
        'ylab': 'occurences',
        'title': 'Depth'
    })
    subplot.add_hist(res['#Observed_Genes'].values()).update({
        'xlab': 'genes per sample',
        'ylab': 'occurences',
        'title': 'Library Complexity'
    })
    subplot.add_hist(res['#Genes_Above_Mean'].values()).update({
        'xlab': '%',
        'ylab': 'occurences',
        'title': '% genes above mean'
    })
    subplot.add_hist(res['%Genes_Required_For_HalfDepth'].values()).update(
        {
            'xlab': '%Obs Genes',
            'ylab': 'occurences',
            'title': '% Genes Required For 50% Read Depth (Log Space)'
        })
    subplot.add_hist(res['CoeffVar'].values()).update({
        'xlab': 'CV',
        'ylab': 'occurences',
        'title': 'Coefficient of Variation Across Genes (Log Space)'
    })
    subplot.add_hist(res['#topVals'].values()).update({
        'xlab': 'TopVals',
        'ylab': 'occurences',
        'title': 'Number of maximal values (top5)'
    })
    plt.subplots_adjust(left=0.05,
                        bottom=0.05,
                        right=0.95,
                        top=0.90,
                        wspace=0.1,
                        hspace=0.40)
    subplot.save('sample_summary.png',
                 {'title': 'Sample Summary Histograms'})
    rage_outputs.column_stats(self.args).write(res, self.input.samples, {
        'suffix': 'sample_stats.out',
        'width': 20
    })
    self.progress.finish_subtopic()
def predict_known_ratio_values(self):
    """Predict each count from housekeeping-feature ratios; report accuracy.

    For every sample, each observed feature count is re-estimated from the
    sample's housekeeping features using the pairwise ratios in self.r_key
    (keyed on the ordered feature-index pair).  Two report files are then
    written — one row per feature and one row per sample — each comparing
    true totals against predictions (CV, percent difference, and Pearson
    correlations).  Returns self.
    """
    # f_pred[feature][sample] and s_pred[sample][feature] hold predictions.
    f_pred, s_pred = dd(lambda: dd(float)), dd(lambda: dd(float))
    f_totals = [sum(f.cnts.values()) for f in self.D.features]
    f_log_totals = [log(ft) for ft in f_totals]
    s_totals = [sum(s.cnts.values()) for s in self.D.samples]
    s_log_totals = [log(st) for st in s_totals]
    for si, s in enumerate(self.D.samples):
        s_contained = [i for i in s.cnts]
        s_housekeeping = [i for i in s.cnts if i in self.HOUSEKEEPING]
        for m in s_contained:
            m_infer = []
            for i in s_housekeeping:
                if i == m:
                    continue
                # r_key is keyed on the ordered index pair; missing pairs
                # simply contribute nothing to the estimate.
                try:
                    if i < m:
                        m_infer.append(s.cnts[i] / self.r_key[(i, m)])
                    if i > m:
                        m_infer.append(s.cnts[i] * self.r_key[(m, i)])
                except KeyError:
                    continue
            # Average of all ratio-based estimates (0 when none exist).
            if len(m_infer) == 0:
                infer_val = 0
            else:
                infer_val = np.mean(m_infer)
            f_pred[m][s.idx] = infer_val
            s_pred[s.idx][m] = infer_val
    # Both reports share the same column layout.
    header = '%-40s %15s %15s %6s %18s %10s %10s %10s %10s\n' % (
        '---', 'total_reads', 'total_obs', 'cv', 'predicted_total',
        'perc_diff', 'R-depth', 'R-pred', 'R-log-pred')
    row_fmt = '%-40s %15d %15d %6.2f %18.1f %10.2f %10.2f %10.2f %10.2f\n'
    # Per-feature report.  (Handles are now closed explicitly — they were
    # previously leaked.)
    wf = open(self.args.prefix + '_summarize_featureRatios.out', 'w')
    wf.write(header)
    for fi, f in enumerate(self.D.features):
        f_key = f.cnts.keys()
        f_true = [f.cnts[k] for k in f_key]
        f_log_true = [log(f.cnts[k]) for k in f_key]
        f_predicted = [f_pred[fi][k] for k in f_key]
        f_predicted_total = sum(f_predicted)
        f_cv = coVar(f_true)
        p_diff = perc_diff(f_predicted_total, f_totals[fi])
        fs_log_totals = [s_log_totals[k] for k in f_key]
        # Correlations: true counts vs. sample depth, vs. prediction, and
        # log-true vs. log-prediction.
        # NOTE(review): log(x) raises if any prediction is 0 — preexisting
        # behavior, preserved.
        fRT = stats.pearsonr(fs_log_totals, f_log_true)[0]
        fRP = stats.pearsonr(f_predicted, f_true)[0]
        fRLP = stats.pearsonr([log(x) for x in f_predicted], f_log_true)[0]
        wf.write(row_fmt % (f.name, f_totals[fi], len(f.cnts), f_cv,
                            f_predicted_total, p_diff, fRT, fRP, fRLP))
    wf.close()
    # Per-sample report (same columns, rows are samples).
    ws = open(self.args.prefix + '_summarize_sampleRatios.out', 'w')
    ws.write(header)
    for si, s in enumerate(self.D.samples):
        s_key = s.cnts.keys()
        s_true = [s.cnts[k] for k in s_key]
        s_log_true = [log(s.cnts[k]) for k in s_key]
        s_predicted = [s_pred[si][k] for k in s_key]
        s_predicted_total = sum(s_predicted)
        s_cv = coVar(s_true)
        p_diff = perc_diff(s_predicted_total, s_totals[si])
        sf_log_totals = [f_log_totals[k] for k in s_key]
        sRT = stats.pearsonr(sf_log_totals, s_log_true)[0]
        sRP = stats.pearsonr(s_predicted, s_true)[0]
        sRLP = stats.pearsonr([log(x) for x in s_predicted], s_log_true)[0]
        ws.write(row_fmt % (s.name, s_totals[si], len(s.cnts), s_cv,
                            s_predicted_total, p_diff, sRT, sRP, sRLP))
    ws.close()
    self.progress.end()
    return self
def summary_hists(X, Y, options, progress, X_NAME='SAMPLES'):
    """Compute per-item summary stats for X and save a 3x2 histogram grid.

    X : collection with a .label ('samples' or 'features'); items expose
        .cnts (dict) and .cnt_total
    Y : the complementary collection (items expose .len)

    Saves '<prefix>_sample_summary.png' or '<prefix>_feature_summary.png'
    depending on X.label, and returns the stat dictionary
    (stat-name -> {item: value}).
    """
    seaborn.set(rc={
        'axes.facecolor': 'lightcyan',
        'figure.facecolor': 'whitesmoke'
    })
    res = dd(lambda: {})
    subplot = rage_subplots.subplot(3, 2, options)
    # Total comparison capacity: sum of lengths across the other axis.
    cMax = float(sum([y.len for y in Y]))
    for x in X:
        progress.mark()
        # Zero-padding for the Y-items this x has no counts for.
        xMissed = [0 for i in range(len(Y) - len(x.cnts))]
        # NOTE(review): iterating a dict yields its KEYS, so the percentile
        # stats below run over key indices, not counts — x.cnts.values()
        # looks intended; preserved as-is pending confirmation.
        x_raw = [c for c in x.cnts] + xMissed
        res['#CompIndex'][x] = sum([Y[y].len for y in x.cnts.keys()]) / cMax
        ordered_logs = sorted([log(1.0 + c) for c in x.cnts.values()],
                              reverse=True)
        # Fraction of (log) counts above their mean; items with no counts
        # are skipped entirely.  (A duplicated try/except was removed here:
        # its except clause assigned res['#Obs_gtAvg'] = 0, which would have
        # clobbered the whole per-item dict, and it was unreachable after
        # this block's `continue`.)
        try:
            res['#Obs_gtAvg'][x] = len([
                l for l in ordered_logs if l > np.mean(ordered_logs)
            ]) / float(len(ordered_logs))
        except ZeroDivisionError:
            continue
        halfE, iX, k = sum(ordered_logs) * 0.5, 0, -1
        res['total'][x] = x.cnt_total
        res['observations'][x] = len(x.cnts)
        res['Qrt-75'][x] = np.percentile(x_raw, 75)
        res['Perc-90'][x] = round(np.percentile(x_raw, 90), 4)
        res['Perc-95'][x] = np.percentile(x_raw, 95)
        res['Perc-99'][x] = np.percentile(x_raw, 99)
        # Walk down the sorted logs until half the (log) depth is covered.
        while iX < halfE:
            k += 1
            iX += ordered_logs[k]
        res['%Obs_HDepth'][x] = k / float(len(x.cnts) + len(xMissed))
        res['CoeffVar'][x] = coVar(ordered_logs)
    subplot.add_hist(res['total']).update({
        'xlab': 'log(reads)',
        'ylab': 'occurences',
        'title': 'Total Depth'
    })
    if X.label == 'samples':
        subplot.add_hist(res['observations']).update({
            'xlab': 'observations',
            'ylab': 'occurences',
            'title': 'Library Diversity (Genes)'
        })
    else:
        subplot.add_hist(res['observations']).update({
            'xlab': 'observations',
            'ylab': 'occurences',
            'title': 'Library Diversity (Samples)'
        })
    subplot.add_hist(res['Qrt-75']).update({
        'xlab': 'cnts',
        'ylab': 'occurences',
        'title': 'Upper Quartile'
    })
    subplot.add_hist(res['CoeffVar']).update({
        'xlab': 'CV',
        'ylab': 'occurences',
        'title': 'Coefficient of Variation (Log Space)'
    })
    subplot.add_hist(res['%Obs_HDepth']).update({
        'xlab': '%Obs',
        'ylab': 'occurences',
        'title': '% Obs For 50% Read Depth (Log Space)'
    })
    subplot.add_hist(res['#CompIndex']).update({
        'xlab': '%Comparisons',
        'ylab': 'occurences',
        'title': 'Comparison Index'
    })
    plt.subplots_adjust(left=0.05,
                        bottom=0.05,
                        right=0.95,
                        top=0.90,
                        wspace=0.1,
                        hspace=0.40)
    if X.label == 'samples':
        subplot.save(options.prefix + '_sample_summary.png',
                     {'title': 'Sample Summary Histograms'})
    elif X.label == 'features':
        subplot.save(options.prefix + '_feature_summary.png',
                     {'title': 'Feature Summary Histograms'})
    return res
def summary_trends(X, Y, options, progress, X_NAME='SAMPLES'):
    """Plot pairwise trend scatterplots for X (samples or features).

    Collects per-item stats (95th percentile, max, mean, observation rate,
    totals, variance, CV — mostly in log2 space), fits Pearson trends
    between selected pairs, and saves a 3x3 grid as
    '<prefix>_sample_trends.png' or '<prefix>_feature_trends.png'.
    """
    seaborn.set(rc={
        'axes.facecolor': 'lightcyan',
        'figure.facecolor': 'whitesmoke'
    })
    res, subplot = dd(lambda: {}), rage_subplots.subplot(
        3, 3, options, {'titlepos': [0.0, 1.05]})
    qts, maxV, obsR, means, totals, log_totals, cnt_means = [], [], [], [], [], [], []
    trends = {}
    stds = []  # NOTE(review): collected but never used below
    cvs = []
    vsx = []
    for x in X:
        zC = len(Y) - len(x.cnts)  # NOTE(review): unused
        # Counts padded with zeros for unobserved entries (Python 2
        # dict.values() + list concatenation).
        x_all = x.cnts.values() + [0 for s in range(len(Y) - len(x.cnts))]
        # Mean over observed entries only (no zero padding).
        x_mean = np.mean(x.cnts.values())
        #x_log = [log(p+1.0) for p in x.cnts.values()] + [0 for s in range(len(Y)-len(x.cnts))]
        #qts.append(log(1.0+np.percentile(x_all,95)))
        #maxV.append(log(1.0+max(x_all)))
        #means.append(log(1.0+np.mean(x_all)))
        qts.append(np.percentile(x_all, 95))
        maxV.append(log(max(x_all), 2))
        means.append(log(np.mean(x_all), 2))
        obsR.append(len(x.cnts) / float(len(Y)))
        totals.append(sum(x_all))
        log_totals.append(log(sum(x_all) + 1.0, 2))
        cnt_means.append(log(x_mean, 2))
        stds.append(np.std(x_all))
        vsx.append(log(np.var(x_all), 2))
        cvs.append(coVar(x_all))
    # Pearson correlations backing the trend lines below.
    trends[('observations', 'log_total')] = stats.pearsonr(log_totals, obsR)
    trends[('observations', 'cnt_mean')] = stats.pearsonr(cnt_means, obsR)
    trends[('observations', 'max')] = stats.pearsonr(maxV, obsR)
    subplot.add_scatter_trend(
        obsR, log_totals,
        R=trends[('observations', 'log_total')][0]).update({
            'title': 'Observations vs Total',
            'xlab': 'observation rate',
            'ylab': 'total (logs)'
        })
    subplot.add_scatter_trend(
        obsR, cnt_means,
        R=trends[('observations', 'cnt_mean')][0]).update({
            'title': 'Observations vs Cnt Mean',
            'xlab': 'observation rate',
            'ylab': 'Cnt Mean (logs)'
        })
    subplot.add_scatter_trend(
        obsR, maxV, R=trends[('observations', 'max')][0]).update({
            'title': 'Observations vs Max',
            'xlab': 'observation rate',
            'ylab': 'Max (logs)'
        })
    trends[('mean', 'var')] = stats.pearsonr(means, vsx)
    subplot.add_scatter_trend(means, vsx,
                              R=trends[('mean', 'var')][0]).update({
                                  'title': 'Mean vs Variance',
                                  'xlab': 'Mean (log)',
                                  'ylab': 'Variance'
                              })
    trends[('mean', 'cv')] = stats.pearsonr(means, cvs)
    subplot.add_scatter_trend(means, cvs,
                              R=trends[('mean', 'cv')][0]).update({
                                  'title': 'Mean vs CV',
                                  'xlab': 'Mean (log)',
                                  'ylab': 'CV'
                              })
    # NOTE(review): computed but not plotted, and `trends` is not returned.
    trends[('max', 'log_total')] = stats.pearsonr(log_totals, maxV)
    subplot.add_scatter_trend(qts, totals).update({
        'title': 'Upper Quartile vs Total',
        'xlab': 'Upper Quartile',
        'ylab': 'total (logs)'
    })
    # subplot.add_scatter_trend(qts,maxV).update({'title': 'Upper Quartile vs Max','xlab': 'Upper Quartile','ylab': 'Max (logs)'})
    # subplot.add_scatter_trend(means,qts).update({'title': 'Mean vs Upper Quartile','xlab': 'Mean (logs)','ylab': 'Upper Quartiles (logs)'})
    subplot.add_scatter_trend(means, maxV).update({
        'title': 'Mean vs Max',
        'xlab': 'Mean',
        'ylab': 'Max (logs)'
    })
    plt.subplots_adjust(left=0.05,
                        bottom=0.04,
                        right=0.95,
                        top=0.90,
                        wspace=0.25,
                        hspace=0.5)
    if X.label == 'samples':
        subplot.save(options.prefix + '_sample_trends.png',
                     {'title': 'Sample Trends'})
    elif X.label == 'features':
        subplot.save(options.prefix + '_feature_trends.png',
                     {'title': 'Feature Trends'})
    return
def write(self, D, M, Mp=None, suffix='dex'):
    """Write a differential-expression-style report for model M (with the
    paired model Mp) over the features in D.

    One output line per feature: fit stats, then for each categorical
    parent predictor the dominant group, a chi-square enrichment test of
    the top-count slice, and per-child parameter p-values from both models.
    """
    w = open(
        self.options.prefix + '_' + suffix + '_' +
        "_".join(self.options.predictors) + '_covar' +
        "-".join(self.options.covariates) + '.out', 'w')
    # Sanity check: one regression result per feature.
    if len(M.reg) != len(D.features):
        print 'bad'
        sys.exit()
    Mpreds = [M.X.names[i] for i in M.X.predictor_idx]
    Ppreds = [Mp.X.names[i] for i in Mp.X.predictor_idx]
    # Categorical predictor groups ('parents'), excluding the intercept.
    parents = [
        p for p in M.X.parents if M.X.PREDICTOR[p] and p != 'intercept'
    ]
    children = [M.X.names[i] for i in M.X.predictor_idx]
    self.seg, self.fracs, self.segMM, self.segLens, self.mKey, self.pKey = {}, {}, {}, {}, {}, {}
    for p in parents:
        # Map each child level name to its design-matrix column index in
        # M and Mp respectively.
        self.mKey[p] = {
            M.X.names[i]: i
            for i in M.X.predictor_idx if M.X.names[i] in M.X.children[p]
        }
        self.pKey[p] = {
            Mp.X.names[i]: i
            for i in Mp.X.predictor_idx if Mp.X.names[i] in Mp.X.children[p]
        }
        # Sample indices and group sizes for each level of parent p.
        self.seg[p], self.segLens[p] = D.samples.segregate(p)
        self.fracs[p] = {
            g: float(v) / sum(self.segLens[p].values())
            for g, v in self.segLens[p].items()
        }
        # self.segMM[p] = min(seg_lens.values()),max(seg_lens.values())
    w.write(
        '--- RS CV obs len parent maxG maxMeans maxObs maxChi maxP | params\n'
    )
    for zp, zm, f in zip(Mp.reg, M.reg, D.features):
        fObs = len(f.cnts)
        cnts = [f.cnts[i] for i in range(len(D.samples))]
        LS = [f.name, round(zm.model.rsquared, 3), round(coVar(cnts), 3)]
        for p in parents:
            # Group with the highest mean count for this feature.
            maxC, maxP = sorted([(np.mean([f.cnts[i] for i in grp]), k)
                                 for k, grp in self.seg[p].items()])[-1]
            # Fraction of that group's samples with a nonzero count.
            maxObs = len([
                x for x in [f.cnts[i] for i in self.seg[p][maxP]] if x > 0
            ]) / float(self.segLens[p][maxP])
            # All (count, group) pairs across every group, largest first.
            p_srt = sorted([
                a for b in [[(f.cnts[i], k) for i in grp]
                            for k, grp in self.seg[p].items()] for a in b
            ],
                           reverse=True)
            # Require at least 5 nonzero observations overall.
            if len([x for x in p_srt if x[0] > 0]) < 5:
                continue
            else:
                # Top slice: the largest |maxP-group| counts, trimmed to
                # nonzero entries when zeros reach into the slice.
                if p_srt[self.segLens[p][maxP] - 1][0] == 0:
                    p_split = [ps for ps in p_srt if ps[0] > 0]
                else:
                    p_split = p_srt[0:self.segLens[p][maxP]]
                # Observed membership of the top slice vs. the expectation
                # implied by the group-size fractions.
                maxL, maxN, maxF, maxFn = len([
                    ps[1] for ps in p_split if ps[1] == maxP
                ]), len([ps[1] for ps in p_split if ps[1] != maxP
                         ]), self.fracs[p][maxP], 1 - self.fracs[p][maxP]
                chiVal = chisquare(
                    [maxL, maxN],
                    f_exp=[len(p_split) * maxF,
                           len(p_split) * maxFn])[1]
                # Prefer the exact p-value for the dominant level; fall back
                # to the mean over all of p's levels when maxP has no model
                # column (KeyError) — note the fallback also narrows the
                # rounding/format widths.
                try:
                    line_data = LS + [
                        fObs,
                        len(cnts), p, maxP,
                        round(maxC, 3),
                        round(maxObs, 3),
                        '%2.2e' % chiVal,
                        '%2.2e' % zm.model.pvalues[self.mKey[p][maxP]]
                    ]
                except KeyError:
                    line_data = LS + [
                        fObs,
                        len(cnts), p, maxP,
                        round(maxC, 2),
                        round(maxObs, 3),
                        '%1.1e' % chiVal,
                        '%1.1e' % np.mean([
                            zm.model.pvalues[zz]
                            for zz in self.mKey[p].values()
                        ])
                    ]
                # Per-child columns: sign of the effect plus p-values from
                # both models.
                for x in self.pKey[p].keys():
                    line_data.extend([
                        '|', x, (zm.model.params[self.mKey[p][x]] > 0),
                        '%2.2e' % zm.model.pvalues[self.mKey[p][x]],
                        '%2.2e' % zp.model.pvalues[self.pKey[p][x]]
                    ])
                w.write(" ".join([str(xx) for xx in line_data]) + '\n')