def fit_dispersion(counts, disp_raw, disp_conv, sf, CFG, dmatrix1):
    """Fit a parametric mean-dispersion trend to the raw dispersions.

    A Gamma GLM with identity link is fitted for the model
    ``disp = Lambda[0] / mean_count + Lambda[1]``.  Only rows whose raw
    dispersion lies strictly between the 1st and 99th percentile of the
    unique raw dispersions selected by ``disp_conv`` take part in the fit.

    Parameters
    ----------
    counts : array
        ``counts / sf`` is averaged along axis 1 to get one mean per row.
    disp_raw : array
        Raw dispersions; indexed by row in step with ``counts``.
    disp_conv : array
        Rows with a non-zero entry are used to derive the percentile bounds.
    sf : array or scalar
        Divisor applied to ``counts`` (presumably per-sample size factors --
        confirm against the caller).
    CFG : dict
        Keys read here: ``'diagnose_plots'`` and ``'plot_dir'``.
    dmatrix1 : array
        Only forwarded to the diagnostic plot.

    Returns
    -------
    tuple
        ``(disp_fitted, Lambda, idx)``: the fitted dispersions (a copy of
        ``disp_raw`` whose non-NaN rows are replaced by the trend), the two
        GLM coefficients, and the row indices that were used for the fit.
    """
    # one mean per row, kept as a column so it lines up with disp_raw rows
    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]

    # percentile bounds are taken over the distinct dispersions of the rows
    # flagged in disp_conv
    conv_rows = sp.where(disp_conv)[0]
    distinct_disp = sp.unique(disp_raw[conv_rows])
    lowerBound = sp.percentile(distinct_disp, 1)
    upperBound = sp.percentile(distinct_disp, 99)

    in_range = (disp_raw > lowerBound) & (disp_raw < upperBound)
    idx = sp.where(in_range)[0]

    # design matrix: column 0 = 1 / mean_count, column 1 = intercept
    design = sp.ones((idx.shape[0], 2), dtype='float')
    design[:, 0] /= mean_count[idx].ravel()

    gamma_family = sm.families.Gamma(sm.families.links.identity)
    Lambda = sm.GLM(disp_raw[idx], design, family=gamma_family).fit().params

    # evaluate the trend for every row that has a (non-NaN) raw dispersion
    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if (disp_fitted > 0).any():
        print("Found dispersion fit")

    if CFG['diagnose_plots']:
        plot_file = os.path.join(CFG['plot_dir'], 'dispersion_fitted.pdf')
        plot.mean_variance_plot(counts=counts,
                                disp=disp_fitted,
                                matrix=dmatrix1,
                                figtitle='Fitted Dispersion Estimate',
                                filename=plot_file,
                                CFG=CFG)

    return (disp_fitted, Lambda, idx)
def adjust_dispersion(counts, dmatrix1, disp_raw, disp_fitted, idx, sf, CFG):
    """Estimate adjusted dispersions for every row of ``counts``.

    The work is delegated to ``adjust_dispersion_chunk``; with
    ``CFG['parallel'] > 1`` the rows are split into chunks of 30 and
    processed in a multiprocessing pool, otherwise a single call handles
    all rows.

    NOTE(review): this function is defined twice in this file; Python keeps
    only the later definition, so one of the copies should be removed.

    Parameters
    ----------
    counts : array
        Indexed by row (``counts[rows, :]``); one dispersion per row.
    dmatrix1 : array
        Design matrix; ``shape[0]`` is used as the number of samples and
        ``shape[1]`` as the number of coefficients.
    disp_raw, disp_fitted : array
        Raw and fitted dispersions, indexed by row in step with ``counts``.
    idx : array
        Forwarded to ``calculate_varPrior``.
    sf : array or scalar
        Forwarded unchanged to ``adjust_dispersion_chunk``.
    CFG : dict
        Keys read here: ``'verbose'``, ``'parallel'``, ``'diagnose_plots'``
        and ``'plot_dir'``.

    Returns
    -------
    tuple
        ``(disp_adj, disp_adj_conv)``.  In the parallel path these are an
        ``(n_rows, 1)`` float array (NaN where no result was written) and a
        boolean array of the same shape.

    On ``KeyboardInterrupt`` in the parallel path a message is written to
    stderr and the process exits with status 1.
    """
    if CFG['verbose']:
        print('Start to estimate adjusted dispersions.')

    # number of samples - number of coefficients.  Divide by 2.0 (not 2) so
    # that an odd difference is not floored by Python 2 integer division.
    resid_dof = dmatrix1.shape[0] - dmatrix1.shape[1]
    varLogDispSamp = polygamma(1, resid_dof / 2.0)
    varPrior = calculate_varPrior(disp_raw, disp_fitted, idx, varLogDispSamp)

    n_rows = counts.shape[0]

    if CFG['parallel'] > 1:
        disp_adj = sp.empty((n_rows, 1))
        disp_adj.fill(sp.nan)
        disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')

        # workers ignore SIGINT so that only the parent handles Ctrl-C
        pool = mp.Pool(processes=CFG['parallel'],
                       initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(start, min(start + binsize, n_rows))
                      for start in range(0, n_rows, binsize)]

        try:
            jobs = [pool.apply_async(adjust_dispersion_chunk,
                                     args=(counts[cidx, :], dmatrix1,
                                           disp_raw[cidx], disp_fitted[cidx],
                                           varPrior, sf, CFG, cidx,))
                    for cidx in idx_chunks]
            res_cnt = 0
            # collect in submission order; each result is
            # (dispersions, convergence flags, row indices)
            for job in jobs:
                tmp = job.get()
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_adj[j] = tmp[0][i]
                    disp_adj_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # always shut the pool down, also when a worker raised
            pool.terminate()
            pool.join()
    else:
        (disp_adj, disp_adj_conv, _) = adjust_dispersion_chunk(
            counts, dmatrix1, disp_raw, disp_fitted, varPrior, sf, CFG,
            sp.arange(n_rows), log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_adj,
                                matrix=dmatrix1,
                                figtitle='Adjusted Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'], 'dispersion_adjusted.pdf'),
                                CFG=CFG)

    return (disp_adj, disp_adj_conv)
def estimate_dispersion(gene_counts, matrix, sf, CFG):
    """Estimate raw dispersions for every row of ``gene_counts``.

    The work is delegated to ``estimate_dispersion_chunk``; with
    ``CFG['parallel'] > 1`` the rows are split into chunks of 30 and
    processed in a multiprocessing pool, otherwise a single call handles
    all rows.

    NOTE(review): this function is defined twice in this file; Python keeps
    only the later definition, so one of the copies should be removed.

    Parameters
    ----------
    gene_counts : array
        Indexed by row (``gene_counts[rows, :]``); one dispersion per row.
    matrix : array
        Forwarded to ``estimate_dispersion_chunk`` and to the diagnostic
        plot (presumably the design matrix -- confirm against the caller).
    sf : array or scalar
        Forwarded unchanged to ``estimate_dispersion_chunk``.
    CFG : dict
        Keys read here: ``'verbose'``, ``'parallel'``, ``'diagnose_plots'``
        and ``'plot_dir'``.

    Returns
    -------
    tuple
        ``(disp_raw, disp_raw_conv)``.  In the parallel path these are an
        ``(n_rows, 1)`` float array (NaN where no result was written) and an
        ``(n_rows, 1)`` boolean array.

    On ``KeyboardInterrupt`` in the parallel path a message is written to
    stderr and the process exits with status 1.
    """
    if CFG['verbose']:
        print('Estimating raw dispersions')

    n_rows = gene_counts.shape[0]

    if CFG['parallel'] > 1:
        disp_raw = sp.empty((n_rows, 1), dtype='float')
        disp_raw.fill(sp.nan)
        disp_raw_conv = sp.zeros((n_rows, 1), dtype='bool')

        # workers ignore SIGINT so that only the parent handles Ctrl-C
        pool = mp.Pool(processes=CFG['parallel'],
                       initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(start, min(start + binsize, n_rows))
                      for start in range(0, n_rows, binsize)]

        try:
            jobs = [pool.apply_async(estimate_dispersion_chunk,
                                     args=(gene_counts[cidx, :], matrix, sf,
                                           CFG, cidx,))
                    for cidx in idx_chunks]
            res_cnt = 0
            # collect in submission order; each result is
            # (dispersions, convergence flags, row indices)
            for job in jobs:
                tmp = job.get()
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_raw[j] = tmp[0][i]
                    disp_raw_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # always shut the pool down, also when a worker raised
            pool.terminate()
            pool.join()
    else:
        (disp_raw, disp_raw_conv, _) = estimate_dispersion_chunk(
            gene_counts, matrix, sf, CFG, sp.arange(n_rows),
            log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=gene_counts,
                                disp=disp_raw,
                                matrix=matrix,
                                figtitle='Raw Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'], 'dispersion_raw.pdf'),
                                CFG=CFG)

    return (disp_raw, disp_raw_conv)
def adjust_dispersion(counts, dmatrix1, disp_raw, disp_fitted, idx, sf, CFG):
    """Compute adjusted dispersion estimates, one per row of ``counts``.

    A prior variance is derived via ``calculate_varPrior`` and handed,
    together with the raw and fitted dispersions, to
    ``adjust_dispersion_chunk``.  When ``CFG['parallel'] > 1`` the rows are
    processed in chunks of 30 by a multiprocessing pool; otherwise one call
    covers all rows.

    NOTE(review): this function is defined twice in this file; Python keeps
    only the later definition, so one of the copies should be removed.

    Parameters
    ----------
    counts : array
        Indexed by row (``counts[rows, :]``).
    dmatrix1 : array
        Design matrix; ``shape[0]`` is taken as the number of samples and
        ``shape[1]`` as the number of coefficients.
    disp_raw, disp_fitted : array
        Raw and fitted dispersions, sliced by the same row indices as
        ``counts``.
    idx : array
        Passed through to ``calculate_varPrior``.
    sf : array or scalar
        Passed through to ``adjust_dispersion_chunk``.
    CFG : dict
        Keys read here: ``'verbose'``, ``'parallel'``, ``'diagnose_plots'``
        and ``'plot_dir'``.

    Returns
    -------
    tuple
        ``(disp_adj, disp_adj_conv)``.  In the parallel path: an
        ``(n_rows, 1)`` float array pre-filled with NaN and a boolean array
        of the same shape.

    A ``KeyboardInterrupt`` in the parallel path is reported on stderr and
    ends the process with exit status 1.
    """
    verbose = CFG['verbose']
    if verbose:
        print('Start to estimate adjusted dispersions.')

    # number of samples - number of coefficients; the float divisor keeps
    # Python 2 from flooring an odd difference (e.g. 1 / 2 == 0).
    n_samples, n_coeffs = dmatrix1.shape[0], dmatrix1.shape[1]
    varLogDispSamp = polygamma(1, (n_samples - n_coeffs) / 2.0)
    varPrior = calculate_varPrior(disp_raw, disp_fitted, idx, varLogDispSamp)

    total = counts.shape[0]

    if CFG['parallel'] > 1:
        disp_adj = sp.empty((total, 1))
        disp_adj.fill(sp.nan)
        disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')

        # SIGINT is ignored in the workers; the parent alone reacts to it
        pool = mp.Pool(processes=CFG['parallel'],
                       initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [
            sp.arange(x, min(x + binsize, total))
            for x in range(0, total, binsize)
        ]

        try:
            result = [
                pool.apply_async(adjust_dispersion_chunk,
                                 args=(
                                     counts[cidx, :],
                                     dmatrix1,
                                     disp_raw[cidx],
                                     disp_fitted[cidx],
                                     varPrior,
                                     sf,
                                     CFG,
                                     cidx,
                                 )) for cidx in idx_chunks
            ]
            res_cnt = 0
            # results are consumed in submission order
            for pending in result:
                tmp = pending.get()
                for i, j in enumerate(tmp[2]):
                    if verbose:
                        log_progress(res_cnt, total)
                        res_cnt += 1
                    disp_adj[j] = tmp[0][i]
                    disp_adj_conv[j] = tmp[1][i]
            if verbose:
                log_progress(total, total)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # runs on success, on Ctrl-C and when a worker error is
            # re-raised by get(), so the pool is never left behind
            pool.terminate()
            pool.join()
    else:
        (disp_adj, disp_adj_conv, _) = adjust_dispersion_chunk(
            counts, dmatrix1, disp_raw, disp_fitted, varPrior, sf, CFG,
            sp.arange(total), log=verbose)

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_adj,
                                matrix=dmatrix1,
                                figtitle='Adjusted Dispersion Estimate',
                                filename=os.path.join(
                                    CFG['plot_dir'],
                                    'dispersion_adjusted.pdf'),
                                CFG=CFG)

    return (disp_adj, disp_adj_conv)
def estimate_dispersion(gene_counts, matrix, sf, CFG):
    """Compute raw dispersion estimates, one per row of ``gene_counts``.

    The estimation itself happens in ``estimate_dispersion_chunk``.  When
    ``CFG['parallel'] > 1`` the rows are processed in chunks of 30 by a
    multiprocessing pool; otherwise one call covers all rows.

    NOTE(review): this function is defined twice in this file; Python keeps
    only the later definition, so one of the copies should be removed.

    Parameters
    ----------
    gene_counts : array
        Indexed by row (``gene_counts[rows, :]``).
    matrix : array
        Passed through to ``estimate_dispersion_chunk`` and to the
        diagnostic plot (presumably the design matrix -- confirm against
        the caller).
    sf : array or scalar
        Passed through to ``estimate_dispersion_chunk``.
    CFG : dict
        Keys read here: ``'verbose'``, ``'parallel'``, ``'diagnose_plots'``
        and ``'plot_dir'``.

    Returns
    -------
    tuple
        ``(disp_raw, disp_raw_conv)``.  In the parallel path: an
        ``(n_rows, 1)`` float array pre-filled with NaN and an
        ``(n_rows, 1)`` boolean array.

    A ``KeyboardInterrupt`` in the parallel path is reported on stderr and
    ends the process with exit status 1.
    """
    verbose = CFG['verbose']
    if verbose:
        print('Estimating raw dispersions')

    total = gene_counts.shape[0]

    if CFG['parallel'] > 1:
        disp_raw = sp.empty((total, 1), dtype='float')
        disp_raw.fill(sp.nan)
        disp_raw_conv = sp.zeros((total, 1), dtype='bool')

        # SIGINT is ignored in the workers; the parent alone reacts to it
        pool = mp.Pool(processes=CFG['parallel'],
                       initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [
            sp.arange(x, min(x + binsize, total))
            for x in range(0, total, binsize)
        ]

        try:
            result = [
                pool.apply_async(estimate_dispersion_chunk,
                                 args=(
                                     gene_counts[idx, :],
                                     matrix,
                                     sf,
                                     CFG,
                                     idx,
                                 )) for idx in idx_chunks
            ]
            res_cnt = 0
            # results are consumed in submission order
            for pending in result:
                tmp = pending.get()
                for i, j in enumerate(tmp[2]):
                    if verbose:
                        log_progress(res_cnt, total)
                        res_cnt += 1
                    disp_raw[j] = tmp[0][i]
                    disp_raw_conv[j] = tmp[1][i]
            if verbose:
                log_progress(total, total)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # runs on success, on Ctrl-C and when a worker error is
            # re-raised by get(), so the pool is never left behind
            pool.terminate()
            pool.join()
    else:
        (disp_raw, disp_raw_conv, _) = estimate_dispersion_chunk(
            gene_counts, matrix, sf, CFG, sp.arange(total), log=verbose)

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=gene_counts,
                                disp=disp_raw,
                                matrix=matrix,
                                figtitle='Raw Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'],
                                                      'dispersion_raw.pdf'),
                                CFG=CFG)

    return (disp_raw, disp_raw_conv)