def test_count_chunk(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG, idx, log=False):
    """Run the per-event likelihood-ratio test for one chunk of count rows.

    For every row i of `gene_counts` with a valid adjusted dispersion, fits a
    null (dmatrix0) and an alternative (dmatrix1) negative-binomial GLM with
    `sp.log(sf)` as offset (sf presumably holds per-sample size factors --
    TODO confirm against caller) and converts the deviance difference into a
    p-value via a chi2 test with df = difference in coefficient counts.

    Returns (pval, idx) so the caller can scatter chunk-local rows back to
    global positions; pval entries stay NaN where no test was performed.
    """
    pval = sp.zeros((gene_counts.shape[0], 1), dtype='float')
    pval.fill(sp.nan)
    for i in xrange(idx.shape[0]):
        if log:
            log_progress(i, idx.shape[0])
        ### no adjusted dispersion -> model cannot be fitted; leave NaN
        if sp.isnan(disp_adj[i]):
            continue
        response = gene_counts[i, :].astype('int')
        ### too many zeros among the first half of the samples (Python 2
        ### integer division on the slice bound) -> report p-value 1
        if sp.sum(response[:response.shape[0] / 2] == 0) >= CFG['max_0_frac'] * response.shape[0] / 2:
            pval[i] = 1
            continue
        ### null and alternative NB-GLMs share the same adjusted dispersion
        modNB0 = sm.GLM(response, dmatrix0, family=sm.families.NegativeBinomial(alpha=disp_adj[i]), offset=sp.log(sf))
        modNB1 = sm.GLM(response, dmatrix1, family=sm.families.NegativeBinomial(alpha=disp_adj[i]), offset=sp.log(sf))
        result0 = modNB0.fit()
        result1 = modNB1.fit()
        ### likelihood-ratio test: deviance difference ~ chi2(df)
        pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, dmatrix1.shape[1] - dmatrix0.shape[1])
    if log:
        log_progress(idx.shape[0], idx.shape[0])
        print ''
    return (pval, idx)
def adjust_dispersion_chunk(counts, dmatrix1, disp_raw, disp_fitted, varPrior, sf, CFG, idx, log=False):
    """Shrink raw per-row dispersions towards the fitted trend (one chunk).

    For every row with a valid raw dispersion, alternates between fitting an
    NB-GLM at the current dispersion and minimising the shrunken adjusted
    log-likelihood (`adj_loglikelihood_shrink_scalar_onedisper`) over the
    dispersion on [0, 10], until the log-dispersion moves by less than 1e-4
    or 10 iterations have passed. `varPrior` is the prior variance used by
    the shrinkage objective; `sp.log(sf)` is the GLM offset.

    Returns (disp_adj, disp_adj_conv, idx); skipped rows keep NaN.
    """
    disp_adj = sp.empty((counts.shape[0], 1))
    disp_adj.fill(sp.nan)
    disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')
    error_cnt = 0
    for i in range(idx.shape[0]):
        if log:
            log_progress(i, idx.shape[0])
        if not sp.isnan(disp_raw[i]):
            ### init dispersion and response
            disp = 0.1
            resp = counts[i, :].astype('int')
            ### run for max 10 iterations
            for j in range(10):
                modNB = sm.GLM(resp, dmatrix1, family=sm.families.NegativeBinomial(alpha=disp), offset=sp.log(sf))
                result = modNB.fit()
                dispBef = disp
                yhat = result.mu
                sign = -1.0
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    try:
                        res = minimize_scalar(adj_loglikelihood_shrink_scalar_onedisper, args=(dmatrix1, resp, yhat, disp_fitted[i], varPrior, sign), method='Bounded', bounds=(0, 10.0), tol=1e-5)
                    except TypeError:
                        ### optimizer failed on this row: keep the last
                        ### dispersion, mark as unconverged, count the error
                        disp_adj[i] = disp
                        disp_adj_conv[i] = False
                        error_cnt += 1
                        break
                disp = res.x
                ### converged once the log-dispersion barely moved
                if abs(sp.log(disp) - sp.log(dispBef)) < 1e-4:
                    disp_adj[i] = disp
                    disp_adj_conv[i] = True
                    break
                else:
                    disp_adj[i] = disp
                    disp_adj_conv[i] = False
    if log:
        log_progress(idx.shape[0], idx.shape[0])
        print ''
    if error_cnt > 0:
        print 'Warning: %i events did not fit due to a TypeError' % error_cnt
    return (disp_adj, disp_adj_conv, idx)
def adjust_dispersion(counts, dmatrix1, disp_raw, disp_fitted, idx, sf, CFG):
    """Driver for dispersion shrinkage over all count rows.

    Derives the sampling variance of the log dispersion from the residual
    degrees of freedom, computes the prior variance, then dispatches
    `adjust_dispersion_chunk` either over a multiprocessing pool (chunks of
    30 rows) or serially. Optionally writes a mean/variance diagnostic plot.

    Returns (disp_adj, disp_adj_conv).
    """
    if CFG['verbose']:
        print 'Start to estimate adjusted dispersions.'
    varLogDispSamp = polygamma(1, (dmatrix1.shape[0] - dmatrix1.shape[1]) / 2)  ## number of samples - number of coefficients
    varPrior = calculate_varPrior(disp_raw, disp_fitted, idx, varLogDispSamp)
    if CFG['parallel'] > 1:
        disp_adj = sp.empty((counts.shape[0], 1))
        disp_adj.fill(sp.nan)
        disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')
        ### workers ignore SIGINT; KeyboardInterrupt is handled by the parent
        pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(x, min(x + binsize, counts.shape[0])) for x in range(0, counts.shape[0], binsize)]
        try:
            result = [pool.apply_async(adjust_dispersion_chunk, args=(counts[cidx, :], dmatrix1, disp_raw[cidx], disp_fitted[cidx], varPrior, sf, CFG, cidx,)) for cidx in idx_chunks]
            res_cnt = 0
            while result:
                tmp = result.pop(0).get()
                ### scatter chunk-local results back to their global rows
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, counts.shape[0])
                        res_cnt += 1
                    disp_adj[j] = tmp[0][i]
                    disp_adj_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(counts.shape[0], counts.shape[0])
                print ''
            pool.terminate()
            pool.join()
        except KeyboardInterrupt:
            print >> sys.stderr, 'Keyboard Interrupt - exiting'
            pool.terminate()
            pool.join()
            sys.exit(1)
    else:
        (disp_adj, disp_adj_conv, _) = adjust_dispersion_chunk(counts, dmatrix1, disp_raw, disp_fitted, varPrior, sf, CFG, sp.arange(counts.shape[0]), log=CFG['verbose'])
    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=counts, disp=disp_adj, matrix=dmatrix1, figtitle='Adjusted Dispersion Estimate', filename=os.path.join(CFG['plot_dir'], 'dispersion_adjusted.pdf'), CFG=CFG)
    return (disp_adj, disp_adj_conv)
def estimate_dispersion_chunk(gene_counts, matrix, sf, CFG, idx, log=False):
    """Estimate a raw negative-binomial dispersion for each count row.

    Alternates between fitting an NB-GLM at the current dispersion and
    re-optimising the dispersion against the fitted means, for at most 10
    rounds or until the dispersion changes by less than 1e-4 on the log
    scale. Rows with insufficient counts keep a NaN dispersion.

    Returns (disp_raw, disp_raw_conv, idx).
    """
    n_rows = idx.shape[0]
    disp_raw = sp.empty((n_rows, 1), dtype='float')
    disp_raw.fill(sp.nan)
    disp_raw_conv = sp.zeros((n_rows, 1), dtype='bool')

    for i in range(n_rows):
        if log:
            log_progress(i, n_rows)
        disp = 0.1
        resp = gene_counts[i, :].astype('int')
        ### skip lowly expressed or mostly-zero rows
        if sum(resp / sf) < CFG['min_count'] or sp.mean(resp == 0) > 0.6:
            continue
        for _ in range(10):
            modNB = sm.GLM(resp, matrix, family=sm.families.NegativeBinomial(alpha=disp), offset=sp.log(sf))
            glm_fit = modNB.fit()
            prev_disp = disp
            yhat = glm_fit.mu
            sign = -1.0
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                res = minimize_scalar(likelihood.adj_loglikelihood_scalar, args=(matrix, resp, yhat, sign), method='Bounded', bounds=(0, 10.0), tol=1e-5)
            disp = res.x
            ### record current estimate; stop once the log-dispersion settles
            converged = abs(sp.log(disp) - sp.log(prev_disp)) < 1e-4
            disp_raw[i] = disp
            disp_raw_conv[i] = converged
            if converged:
                break
    if log:
        log_progress(n_rows, n_rows)
    return (disp_raw, disp_raw_conv, idx)
def estimate_dispersion(gene_counts, matrix, sf, CFG):
    """Driver for raw per-row negative-binomial dispersion estimation.

    With CFG['parallel'] > 1 rows are processed in chunks of 30 on a
    multiprocessing pool (workers ignore SIGINT so Ctrl-C is handled here);
    otherwise the whole matrix is processed in-process. Optionally writes a
    mean/variance diagnostic plot. Returns (disp_raw, disp_raw_conv).
    """
    if CFG['verbose']:
        print 'Estimating raw dispersions'
    if CFG['parallel'] > 1:
        disp_raw = sp.empty((gene_counts.shape[0], 1), dtype='float')
        disp_raw.fill(sp.nan)
        disp_raw_conv = sp.zeros((gene_counts.shape[0], 1), dtype='bool')
        ### workers ignore SIGINT; KeyboardInterrupt is handled by the parent
        pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(x, min(x + binsize, gene_counts.shape[0])) for x in range(0, gene_counts.shape[0], binsize)]
        try:
            result = [pool.apply_async(estimate_dispersion_chunk, args=(gene_counts[idx, :], matrix, sf, CFG, idx,)) for idx in idx_chunks]
            res_cnt = 0
            while result:
                tmp = result.pop(0).get()
                ### scatter chunk-local results back to their global rows
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, gene_counts.shape[0])
                        res_cnt += 1
                    disp_raw[j] = tmp[0][i]
                    disp_raw_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(gene_counts.shape[0], gene_counts.shape[0])
                print ''
            pool.terminate()
            pool.join()
        except KeyboardInterrupt:
            print >> sys.stderr, 'Keyboard Interrupt - exiting'
            pool.terminate()
            pool.join()
            sys.exit(1)
    else:
        (disp_raw, disp_raw_conv, _) = estimate_dispersion_chunk(gene_counts, matrix, sf, CFG, sp.arange(gene_counts.shape[0]), log=CFG['verbose'])
    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=gene_counts, disp=disp_raw, matrix=matrix, figtitle='Raw Dispersion Estimate', filename=os.path.join(CFG['plot_dir'], 'dispersion_raw.pdf'), CFG=CFG)
    return (disp_raw, disp_raw_conv)
def test_count_chunk(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG, idx, log=False): pval = sp.zeros((gene_counts.shape[0], 1), dtype='float') pval.fill(sp.nan) for i in xrange(idx.shape[0]): if log: log_progress(i, idx.shape[0]) if sp.isnan(disp_adj[i]): continue response = gene_counts[i, :].astype('int') if sp.sum( response[:response.shape[0] / 2] == 0) >= CFG['max_0_frac'] * response.shape[0] / 2: pval[i] = 1 continue modNB0 = sm.GLM(response, dmatrix0, family=sm.families.NegativeBinomial(alpha=disp_adj[i]), offset=sp.log(sf)) modNB1 = sm.GLM(response, dmatrix1, family=sm.families.NegativeBinomial(alpha=disp_adj[i]), offset=sp.log(sf)) result0 = modNB0.fit() result1 = modNB1.fit() pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, dmatrix1.shape[1] - dmatrix0.shape[1]) if log: log_progress(idx.shape[0], idx.shape[0]) print '' return (pval, idx)
def test_count(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG):
    """Driver for the per-event NB likelihood-ratio test.

    With CFG['parallel'] > 1 rows are tested in chunks of 30 on a
    multiprocessing pool (workers ignore SIGINT so Ctrl-C is handled here);
    otherwise `test_count_chunk` runs over all rows in-process.

    Returns the (n, 1) array of p-values (NaN where no test was possible).
    """
    if CFG['verbose']:
        print 'Start the statistical test.'
    if CFG['parallel'] > 1:
        pval = sp.zeros((gene_counts.shape[0], 1), dtype='float')
        pval.fill(sp.nan)
        ### workers ignore SIGINT; KeyboardInterrupt is handled by the parent
        pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(x, min(x + binsize, gene_counts.shape[0])) for x in range(0, gene_counts.shape[0], binsize)]
        try:
            result = [pool.apply_async(test_count_chunk, args=(gene_counts[cidx, :], disp_adj[cidx], sf, dmatrix0, dmatrix1, CFG, cidx,)) for cidx in idx_chunks]
            res_cnt = 0
            while result:
                tmp = result.pop(0).get()
                ### scatter chunk-local p-values back to their global rows
                for i, j in enumerate(tmp[1]):
                    if CFG['verbose']:
                        log_progress(res_cnt, gene_counts.shape[0])
                        res_cnt += 1
                    pval[j] = tmp[0][i]
            if CFG['verbose']:
                log_progress(gene_counts.shape[0], gene_counts.shape[0])
                print ''
            pool.terminate()
            pool.join()
        except KeyboardInterrupt:
            print >> sys.stderr, 'Keyboard Interrupt - exiting'
            pool.terminate()
            pool.join()
            sys.exit(1)
    else:
        (pval, _) = test_count_chunk(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG, sp.arange(gene_counts.shape[0]), log=CFG['verbose'])
    if CFG['verbose']:
        print ''
    return pval
def test_count(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG): if CFG['verbose']: print 'Start the statistical test.' if CFG['parallel'] > 1: pval = sp.zeros((gene_counts.shape[0], 1), dtype='float') pval.fill(sp.nan) pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN)) binsize = 30 idx_chunks = [ sp.arange(x, min(x + binsize, gene_counts.shape[0])) for x in range(0, gene_counts.shape[0], binsize) ] try: result = [ pool.apply_async(test_count_chunk, args=( gene_counts[cidx, :], disp_adj[cidx], sf, dmatrix0, dmatrix1, CFG, cidx, )) for cidx in idx_chunks ] res_cnt = 0 while result: tmp = result.pop(0).get() for i, j in enumerate(tmp[1]): if CFG['verbose']: log_progress(res_cnt, gene_counts.shape[0]) res_cnt += 1 pval[j] = tmp[0][i] if CFG['verbose']: log_progress(gene_counts.shape[0], gene_counts.shape[0]) print '' pool.terminate() pool.join() except KeyboardInterrupt: print >> sys.stderr, 'Keyboard Interrupt - exiting' pool.terminate() pool.join() sys.exit(1) else: (pval, _) = test_count_chunk(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG, sp.arange(gene_counts.shape[0]), log=CFG['verbose']) if CFG['verbose']: print '' return pval
def adjust_dispersion(counts, dmatrix1, disp_raw, disp_fitted, idx, sf, CFG): if CFG['verbose']: print 'Start to estimate adjusted dispersions.' varLogDispSamp = polygamma( 1, (dmatrix1.shape[0] - dmatrix1.shape[1]) / 2) ## number of samples - number of coefficients varPrior = calculate_varPrior(disp_raw, disp_fitted, idx, varLogDispSamp) if CFG['parallel'] > 1: disp_adj = sp.empty((counts.shape[0], 1)) disp_adj.fill(sp.nan) disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool') pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN)) binsize = 30 idx_chunks = [ sp.arange(x, min(x + binsize, counts.shape[0])) for x in range(0, counts.shape[0], binsize) ] try: result = [ pool.apply_async(adjust_dispersion_chunk, args=( counts[cidx, :], dmatrix1, disp_raw[cidx], disp_fitted[cidx], varPrior, sf, CFG, cidx, )) for cidx in idx_chunks ] res_cnt = 0 while result: tmp = result.pop(0).get() for i, j in enumerate(tmp[2]): if CFG['verbose']: log_progress(res_cnt, counts.shape[0]) res_cnt += 1 disp_adj[j] = tmp[0][i] disp_adj_conv[j] = tmp[1][i] if CFG['verbose']: log_progress(counts.shape[0], counts.shape[0]) print '' pool.terminate() pool.join() except KeyboardInterrupt: print >> sys.stderr, 'Keyboard Interrupt - exiting' pool.terminate() pool.join() sys.exit(1) else: (disp_adj, disp_adj_conv, _) = adjust_dispersion_chunk(counts, dmatrix1, disp_raw, disp_fitted, varPrior, sf, CFG, sp.arange(counts.shape[0]), log=CFG['verbose']) if CFG['diagnose_plots']: plot.mean_variance_plot(counts=counts, disp=disp_adj, matrix=dmatrix1, figtitle='Adjusted Dispersion Estimate', filename=os.path.join( CFG['plot_dir'], 'dispersion_adjusted.pdf'), CFG=CFG) return (disp_adj, disp_adj_conv)
def adjust_dispersion_chunk(counts, dmatrix1, disp_raw, disp_fitted, varPrior, sf, CFG, idx, log=False): disp_adj = sp.empty((counts.shape[0], 1)) disp_adj.fill(sp.nan) disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool') error_cnt = 0 for i in range(idx.shape[0]): if log: log_progress(i, idx.shape[0]) if not sp.isnan(disp_raw[i]): ### init dispersion and response disp = 0.1 resp = counts[i, :].astype('int') ### run for max 10 iterations for j in range(10): modNB = sm.GLM(resp, dmatrix1, family=sm.families.NegativeBinomial(alpha=disp), offset=sp.log(sf)) result = modNB.fit() dispBef = disp yhat = result.mu sign = -1.0 with warnings.catch_warnings(): warnings.simplefilter("ignore") try: res = minimize_scalar( adj_loglikelihood_shrink_scalar_onedisper, args=(dmatrix1, resp, yhat, disp_fitted[i], varPrior, sign), method='Bounded', bounds=(0, 10.0), tol=1e-5) except TypeError: disp_adj[i] = disp disp_adj_conv[i] = False error_cnt += 1 break disp = res.x if abs(sp.log(disp) - sp.log(dispBef)) < 1e-4: disp_adj[i] = disp disp_adj_conv[i] = True break else: disp_adj[i] = disp disp_adj_conv[i] = False if log: log_progress(idx.shape[0], idx.shape[0]) print '' if error_cnt > 0: print 'Warning: %i events did not fit due to a TypeError' % error_cnt return (disp_adj, disp_adj_conv, idx)
def estimate_dispersion(gene_counts, matrix, sf, CFG): if CFG['verbose']: print 'Estimating raw dispersions' if CFG['parallel'] > 1: disp_raw = sp.empty((gene_counts.shape[0], 1), dtype='float') disp_raw.fill(sp.nan) disp_raw_conv = sp.zeros((gene_counts.shape[0], 1), dtype='bool') pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN)) binsize = 30 idx_chunks = [ sp.arange(x, min(x + binsize, gene_counts.shape[0])) for x in range(0, gene_counts.shape[0], binsize) ] try: result = [ pool.apply_async(estimate_dispersion_chunk, args=( gene_counts[idx, :], matrix, sf, CFG, idx, )) for idx in idx_chunks ] res_cnt = 0 while result: tmp = result.pop(0).get() for i, j in enumerate(tmp[2]): if CFG['verbose']: log_progress(res_cnt, gene_counts.shape[0]) res_cnt += 1 disp_raw[j] = tmp[0][i] disp_raw_conv[j] = tmp[1][i] if CFG['verbose']: log_progress(gene_counts.shape[0], gene_counts.shape[0]) print '' pool.terminate() pool.join() except KeyboardInterrupt: print >> sys.stderr, 'Keyboard Interrupt - exiting' pool.terminate() pool.join() sys.exit(1) else: (disp_raw, disp_raw_conv, _) = estimate_dispersion_chunk(gene_counts, matrix, sf, CFG, sp.arange(gene_counts.shape[0]), log=CFG['verbose']) if CFG['diagnose_plots']: plot.mean_variance_plot(counts=gene_counts, disp=disp_raw, matrix=matrix, figtitle='Raw Dispersion Estimate', filename=os.path.join(CFG['plot_dir'], 'dispersion_raw.pdf'), CFG=CFG) return (disp_raw, disp_raw_conv)
def get_gene_expression(CFG, fn_out=None, strain_subset=None):
    """Quantify per-gene expression from segment-graph counts.

    Loads gene annotation (Matlab .mat or Python pickle depending on
    CFG['is_matlab']), reads per-segment counts and lengths from the HDF5
    file CFG['fname_count_in'] and, for each gene, computes a length-
    weighted read count over its segments (restricted to non-alternative
    segments when CFG['non_alt_norm']), divided by CFG['read_length'].
    Optionally restricts columns to `strain_subset` and writes the result
    to `fn_out` (HDF5).

    Returns (gene_counts, strains, gene_names). Note `strains` is returned
    unfiltered while the count columns follow the subset.
    """
    if CFG['verbose']:
        sys.stdout.write('Quantifying gene expression ...\n')
    ### load gene information
    if CFG['is_matlab']:
        genes = scio.loadmat(CFG['fname_genes'], struct_as_record=False)['genes'][0, :]
        numgenes = len(genes)
    else:
        ### NOTE(review): the opened file handle is never closed here
        genes = cPickle.load(open(CFG['fname_genes'], 'r'))[0]
        numgenes = genes.shape[0]
    ### open hdf5 file containing graph count information
    IN = h5py.File(CFG['fname_count_in'], 'r')
    strains = IN['strains'][:].astype('str')
    if strain_subset is None:
        strain_idx = sp.arange(strains.shape[0])
    else:
        strain_idx = sp.where(sp.in1d(strains, strain_subset))[0]
    gene_counts = sp.zeros((numgenes, strain_idx.shape[0]), dtype='float')
    gene_names = sp.array([x.name for x in genes], dtype='str')
    if CFG['is_matlab']:
        seg_lens = IN['seg_len'][:, 0]
        gene_ids_segs = IN['gene_ids_segs'][0, :].astype('int') - 1  # matlab IDs are 1-based
    else:
        seg_lens = IN['seg_len'][:]
        gene_ids_segs = IN['gene_ids_segs'][:].astype('int')
    ### no longer assume that the gene_ids_segs are sorted by gene ID:
    ### stable-sort and take the first segment row of each gene
    s_idx = sp.argsort(gene_ids_segs[:, 0], kind='mergesort')
    _, u_idx = sp.unique(gene_ids_segs[s_idx, 0], return_index=True)
    s_idx = s_idx[u_idx]
    ### iterate over genes
    for gidx, iidx in enumerate(s_idx):
        if CFG['verbose']:
            log_progress(gidx, numgenes, 100)
        ### get idx of non alternative segments
        if CFG['is_matlab']:
            non_alt_idx = get_non_alt_seg_ids_matlab(genes[gidx])
            seg_idx = sp.arange(iidx, iidx + genes[gidx].segmentgraph[0, 2].shape[0])
            if len(seg_idx) == 0:
                continue
        else:
            non_alt_idx = genes[gidx].get_non_alt_seg_ids()
            seg_idx = sp.arange(iidx, iidx + genes[gidx].segmentgraph.seg_edges.shape[0])
        gene_idx = gene_ids_segs[seg_idx]
        if len(gene_idx.shape) > 0:
            gene_idx = gene_idx[0]
        ### sanity check: count file and annotation must agree on gene order
        if CFG['is_matlab']:
            assert (IN['gene_names'][gene_idx] == genes[gidx].name)
        else:
            assert (IN['gene_names'][:][gene_idx] == genes[gidx].name)
            assert (genes[gidx].name == gene_names[gidx])
        if CFG['non_alt_norm']:
            seg_idx = seg_idx[non_alt_idx]
        ### compute gene expression as the read count over all non alternative segments
        if CFG['is_matlab']:
            #gene_counts[gidx, :] = sp.dot(IN['segments'][:, seg_idx], IN['seg_len'][seg_idx, 0]) / sp.sum(IN['seg_len'][seg_idx, 0])
            gene_counts[gidx, :] = sp.dot(IN['segments'][:, seg_idx][strain_idx], seg_lens[seg_idx]) / CFG['read_length']
            #seg_offset += genes[gidx].segmentgraph[0, 2].shape[0]
        else:
            #gene_counts[gidx, :] = sp.dot(IN['segments'][seg_idx, :].T, IN['seg_len'][:][seg_idx]) / sp.sum(IN['seg_len'][:][seg_idx])
            if seg_idx.shape[0] > 1:
                gene_counts[gidx, :] = sp.dot(IN['segments'][seg_idx, :][:, strain_idx].T, seg_lens[seg_idx, 0]) / CFG['read_length']
            else:
                gene_counts[gidx, :] = IN['segments'][seg_idx, :][strain_idx] * seg_lens[seg_idx, 0] / CFG['read_length']
            #seg_offset += genes[gidx].segmentgraph.seg_edges.shape[0]
    IN.close()
    if CFG['verbose']:
        sys.stdout.write('\n... done.\n')
    ### write results to hdf5
    if fn_out is not None:
        OUT = h5py.File(fn_out, 'w')
        OUT.create_dataset(name='strains', data=strains[strain_idx])
        OUT.create_dataset(name='genes', data=gene_names)
        OUT.create_dataset(name='raw_count', data=gene_counts, compression="gzip")
        OUT.close()
    return (gene_counts, strains, gene_names)
def get_gene_expression(CFG, fn_out=None, strain_subset=None):
    """Quantify per-gene expression from segment-graph counts.

    Loads the gene annotation (Matlab .mat when CFG['is_matlab'], otherwise
    a Python pickle), reads per-segment counts and lengths from the HDF5
    file CFG['fname_count_in'] and, for each gene, computes a length-
    weighted read count over its segments (restricted to non-alternative
    segments when CFG['non_alt_norm']), divided by CFG['read_length'].

    Parameters:
        CFG           -- configuration dict (verbosity, file names, flags)
        fn_out        -- optional HDF5 output path for the result matrix
        strain_subset -- optional list of strain names to restrict columns to

    Returns (gene_counts, strains, gene_names); gene_counts has shape
    (num_genes, num_selected_strains). Note `strains` is returned
    unfiltered while the count columns follow the subset.
    """
    if CFG['verbose']:
        sys.stdout.write('Quantifying gene expression ...\n')
    ### load gene information
    if CFG['is_matlab']:
        genes = scio.loadmat(CFG['fname_genes'], struct_as_record=False)['genes'][0, :]
        numgenes = len(genes)
    else:
        ### fix: close the pickle file handle instead of leaking it
        with open(CFG['fname_genes'], 'r') as genes_file:
            genes = cPickle.load(genes_file)[0]
        numgenes = genes.shape[0]
    ### open hdf5 file containing graph count information
    IN = h5py.File(CFG['fname_count_in'], 'r')
    strains = IN['strains'][:].astype('str')
    if strain_subset is None:
        strain_idx = sp.arange(strains.shape[0])
    else:
        strain_idx = sp.where(sp.in1d(strains, strain_subset))[0]
    gene_counts = sp.zeros((numgenes, strain_idx.shape[0]), dtype='float')
    gene_names = sp.array([x.name for x in genes], dtype='str')
    if CFG['is_matlab']:
        seg_lens = IN['seg_len'][:, 0]
        gene_ids_segs = IN['gene_ids_segs'][0, :].astype('int') - 1  # matlab IDs are 1-based
    else:
        seg_lens = IN['seg_len'][:]
        gene_ids_segs = IN['gene_ids_segs'][:].astype('int')
    ### no longer assume that the gene_ids_segs are sorted by gene ID:
    ### stable-sort and keep the first segment row of each gene
    s_idx = sp.argsort(gene_ids_segs[:, 0], kind='mergesort')
    _, u_idx = sp.unique(gene_ids_segs[s_idx, 0], return_index=True)
    s_idx = s_idx[u_idx]
    ### iterate over genes
    for gidx, iidx in enumerate(s_idx):
        if CFG['verbose']:
            log_progress(gidx, numgenes, 100)
        ### get idx of non alternative segments
        if CFG['is_matlab']:
            non_alt_idx = get_non_alt_seg_ids_matlab(genes[gidx])
            seg_idx = sp.arange(iidx, iidx + genes[gidx].segmentgraph[0, 2].shape[0])
            if len(seg_idx) == 0:
                continue
        else:
            non_alt_idx = genes[gidx].get_non_alt_seg_ids()
            seg_idx = sp.arange(iidx, iidx + genes[gidx].segmentgraph.seg_edges.shape[0])
        gene_idx = gene_ids_segs[seg_idx]
        if len(gene_idx.shape) > 0:
            gene_idx = gene_idx[0]
        ### sanity check: count file and annotation must agree on gene order
        if CFG['is_matlab']:
            assert (IN['gene_names'][gene_idx] == genes[gidx].name)
        else:
            assert (IN['gene_names'][:][gene_idx] == genes[gidx].name)
            assert (genes[gidx].name == gene_names[gidx])
        if CFG['non_alt_norm']:
            seg_idx = seg_idx[non_alt_idx]
        ### compute gene expression as the read count over all non alternative segments
        if CFG['is_matlab']:
            #gene_counts[gidx, :] = sp.dot(IN['segments'][:, seg_idx], IN['seg_len'][seg_idx, 0]) / sp.sum(IN['seg_len'][seg_idx, 0])
            gene_counts[gidx, :] = sp.dot(IN['segments'][:, seg_idx][strain_idx], seg_lens[seg_idx]) / CFG['read_length']
            #seg_offset += genes[gidx].segmentgraph[0, 2].shape[0]
        else:
            #gene_counts[gidx, :] = sp.dot(IN['segments'][seg_idx, :].T, IN['seg_len'][:][seg_idx]) / sp.sum(IN['seg_len'][:][seg_idx])
            if seg_idx.shape[0] > 1:
                gene_counts[gidx, :] = sp.dot(IN['segments'][seg_idx, :][:, strain_idx].T, seg_lens[seg_idx, 0]) / CFG['read_length']
            else:
                gene_counts[gidx, :] = IN['segments'][seg_idx, :][strain_idx] * seg_lens[seg_idx, 0] / CFG['read_length']
            #seg_offset += genes[gidx].segmentgraph.seg_edges.shape[0]
    IN.close()
    if CFG['verbose']:
        sys.stdout.write('\n... done.\n')
    ### write results to hdf5
    if fn_out is not None:
        OUT = h5py.File(fn_out, 'w')
        OUT.create_dataset(name='strains', data=strains[strain_idx])
        OUT.create_dataset(name='genes', data=gene_names)
        OUT.create_dataset(name='raw_count', data=gene_counts, compression="gzip")
        OUT.close()
    return (gene_counts, strains, gene_names)