import cPickle
import math
import os
import re
import sys

import h5py
import scipy as sp

### Segmentgraph, count_graph_coverage, appendToHDF5, filter_by_edgecount,
### merge_genes_by_splicegraph, merge_chunks_by_splicegraph,
### merge_genes_by_isoform, and the cluster helpers (rproc as rp) are
### provided elsewhere in the package; their import paths are omitted here.


def count_graph_coverage_wrapper(fname_in, fname_out, CFG):

    (genes, inserted) = cPickle.load(open(fname_in, 'r'))

    ### build segment graphs on demand and persist them back into the input pickle
    if genes[0].segmentgraph is None or genes[0].segmentgraph.is_empty():
        for g in genes:
            g.segmentgraph = Segmentgraph(g)
        cPickle.dump((genes, inserted), open(fname_in, 'w'), -1)

    counts = dict()
    counts['segments'] = []
    counts['seg_pos'] = []
    counts['gene_ids_segs'] = []
    counts['edges'] = []
    counts['gene_ids_edges'] = []
    counts['seg_len'] = sp.hstack([x.segmentgraph.segments[1, :] - x.segmentgraph.segments[0, :] for x in genes]).T
    counts['gene_names'] = sp.array([x.name for x in genes], dtype='str')

    if not CFG['rproc']:
        ### count locally, one strain at a time
        for s_idx in range(CFG['strains'].shape[0]):
            print '\n%i/%i' % (s_idx + 1, CFG['strains'].shape[0])
            if s_idx == 0:
                counts_tmp = count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)
            else:
                counts_tmp = sp.r_[sp.atleast_2d(counts_tmp), count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)]

        for c in range(counts_tmp.shape[1]):
            counts['segments'].append(sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
            counts['seg_pos'].append(sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
            counts['gene_ids_segs'].append(sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * c)
            tmp = [sp.atleast_2d(x.edges) for x in counts_tmp[:, c] if x.edges.shape[0] > 0]
            if len(tmp) == 0:
                continue
            tmp = sp.hstack(tmp)
            if tmp.shape[0] > 0:
                counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, range(1, tmp.shape[1], 2)]])
                counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * c)

        ### write result data to hdf5
        for key in counts:
            counts[key] = sp.vstack(counts[key]) if len(counts[key]) > 0 else counts[key]
        counts['edge_idx'] = counts['edges'][:, 0] if len(counts['edges']) > 0 else sp.array([])
        counts['edges'] = counts['edges'][:, 1:] if len(counts['edges']) > 0 else sp.array([])
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='strains', data=CFG['strains'])
        for key in counts:
            h5fid.create_dataset(name=key, data=counts[key])
        h5fid.close()
    else:
        ### use an adaptive chunk size that takes the number of strains into
        ### account (take as many genes per chunk as it takes to count ~10K
        ### gene/strain combinations per job)
        chunksize = int(max(1, math.floor(10000 / len(CFG['strains']))))
        jobinfo = []
        PAR = dict()
        PAR['CFG'] = CFG
        for c_idx in range(0, genes.shape[0], chunksize):
            cc_idx = min(genes.shape[0], c_idx + chunksize)
            fn = fname_out.replace('.pickle', '.chunk_%i_%i.pickle' % (c_idx, cc_idx))
            if os.path.exists(fn):
                continue
            else:
                print 'submitting chunk %i to %i' % (c_idx, cc_idx)
                PAR['genes'] = genes[c_idx:cc_idx]
                PAR['fn_bam'] = CFG['bam_fnames']
                PAR['fn_out'] = fn
                jobinfo.append(rp.rproc('count_graph_coverage', PAR, 6000, CFG['options_rproc'], 60 * 48))
        rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge results from count chunks
        if 'verbose' in CFG and CFG['verbose']:
            print '\nCollecting count data from chunks ...\n'
            print 'writing data to %s' % fname_out

        ### write data to hdf5 continuously
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='gene_names', data=counts['gene_names'])
        h5fid.create_dataset(name='seg_len', data=counts['seg_len'])
        h5fid.create_dataset(name='strains', data=CFG['strains'])
        for c_idx in range(0, genes.shape[0], chunksize):
            cc_idx = min(genes.shape[0], c_idx + chunksize)
            if 'verbose' in CFG and CFG['verbose']:
                print 'collecting chunk %i-%i (%i)' % (c_idx, cc_idx, genes.shape[0])
            fn = fname_out.replace('.pickle', '.chunk_%i_%i.pickle' % (c_idx, cc_idx))
            if not os.path.exists(fn):
                print >> sys.stderr, 'ERROR: Not all chunks in counting graph coverage completed!'
                sys.exit(1)
            else:
                counts_tmp = cPickle.load(open(fn, 'r'))
                for c in range(counts_tmp.shape[1]):
                    if 'segments' in h5fid:
                        appendToHDF5(h5fid, sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]), 'segments')
                        appendToHDF5(h5fid, sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]), 'seg_pos')
                        appendToHDF5(h5fid, sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * (c_idx + c), 'gene_ids_segs')
                    else:
                        h5fid.create_dataset(name='segments', data=sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]), chunks=True, compression='gzip', maxshape=(None, len(CFG['strains'])))
                        h5fid.create_dataset(name='seg_pos', data=sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]), chunks=True, compression='gzip', maxshape=(None, len(CFG['strains'])))
                        h5fid.create_dataset(name='gene_ids_segs', data=sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * (c_idx + c), chunks=True, compression='gzip', maxshape=(None, 1))
                    tmp = [sp.atleast_2d(x.edges) for x in counts_tmp[:, c] if x.edges.shape[0] > 0]
                    if len(tmp) == 0:
                        continue
                    tmp = sp.hstack(tmp)
                    if tmp.shape[0] > 0:
                        if 'edges' in h5fid:
                            appendToHDF5(h5fid, tmp[:, range(1, tmp.shape[1], 2)], 'edges')
                            appendToHDF5(h5fid, tmp[:, 0], 'edge_idx')
                            appendToHDF5(h5fid, sp.ones((tmp.shape[0], 1), dtype='int') * (c_idx + c), 'gene_ids_edges')
                        else:
                            h5fid.create_dataset(name='edges', data=tmp[:, range(1, tmp.shape[1], 2)], chunks=True, compression='gzip', maxshape=(None, tmp.shape[1] / 2))
                            h5fid.create_dataset(name='edge_idx', data=tmp[:, 0], chunks=True, compression='gzip', maxshape=(None,))
                            h5fid.create_dataset(name='gene_ids_edges', data=sp.ones((tmp.shape[0], 1), dtype='int') * (c_idx + c), chunks=True, compression='gzip', maxshape=(None, 1))
                del tmp, counts_tmp
        h5fid.close()
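
### The streaming branch above grows HDF5 datasets chunk by chunk through
### appendToHDF5, a helper defined elsewhere in the package. The sketch below
### is an assumption inferred from the call sites, not the package's actual
### implementation; it relies on the datasets having been created with
### maxshape=(None, ...) as done above, so they can grow along axis 0.
def appendToHDF5_sketch(fid, data, name):
    ### grow dataset `name` along axis 0 and write `data` into the new rows
    old = fid[name].shape
    fid[name].resize((old[0] + data.shape[0],) + old[1:])
    fid[name][old[0]:] = data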

### variant of count_graph_coverage_wrapper adding per-sample counting
### (merge_strategy 'single') and chunk file names derived from the .hdf5 output
def count_graph_coverage_wrapper(fname_in, fname_out, CFG, sample_idx=None):

    (genes, inserted) = cPickle.load(open(fname_in, 'r'))

    if genes[0].segmentgraph is None or genes[0].segmentgraph.is_empty():
        for g in genes:
            g.segmentgraph = Segmentgraph(g)
        cPickle.dump((genes, inserted), open(fname_in, 'w'), -1)

    counts = dict()
    counts['segments'] = []
    counts['seg_pos'] = []
    counts['gene_ids_segs'] = []
    counts['edges'] = []
    counts['gene_ids_edges'] = []
    counts['seg_len'] = sp.hstack([x.segmentgraph.segments[1, :] - x.segmentgraph.segments[0, :] for x in genes]).T
    counts['gene_names'] = sp.array([x.name for x in genes], dtype='str')

    if not CFG['rproc']:
        if CFG['merge_strategy'] == 'single':
            print '\nprocessing %s' % (CFG['samples'][sample_idx])
            counts_tmp = count_graph_coverage(genes, CFG['bam_fnames'][sample_idx], CFG)
        else:
            for s_idx in range(CFG['strains'].shape[0]):
                print '\n%i/%i' % (s_idx + 1, CFG['strains'].shape[0])
                if s_idx == 0:
                    counts_tmp = count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)
                else:
                    counts_tmp = sp.r_[sp.atleast_2d(counts_tmp), count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)]

        for c in range(counts_tmp.shape[1]):
            counts['segments'].append(sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
            counts['seg_pos'].append(sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
            counts['gene_ids_segs'].append(sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * c)
            tmp = [sp.atleast_2d(x.edges) for x in counts_tmp[:, c] if x.edges.shape[0] > 0]
            if len(tmp) == 0:
                continue
            tmp = sp.hstack(tmp)
            if tmp.shape[0] > 0:
                counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, range(1, tmp.shape[1], 2)]])
                counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * c)

        ### write result data to hdf5
        for key in counts:
            counts[key] = sp.vstack(counts[key]) if len(counts[key]) > 0 else counts[key]
        counts['edge_idx'] = counts['edges'][:, 0] if len(counts['edges']) > 0 else sp.array([])
        counts['edges'] = counts['edges'][:, 1:] if len(counts['edges']) > 0 else sp.array([])
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='strains', data=CFG['strains'])
        for key in counts:
            h5fid.create_dataset(name=key, data=counts[key])
        h5fid.close()
    else:
        ### use an adaptive chunk size that takes the number of strains into
        ### account (take as many genes per chunk as it takes to count ~10K
        ### gene/strain combinations per job)
        chunksize = int(max(1, math.floor(10000 / len(CFG['strains']))))
        jobinfo = []
        PAR = dict()
        PAR['CFG'] = CFG.copy()
        if CFG['merge_strategy'] == 'single':
            PAR['CFG']['bam_fnames'] = PAR['CFG']['bam_fnames'][sample_idx]
            PAR['CFG']['samples'] = PAR['CFG']['samples'][sample_idx]
            PAR['CFG']['strains'] = PAR['CFG']['strains'][sample_idx]
        #s_idx = sp.argsort([x.chr for x in genes]) # TODO
        s_idx = sp.arange(genes.shape[0])
        for c_idx in range(0, s_idx.shape[0], chunksize):
            cc_idx = min(s_idx.shape[0], c_idx + chunksize)
            fn = re.sub(r'\.hdf5$', '', fname_out) + '.chunk_%i_%i.pickle' % (c_idx, cc_idx)
            if os.path.exists(fn):
                continue
            else:
                print 'submitting chunk %i to %i (%i)' % (c_idx, cc_idx, s_idx.shape[0])
                PAR['genes'] = genes[s_idx][c_idx:cc_idx]
                PAR['fn_bam'] = CFG['bam_fnames']
                PAR['fn_out'] = fn
                jobinfo.append(rp.rproc('count_graph_coverage', PAR, 15000, CFG['options_rproc'], 60 * 12))
        rp.rproc_wait(jobinfo, 30, 1.0, -1)
        del genes

        ### merge results from count chunks
        if 'verbose' in CFG and CFG['verbose']:
            print '\nCollecting count data from chunks ...\n'
            print 'writing data to %s' % fname_out

        ### write data to hdf5 continuously
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='gene_names', data=counts['gene_names'])
        h5fid.create_dataset(name='seg_len', data=counts['seg_len'])
        h5fid.create_dataset(name='strains', data=CFG['strains'])
        for c_idx in range(0, s_idx.shape[0], chunksize):
            cc_idx = min(s_idx.shape[0], c_idx + chunksize)
            if 'verbose' in CFG and CFG['verbose']:
                print 'collecting chunk %i-%i (%i)' % (c_idx, cc_idx, s_idx.shape[0])
            fn = re.sub(r'\.hdf5$', '', fname_out) + '.chunk_%i_%i.pickle' % (c_idx, cc_idx)
            if not os.path.exists(fn):
                print >> sys.stderr, 'ERROR: Not all chunks in counting graph coverage completed!'
                sys.exit(1)
            else:
                counts_tmp = cPickle.load(open(fn, 'r'))
                for c in range(counts_tmp.shape[1]):
                    if 'segments' in h5fid:
                        appendToHDF5(h5fid, sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]), 'segments')
                        appendToHDF5(h5fid, sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]), 'seg_pos')
                        appendToHDF5(h5fid, sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * (s_idx[c_idx + c]), 'gene_ids_segs')
                    else:
                        h5fid.create_dataset(name='segments', data=sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]), chunks=True, compression='gzip', maxshape=(None, len(CFG['strains'])))
                        h5fid.create_dataset(name='seg_pos', data=sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]), chunks=True, compression='gzip', maxshape=(None, len(CFG['strains'])))
                        h5fid.create_dataset(name='gene_ids_segs', data=sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * (s_idx[c_idx + c]), chunks=True, compression='gzip', maxshape=(None, 1))
                    tmp = [sp.atleast_2d(x.edges) for x in counts_tmp[:, c] if x.edges.shape[0] > 0]
                    if len(tmp) == 0:
                        continue
                    tmp = sp.hstack(tmp)
                    if tmp.shape[0] > 0:
                        if 'edges' in h5fid:
                            appendToHDF5(h5fid, tmp[:, range(1, tmp.shape[1], 2)], 'edges')
                            appendToHDF5(h5fid, tmp[:, 0], 'edge_idx')
                            appendToHDF5(h5fid, sp.ones((tmp.shape[0], 1), dtype='int') * (s_idx[c_idx + c]), 'gene_ids_edges')
                        else:
                            h5fid.create_dataset(name='edges', data=tmp[:, range(1, tmp.shape[1], 2)], chunks=True, compression='gzip', maxshape=(None, tmp.shape[1] / 2))
                            h5fid.create_dataset(name='edge_idx', data=tmp[:, 0], chunks=True, compression='gzip', maxshape=(None,))
                            h5fid.create_dataset(name='gene_ids_edges', data=sp.ones((tmp.shape[0], 1), dtype='int') * (s_idx[c_idx + c]), chunks=True, compression='gzip', maxshape=(None, 1))
                del tmp, counts_tmp
        h5fid.close()
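
### Layout note for the edge arrays handled above: each x.edges is presumably
### a two-column matrix [edge_index, count] for one strain, so hstacking the
### per-strain matrices yields [idx, count_1, idx, count_2, ...]. Column 0 is
### then the edge index and the odd columns are the per-strain counts, which
### is what tmp[:, 0] and tmp[:, range(1, tmp.shape[1], 2)] pick out. A toy
### illustration with two strains and hypothetical values:
def _edge_layout_example():
    e1 = sp.array([[4, 10], [7, 3]])   # strain 1: edges 4 and 7 with counts 10, 3
    e2 = sp.array([[4, 12], [7, 0]])   # strain 2: same edges, different counts
    tmp = sp.hstack([e1, e2])          # [[4 10  4 12], [7  3  7  0]]
    print tmp[:, 0]                           # edge indices: [4 7]
    print tmp[:, range(1, tmp.shape[1], 2)]   # per-strain counts: [[10 12], [3 0]]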

def run_merge(CFG):

    merge_all = (CFG['merge_strategy'] == 'merge_all')
    merge_all_tag = ''
    if merge_all:
        merge_all_tag = '_merged_bams'
    prune_tag = ''
    if CFG['do_prune']:
        prune_tag = '_pruned'

    chunksize = 50

    fn_out = '%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    fn_out_val = '%s/spladder/genes_graph_conf%i.%s%s.validated.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    if CFG['validate_splicegraphs']:
        fn_out_count = '%s/spladder/genes_graph_conf%i.%s%s.validated.count.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    else:
        fn_out_count = '%s/spladder/genes_graph_conf%i.%s%s.count.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)

    if not os.path.exists(fn_out):
        if not CFG['rproc']:
            merge_genes_by_splicegraph(CFG)
        else:
            jobinfo = []
            PAR = dict()
            PAR['CFG'] = CFG
            if chunksize > 0:
                merge_list_len = len(CFG['samples'])
                if merge_all:
                    merge_list_len += 1
                for c_idx in range(0, merge_list_len, chunksize):
                    fn = '%s/spladder/genes_graph_conf%i.%s%s_chunk%i_%i.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag, c_idx, min(merge_list_len, c_idx + chunksize))
                    if os.path.exists(fn):
                        continue
                    else:
                        print 'submitting chunk %i to %i' % (c_idx, min(merge_list_len, c_idx + chunksize))
                        PAR['chunk_idx'] = range(c_idx, min(merge_list_len, c_idx + chunksize))
                        jobinfo.append(rp.rproc('merge_genes_by_splicegraph', PAR, 50000, CFG['options_rproc'], 40 * 60))
            else:
                jobinfo.append(rp.rproc('merge_genes_by_splicegraph', PAR, 10000, CFG['options_rproc'], 40 * 60))
            rp.rproc_wait(jobinfo, 30, 1.0, -1)
            ### merge chunks
            if chunksize > 0:
                PAR['chunksize'] = chunksize
                merge_chunks_by_splicegraph(PAR)
    else:
        print 'File %s already exists!' % fn_out

    ### generate validated version of splice graph
    if CFG['validate_splicegraphs'] and not os.path.exists(fn_out_val):
        (genes, inserted) = cPickle.load(open(fn_out, 'r'))
        genes = filter_by_edgecount(genes, CFG)
        cPickle.dump((genes, inserted), open(fn_out_val, 'w'), -1)
        del genes

    ### count segment graph
    if CFG['validate_splicegraphs']:
        count_graph_coverage_wrapper(fn_out_val, fn_out_count, CFG)
    else:
        count_graph_coverage_wrapper(fn_out, fn_out_count, CFG)

    if CFG['do_gen_isoforms']:
        fn_out = '%s/spladder/genes_graph_conf%i.%s%s_isoforms.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
        if not os.path.exists(fn_out):
            if not CFG['rproc']:
                ### NOTE: `experiment` is not defined in this scope; this call
                ### appears to be a leftover from an earlier interface
                merge_genes_by_isoform(CFG['out_dirname'], CFG['confidence_level'], merge_all, experiment)
            else:
                jobinfo = [rp.rproc('merge_genes_by_isoform', PAR, 10000, CFG['options_rproc'], 40 * 60)]
                rp.rproc_wait(jobinfo, 30, 1.0, 1)
        else:
            print 'File %s already exists!' % fn_out
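
### The chunked submission above partitions the merge list into half-open
### index ranges. A quick check of the boundary arithmetic for a hypothetical
### run with 120 samples and chunksize 50:
def _merge_chunk_example():
    merge_list_len = 120
    chunksize = 50
    print [(c_idx, min(merge_list_len, c_idx + chunksize))
           for c_idx in range(0, merge_list_len, chunksize)]
    # prints [(0, 50), (50, 100), (100, 120)]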

def run_merge(CFG):

    merge_all = (CFG['merge_strategy'] == 'merge_all')
    merge_all_tag = ''
    if merge_all:
        merge_all_tag = '_merged_bams'
    prune_tag = ''
    if CFG['do_prune']:
        prune_tag = '_pruned'

    chunksize = 10

    fn_out = '%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    #fn_out_val = '%s/spladder/genes_graph_conf%i.%s%s.validated.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    if CFG['validate_splicegraphs']:
        fn_out_count = '%s/spladder/genes_graph_conf%i.%s%s.validated.count.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    else:
        fn_out_count = '%s/spladder/genes_graph_conf%i.%s%s.count.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)

    if not os.path.exists(fn_out):
        if not CFG['rproc']:
            merge_list = sp.array(['%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], x, prune_tag) for x in CFG['samples']])
            merge_genes_by_splicegraph(CFG, merge_list=merge_list, fn_out=fn_out)
        else:
            jobinfo = []
            PAR = dict()
            PAR['CFG'] = CFG
            if chunksize > 0:
                levels = int(math.ceil(math.log(len(CFG['samples']), chunksize)))
                level_files = dict()
                for level in range(1, levels + 1):
                    print 'merging files on level %i' % level
                    if level == 1:
                        merge_list = sp.array(['%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], x, prune_tag) for x in CFG['samples']])
                    else:
                        merge_list = sp.array(level_files[level - 1])
                    level_files[level] = []
                    for c_idx in range(0, len(merge_list), chunksize):
                        if level == levels:
                            assert(len(merge_list) <= chunksize)
                            fn = fn_out
                        else:
                            fn = '%s/spladder/genes_graph_conf%i.%s%s_level%i_chunk%i_%i.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag, level, c_idx, min(len(merge_list), c_idx + chunksize))
                        level_files[level].append(fn)
                        if os.path.exists(fn):
                            continue
                        else:
                            print 'submitting level %i chunk %i to %i' % (level, c_idx, min(len(merge_list), c_idx + chunksize))
                            chunk_idx = range(c_idx, min(len(merge_list), c_idx + chunksize))
                            PAR['merge_list'] = merge_list[chunk_idx]
                            PAR['fn_out'] = fn
                            jobinfo.append(rp.rproc('merge_genes_by_splicegraph', PAR, 20000 * level, CFG['options_rproc'], 40 * 60))
                    rp.rproc_wait(jobinfo, 30, 1.0, -1)
            else:
                PAR['merge_list'] = CFG['samples']
                PAR['fn_out'] = fn_out
                jobinfo.append(rp.rproc('merge_genes_by_splicegraph', PAR, 10000, CFG['options_rproc'], 40 * 60))
                rp.rproc_wait(jobinfo, 30, 1.0, -1)
    else:
        print 'File %s already exists!' % fn_out

    ### generate validated version of splice graph
    #if CFG['validate_splicegraphs'] and not os.path.exists(fn_out_val):
    #    (genes, inserted) = cPickle.load(open(fn_out, 'r'))
    #    genes = filter_by_edgecount(genes, CFG)
    #    cPickle.dump((genes, inserted), open(fn_out_val, 'w'), -1)
    #    del genes

    ### count segment graph
    #if CFG['validate_splicegraphs']:
    #    count_graph_coverage_wrapper(fn_out_val, fn_out_count, CFG)
    #else:
    #    count_graph_coverage_wrapper(fn_out, fn_out_count, CFG)

    if CFG['do_gen_isoforms']:
        fn_out = '%s/spladder/genes_graph_conf%i.%s%s_isoforms.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
        if not os.path.exists(fn_out):
            if not CFG['rproc']:
                merge_genes_by_isoform(CFG['out_dirname'], CFG['confidence_level'], merge_all, experiment)
            else:
                jobinfo = [rp.rproc('merge_genes_by_isoform', PAR, 10000, CFG['options_rproc'], 40 * 60)]
                rp.rproc_wait(jobinfo, 30, 1.0, 1)
        else:
            print 'File %s already exists!' % fn_out
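
### The hierarchical variant above needs ceil(log_chunksize(n_samples)) merge
### levels; each level collapses up to `chunksize` graphs into one file until
### a single merged graph remains. A quick check of that arithmetic for a few
### hypothetical sample counts:
def _merge_level_example():
    chunksize = 10
    for n in [9, 10, 250, 1000]:
        levels = int(math.ceil(math.log(n, chunksize)))
        sizes = [n]
        for level in range(levels):
            sizes.append(int(math.ceil(sizes[-1] / float(chunksize))))
        print '%4i samples -> %i levels: %s' % (n, levels, sizes)
    # e.g. 250 samples -> 3 levels: [250, 25, 3, 1]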

def count_graph_coverage_wrapper(fname_in, fname_out, CFG):

    (genes, inserted) = cPickle.load(open(fname_in, 'r'))

    if genes[0].segmentgraph is None:
        for g in genes:
            g.segmentgraph = Segmentgraph(g)
        cPickle.dump((genes, inserted), open(fname_in, 'w'), -1)

    counts = dict()
    counts['segments'] = []
    counts['seg_pos'] = []
    counts['gene_ids_segs'] = []
    counts['edges'] = []
    counts['gene_ids_edges'] = []

    if not CFG['rproc']:
        for s_idx in range(CFG['strains'].shape[0]):
            print '\n%i/%i' % (s_idx + 1, CFG['strains'].shape[0])
            if s_idx == 0:
                counts_tmp = count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)
            else:
                counts_tmp = sp.r_[sp.atleast_2d(counts_tmp), count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)]

        for c in range(counts_tmp.shape[1]):
            counts['segments'].append(sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
            counts['seg_pos'].append(sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
            counts['gene_ids_segs'].append(sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * c)
            tmp = sp.hstack([sp.atleast_2d(x.edges) for x in counts_tmp[:, c]])
            if tmp.shape[0] > 0:
                counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, range(1, tmp.shape[1], 2)]])
                counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * c)
    else:
        ### have an adaptive chunk size, that takes into account the number of strains (take as many genes as it takes to have ~10K strains)
        chunksize = int(max(1, math.floor(10000 / len(CFG['strains']))))
        jobinfo = []
        PAR = dict()
        PAR['CFG'] = CFG
        for c_idx in range(0, genes.shape[0], chunksize):
            cc_idx = min(genes.shape[0], c_idx + chunksize)
            fn = fname_out.replace('.pickle', '.chunk_%i_%i.pickle' % (c_idx, cc_idx))
            if os.path.exists(fn):
                continue
            else:
                print 'submitting chunk %i to %i' % (c_idx, cc_idx)
                PAR['genes'] = genes[c_idx:cc_idx]
                PAR['fn_bam'] = CFG['bam_fnames']
                PAR['fn_out'] = fn
                PAR['CFG'] = CFG
                jobinfo.append(rp.rproc('count_graph_coverage', PAR, 30000, CFG['options_rproc'], 60))
        rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge results
        for c_idx in range(0, genes.shape[0], chunksize):
            cc_idx = min(genes.shape[0], c_idx + chunksize)
            fn = fname_out.replace('.pickle', '.chunk_%i_%i.pickle' % (c_idx, cc_idx))
            if not os.path.exists(fn):
                print >> sys.stderr, 'ERROR: Not all chunks in counting graph coverage completed!'
                sys.exit(1)
            else:
                counts_tmp = cPickle.load(open(fn, 'r'))
                for c in range(counts_tmp.shape[1]):
                    counts['segments'].append(sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
                    counts['seg_pos'].append(sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
                    counts['gene_ids_segs'].append(sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * (c_idx + c))
                    tmp = sp.hstack([sp.atleast_2d(x.edges) for x in counts_tmp[:, c]])
                    if tmp.shape[0] > 0:
                        counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, range(1, tmp.shape[1], 2)]])
                        counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * (c_idx + c))

    for key in counts:
        if len(counts[key]) > 0:
            counts[key] = sp.vstack(counts[key])
    if len(counts['edges']) > 0:
        counts['edge_idx'] = counts['edges'][:, 0]
        counts['edges'] = counts['edges'][:, 1:]
    else:
        counts['edge_idx'] = sp.array([])
        counts['edges'] = sp.array([])
    counts['seg_len'] = sp.hstack([x.segmentgraph.segments[1, :] - x.segmentgraph.segments[0, :] for x in genes]).T

    ### write result data to hdf5
    h5fid = h5py.File(fname_out, 'w')
    h5fid.create_dataset(name='gene_names', data=sp.array([x.name for x in genes], dtype='str'))
    h5fid.create_dataset(name='strains', data=CFG['strains'])
    for key in counts:
        h5fid.create_dataset(name=key, data=counts[key])
    h5fid.close()
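
### A hypothetical, minimal invocation of the wrapper (file names and CFG
### keys are sketched from their use above; in the real pipeline, CFG is
### assembled centrally by the surrounding SplAdder code):
if __name__ == '__main__':
    CFG = dict()
    CFG['rproc'] = False
    CFG['verbose'] = True
    CFG['strains'] = sp.array(['sampleA', 'sampleB'])
    CFG['bam_fnames'] = sp.array(['sampleA.bam', 'sampleB.bam'])
    count_graph_coverage_wrapper('genes_graph.pickle', 'genes_graph.count.hdf5', CFG)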