def create_polish_pickle(n_polish_chunks_in_bins, flnc_files, out_pickle):
    """
    Parameters:
      n_polish_chunks_in_bins -- number of ice_polish chunks in each bin
      flnc_files -- full-length non-chimeric files in bins
      out_pickle -- output pickle for saving PolishChunkTask objects
    """
    n_bins = len(flnc_files)
    assert isinstance(n_polish_chunks_in_bins, list)
    assert len(n_polish_chunks_in_bins) == n_bins

    log.info("Writing %d ice_polish chunk tasks to %s.",
             sum(n_polish_chunks_in_bins), out_pickle)
    p = ChunkTasksPickle()
    for i, flnc_file in enumerate(flnc_files):
        log.debug("Creating %d ice_polish chunks for bin index=%d.",
                  n_polish_chunks_in_bins[i], i)
        cluster_out_dir = _get_cluster_out_dir(flnc_file)
        for j in range(n_polish_chunks_in_bins[i]):
            # Create polish chunk tasks.
            task_ = PolishChunkTask(cluster_bin_index=i,
                                    flnc_file=flnc_file,
                                    cluster_out_dir=cluster_out_dir,
                                    polish_index=j,
                                    n_polish_chunks=n_polish_chunks_in_bins[i])
            p.append(task_)
    p.write(out_pickle)
    log.info("Saved %d polish chunk tasks to %s.",
             sum(n_polish_chunks_in_bins), out_pickle)
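A minimal usage sketch (hypothetical paths and chunk counts): bin i contributes n_polish_chunks_in_bins[i] tasks, so the pickle below holds 3 + 2 = 5 PolishChunkTask objects.

# Hypothetical example -- paths and chunk counts are illustrative only.
flnc_files = ["cluster_bins/0/isoseq_flnc.contigset.xml",
              "cluster_bins/1/isoseq_flnc.contigset.xml"]
create_polish_pickle(n_polish_chunks_in_bins=[3, 2],
                     flnc_files=flnc_files,
                     out_pickle="polish_chunks.pickle")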
def create_partial_pickle(flnc_files, chunked_nfl_files, out_pickle):
    """
    Parameters:
      flnc_files -- full-length non-chimeric files in bins
      chunked_nfl_files -- chunked non-full-length (nfl) files
      out_pickle -- output pickle for saving PartialChunkTask objects
    """
    n_bins = len(flnc_files)
    # Guard against an empty nfl chunk list so the task count stays sane.
    n_nfl_chunks = max(1, len(chunked_nfl_files))
    log.info("Writing %d ice_partial chunk tasks to %s.",
             n_bins * n_nfl_chunks, out_pickle)
    p = ChunkTasksPickle()
    for i, flnc_file in enumerate(flnc_files):
        log.debug("Processing cluster bin index=%d.", i)
        cluster_out_dir = _get_cluster_out_dir(flnc_file)
        for j, nfl_file in enumerate(chunked_nfl_files):
            # Create partial chunk tasks.
            task_ = PartialChunkTask(cluster_bin_index=i,
                                     flnc_file=flnc_file,
                                     cluster_out_dir=cluster_out_dir,
                                     nfl_file=nfl_file,
                                     nfl_index=j,
                                     n_nfl_chunks=n_nfl_chunks)
            p.append(task_)
    p.write(out_pickle)
    log.info("Saved %d partial chunk tasks to %s.",
             n_bins * n_nfl_chunks, out_pickle)
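Partial tasks form the cross product of cluster bins and nfl chunks; a sketch with hypothetical file names, yielding 2 x 3 = 6 PartialChunkTask objects:

# Hypothetical example -- 2 bins x 3 nfl chunks -> 6 PartialChunkTask objects.
flnc_files = ["cluster_bins/0/isoseq_flnc.contigset.xml",
              "cluster_bins/1/isoseq_flnc.contigset.xml"]
nfl_chunks = ["nfl.chunk0.contigset.xml",
              "nfl.chunk1.contigset.xml",
              "nfl.chunk2.contigset.xml"]
create_partial_pickle(flnc_files, nfl_chunks,
                      out_pickle="partial_chunks.pickle")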
def resolved_tool_contract_runner(rtc):
    """Given a resolved tool contract, run ice_partial on each
    PartialChunkTask in the input pickle."""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all(isinstance(task, PartialChunkTask) for task in p)
    dummy_sentinel_file = rtc.task.input_files[1]
    ccs_file = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
        if len(rtc.task.tmpdir_resources) > 0 else None

    log.info("Looking for QVs in CCS input...")
    with ConsensusReadSet(ccs_file) as ds:
        for bam in ds.resourceReaders():
            qvs = bam.pulseFeaturesAvailable()
            if qvs != set(['SubstitutionQV', 'InsertionQV', 'DeletionQV']):
                log.warning("Missing QV fields from %s, will use default "
                            "probabilities", bam.filename)
                ccs_file = None
                break

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info("Running ice_partial on cluster bin %d, nfl chunk %d/%d",
                     task.cluster_bin_index, task.nfl_index, task.n_nfl_chunks)
            task_runner(task=task, ccs_file=ccs_file, nproc=nproc,
                        tmp_dir=tmp_dir)
            writer.write("ice_partial of cluster bin %s, nfl chunk %s/%s in "
                         "%s is DONE: %s\n" %
                         (task.cluster_bin_index, task.nfl_index,
                          task.n_nfl_chunks, task.cluster_out_dir,
                          task.nfl_pickle))
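The QV probe above can be factored into a small reusable helper. This is a sketch, not part of the existing API; it assumes the same pbcore.io ConsensusReadSet and reader methods already used above, and mirrors the runner's exact-set comparison:

from pbcore.io import ConsensusReadSet

REQUIRED_QVS = set(['SubstitutionQV', 'InsertionQV', 'DeletionQV'])

def ccs_has_required_qvs(ccs_file):
    """Sketch of the runner's QV check: True only if every BAM resource in
    the CCS dataset reports exactly the required pulse features. Callers
    would fall back to default probabilities (ccs_file=None) otherwise."""
    with ConsensusReadSet(ccs_file) as ds:
        for bam in ds.resourceReaders():
            if bam.pulseFeaturesAvailable() != REQUIRED_QVS:
                return False
    return True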
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all(isinstance(task, ClusterChunkTask) for task in p)
    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # Sanity check that cluster bin indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            execute("rm -rf %s" % real_upath(tmp_dir))

            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % real_upath(quivered_dir))
def resolved_tool_contract_runner(rtc):
    """Given a resolved tool contract, run ice_polish on each
    PolishChunkTask in the input pickle."""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all(isinstance(task, PolishChunkTask) for task in p)
    dummy_sentinel_file = rtc.task.input_files[1]
    subread_set = rtc.task.input_files[2]
    nproc = rtc.task.nproc
    tmp_dir = rtc.task.tmpdir_resources[0].path \
        if len(rtc.task.tmpdir_resources) > 0 else None

    with open(rtc.task.output_files[0], 'w') as writer:
        for task in p:
            log.info("Running ice_polish on cluster bin %d, polish chunk %d/%d",
                     task.cluster_bin_index, task.polish_index,
                     task.n_polish_chunks)
            log.debug("ice_quiver root_dir is %s", task.cluster_out_dir)
            log.debug("consensus_isoforms is %s", task.consensus_isoforms_file)
            task_runner(task=task, subread_set=subread_set, nproc=nproc,
                        tmp_dir=tmp_dir)
            writer.write("ice_polish of cluster bin %s, polish chunk %s/%s "
                         "in %s is DONE.\n" %
                         (task.cluster_bin_index, task.polish_index,
                          task.n_polish_chunks, task.cluster_out_dir))
def resolved_tool_contract_runner(rtc):
    """Run all ClusterChunkTask items in cluster_chunks.pickle given a
    resolved tool contract."""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all(isinstance(task, ClusterChunkTask) for task in p)
    ccs_file = rtc.task.input_files[1]
    assert op.exists(ccs_file)
    nproc = rtc.task.nproc
    use_finer_qv = False
    # if rtc.task.options.get(Constants.USE_FINER_QV_ID, False):
    #     use_finer_qv = True

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, task in enumerate(p):
            args = task_to_args(task=task, ccs_file=ccs_file,
                                nproc=nproc, use_finer_qv=use_finer_qv)
            log.info("ARGUMENTS of task %d/%d:\n%s", i, len(p), args)
            log.info("Running ICE on cluster bin %s", task.cluster_bin_index)
            PBTranscript(args, subCommand="cluster").start()
            writer.write("ICE of cluster bin %s in %s is DONE: %s\n" %
                         (task.cluster_bin_index, task.cluster_out_dir,
                          task.consensus_isoforms_file))
def run_main(cluster_chunks_pickle_file, ccs_file, output_json_file,
             max_nchunks):
    """Scatter items in cluster_chunks_pickle.

    Parameters:
      cluster_chunks_pickle_file -- ChunkTasksPickle of ClusterChunkTask objects
      ccs_file -- ccs.consensusreadset.xml
      output_json_file -- chunk.json
      max_nchunks -- maximum number of chunks
    """
    p = ChunkTasksPickle.read(cluster_chunks_pickle_file)
    assert all(isinstance(r, ClusterChunkTask) for r in p)
    out_dir = op.dirname(output_json_file)

    # Sort tasks and group them into at most max_nchunks groups.
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Write chunk.json.
    base_name = "spawned_cluster_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: ccs_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", cluster_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups, spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
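Each spawned pickle round-trips through the same ChunkTasksPickle interface, so a downstream task can read a group back directly (file name illustrative):

# Read one spawned group back -- same interface as the parent pickle.
spawned = ChunkTasksPickle.read("spawned_cluster_chunk_group_0.pickle")
assert all(isinstance(t, ClusterChunkTask) for t in spawned)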
def resolved_tool_contract_runner(rtc):
    """Given a resolved tool contract, merge the nfl pickles of each
    cluster bin into that bin's nfl_all_pickle_fn."""
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    p.sorted_by_attr(attr='cluster_bin_index')
    assert all(isinstance(task, PartialChunkTask) for task in p)

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, group in groupby(p, lambda x: x.cluster_bin_index):
            gs = list(group)
            nfl_pickles_of_bin_i = [g.nfl_pickle for g in gs]
            out_pickle = IceFiles(prog_name="",
                                  root_dir=gs[0].cluster_out_dir,
                                  no_log_f=True).nfl_all_pickle_fn
            log.info("Combining nfl pickles of cluster bin %s.", i)
            log.debug("nfl pickles are: %s.", ", ".join(nfl_pickles_of_bin_i))
            log.debug("Output merged nfl pickle is %s.", out_pickle)
            combine_nfl_pickles(splitted_pickles=nfl_pickles_of_bin_i,
                                out_pickle=out_pickle)
            writer.write("Merge nfl pickles of cluster bin %s DONE: %s\n" %
                         (i, out_pickle))
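itertools.groupby merges only adjacent equal keys, which is why the pickle is sorted by cluster_bin_index before grouping. A self-contained illustration:

from itertools import groupby

# Unsorted input splits bin 0 into two groups; sorting first yields one
# group per bin, matching the runner's sorted_by_attr call.
bins = [0, 1, 0]
assert [k for k, _ in groupby(bins)] == [0, 1, 0]
assert [k for k, _ in groupby(sorted(bins))] == [0, 1]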
def run_main(partial_chunks_pickle_file, sentinel_file, ccs_file,
             output_json_file, max_nchunks):
    """Spawn partial chunk tasks in pickle.

    Parameters:
      partial_chunks_pickle_file -- ChunkTasksPickle of PartialChunkTask objects
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks
      ccs_file -- ccs dataset
      output_json_file -- chunk.json
      max_nchunks -- maximum number of chunks
    """
    p = ChunkTasksPickle.read(partial_chunks_pickle_file)
    assert all(isinstance(r, PartialChunkTask) for r in p)
    out_dir = op.dirname(output_json_file)

    # Sort tasks and group them into at most max_nchunks groups.
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Write chunk.json.
    base_name = "spawned_partial_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: sentinel_file,
             Constants.CHUNK_KEYS[2]: ccs_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", partial_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
def run_main(polish_chunks_pickle_file, sentinel_file, subreads_file,
             output_json_file, max_nchunks):
    """Spawn polish chunk tasks in pickle.

    Parameters:
      polish_chunks_pickle_file -- ChunkTasksPickle of PolishChunkTask objects
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks
      subreads_file -- subreads dataset
      output_json_file -- chunk.json
      max_nchunks -- maximum number of chunks
    """
    p = ChunkTasksPickle.read(polish_chunks_pickle_file)
    assert all(isinstance(r, PolishChunkTask) for r in p)
    out_dir = op.dirname(output_json_file)

    # Sort tasks and group them into at most max_nchunks groups.
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Write chunk.json.
    base_name = "spawned_polish_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
             Constants.CHUNK_KEYS[1]: sentinel_file,
             Constants.CHUNK_KEYS[2]: subreads_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", polish_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
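A hypothetical invocation, scattering the five polish tasks from the earlier create_polish_pickle sketch into at most two groups (all paths illustrative):

# Spawns spawned_polish_chunk_group_0.pickle and ..._group_1.pickle next to
# chunk.json, and writes chunk.json referencing them plus the sentinel file
# and subread set.
run_main(polish_chunks_pickle_file="polish_chunks.pickle",
         sentinel_file="create_polish_done.txt",
         subreads_file="movie.subreadset.xml",
         output_json_file="tasks/polish_scatter/chunk.json",
         max_nchunks=2)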
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all(isinstance(task, PolishChunkTask) for task in p)
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])

    with open(rtc.task.output_files[0], 'w') as writer:
        for cluster_bin_index, cluster_out_dir in \
                p.sorted_no_redundant_cluster_bins():
            log.info("ice_quiver_postprocess of cluster bin index %s in %s.",
                     cluster_bin_index, cluster_out_dir)
            good_hq, bad_hq = ice_quiver_postprocess_a_cluster_bin(
                cluster_out_dir=cluster_out_dir, ipq_opts=ipq_opts)
            writer.write("ice_quiver_postprocess of cluster bin index %s in "
                         "%s DONE:\n%s\n%s\n" %
                         (cluster_bin_index, cluster_out_dir, good_hq, bad_hq))
def create_cluster_pickle(flnc_files, out_pickle):
    """Create cluster chunk task pickle.

    Parameters:
      flnc_files -- full-length non-chimeric files in bins
      out_pickle -- output pickle for saving ClusterChunkTask objects
    """
    n_bins = len(flnc_files)
    log.info("Writing %d cluster chunk tasks to %s.", n_bins, out_pickle)
    p = ChunkTasksPickle()
    for i, flnc_file in enumerate(flnc_files):
        log.debug("Processing cluster bin index=%d.", i)
        cluster_out_dir = _get_cluster_out_dir(flnc_file)
        # Create one cluster chunk task per bin.
        task_ = ClusterChunkTask(cluster_bin_index=i,
                                 flnc_file=flnc_file,
                                 cluster_out_dir=cluster_out_dir)
        p.append(task_)
    p.write(out_pickle)
    log.info("Saved %d cluster chunk tasks to %s.", n_bins, out_pickle)
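A sketch tying this creator to the cluster scatter run_main defined earlier (hypothetical paths; at most 24 spawned pickles):

# One ClusterChunkTask per flnc bin, then scatter for pbsmrtpipe.
flnc_files = ["cluster_bins/0/isoseq_flnc.contigset.xml",
              "cluster_bins/1/isoseq_flnc.contigset.xml"]
create_cluster_pickle(flnc_files, out_pickle="cluster_chunks.pickle")
run_main(cluster_chunks_pickle_file="cluster_chunks.pickle",
         ccs_file="movie.consensusreadset.xml",
         output_json_file="tasks/cluster_scatter/chunk.json",
         max_nchunks=24)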
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq.
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all(isinstance(task, ClusterChunkTask) for task in p)
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace(".contigset.xml", ".fasta")
    out_lq_fa = out_lq_cs.replace(".contigset.xml", ".fasta")

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # Sanity check that cluster bin indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  # HQ isoforms, fasta
    ln(combined_files.all_hq_fq, out_hq_fq)  # HQ isoforms, fastq
    ln(combined_files.all_lq_fa, out_lq_fa)  # LQ isoforms, fasta
    ln(combined_files.all_lq_fq, out_lq_fq)  # LQ isoforms, fastq
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(
        split_indices=cluster_bin_indices,
        split_files=split_consensus_isoforms,
        combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
        sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa,
       out_consensus_isoforms_fa)  # consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # cluster summary

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # cluster report