# Imports assumed by the snippets below (module paths inferred from the
# pbcommand/pbcoretools APIs in use; adjust to the local codebase).
# Names like Constants, CombinedFiles, ContigSet, concatenate_sam, etc.
# come from the surrounding task modules.
import os
import os.path as op
import logging

from pbcommand.models import PipelineChunk
from pbcommand.pb_io import (load_pipeline_chunks_from_json,
                             write_pipeline_chunks)
from pbcoretools.chunking import chunk_utils as CU
from pbcoretools.chunking.gather import get_datum_from_chunks_by_chunk_key

log = logging.getLogger(__name__)


def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json_file -- chunk.json
    """
    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # Get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", ', '.join(fastq_files))

    # Write chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {Constants.CHUNK_KEYS[0]: fastq_file,
             Constants.CHUNK_KEYS[1]: gmap_ref_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0

def run_main(chunk_json, contigset_output, chunk_key):
    """run main"""
    chunks = load_pipeline_chunks_from_json(chunk_json)
    # Allow looseness: accept chunk keys with or without the '$chunk.' prefix.
    if not chunk_key.startswith('$chunk.'):
        log.warning("Prepending '$chunk.' to chunk key '%s'", chunk_key)
        chunk_key = '$chunk.' + chunk_key
    fasta_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked consensus isoforms files are %s.",
              ', '.join(fasta_files))

    out_fa = CombinedFiles(
        combined_dir=op.dirname(contigset_output)).all_consensus_isoforms_fa
    log.info("Combining files to %s.", out_fa)
    combine_consensus_isoforms(split_indices=range(len(fasta_files)),
                               split_files=fasta_files,
                               combined_consensus_isoforms_fa=out_fa)

    log.info("Writing contigset %s", contigset_output)
    assert contigset_output.endswith('xml')
    as_contigset(out_fa, contigset_output)
    #cs = ContigSet(*fasta_files)
    #cs.newUuid()
    #cs.write(contigset_output)
    return 0

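# A minimal usage sketch for the gather task above. The chunk JSON path,
# output contigset path, and the 'fasta_id' chunk key are all hypothetical;
# the key is passed without its '$chunk.' prefix to show the looseness
# handling, which prepends it automatically.
run_main("cluster_out/cluster_chunks.json",
         "cluster_out/combined.contigset.xml",
         "fasta_id")
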
def _run_main(chunk_input_json, output_file, chunk_key):
    chunks = load_pipeline_chunks_from_json(chunk_input_json)
    chunked_files = []
    for chunk in chunks:
        if chunk_key in chunk.chunk_keys:
            chunked_files.append(chunk.chunk_d[chunk_key])
        else:
            raise KeyError("Unable to find chunk key '{i}' in {p}".format(
                i=chunk_key, p=chunk))
    return gather_kinetics_h5_byref(chunked_files, output_file)

def run_after(self, rtc, output_dir):
    json_file = rtc.task.output_files[0]
    chunks = load_pipeline_chunks_from_json(json_file)
    for chunk in chunks:
        d = chunk.chunk_d
        # The cluster pickle (file index 2) is chunked; the rest are not.
        self.assertNotEqual(d["$chunk.pickle_id"], self.INPUT_FILES[2])
        self.assertEqual(d["$chunk.subreadset_id"], self.INPUT_FILES[0])
        self.assertEqual(d["$chunk.contigset_id"], self.INPUT_FILES[1])
        self.assertEqual(d["$chunk.nfl_pickle_id"], self.INPUT_FILES[3])

def __gather_runner(func, chunk_input_json, output_file, chunk_key, **kwargs):
    chunks = load_pipeline_chunks_from_json(chunk_input_json)
    # Allow looseness: accept chunk keys with or without the '$chunk.' prefix.
    if not chunk_key.startswith('$chunk.'):
        log.warning("Prepending '$chunk.' to chunk key '{c}'".format(
            c=chunk_key))
        chunk_key = '$chunk.' + chunk_key
    chunked_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    _ = func(chunked_files, output_file, **kwargs)
    return 0

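# Illustration of the runner pattern above: any callable with the signature
# (chunked_files, output_file, **kwargs) can be plugged in. _cat_text_files
# and all file names here are made up for the example.
def _cat_text_files(chunked_files, output_file):
    # Naive gather: concatenate the chunked text files in input order.
    with open(output_file, "w") as out:
        for fn in chunked_files:
            with open(fn) as f:
                out.write(f.read())

__gather_runner(_cat_text_files, "chunks.json", "gathered.txt", "txt_id")
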
def run_after(self, rtc, output_dir):
    json_file = rtc.task.output_files[0]
    chunks = load_pipeline_chunks_from_json(json_file)
    windows = []
    for chunk in chunks:
        d = chunk.chunk_d
        chunked = d[self.CHUNK_KEYS[0]]
        with self.READER_CLASS(chunked, **self.READER_KWARGS) as ds:
            windows.append(ds.refWindows)
    self.assertEqual(windows, [
        [('lambda_NEB3011', 0, 24251)],
        [('lambda_NEB3011', 24251, 48502)],
    ])

def run_after(self, rtc, output_dir):
    gathered_file = rtc.task.output_files[0]
    chunks = load_pipeline_chunks_from_json(self.INPUT_FILES[0])
    n_rec = 0
    with self.READER_CLASS(gathered_file, **self.READER_KWARGS) as f:
        n_rec = len([r for r in f])
    n_rec_chunked = 0
    for chunk in chunks:
        d = chunk.chunk_d
        chunked = d[self.CHUNK_KEY]
        with self.READER_CLASS(chunked, **self.READER_KWARGS) as cs:
            n_rec_chunked += len([r for r in cs])
    self.assertEqual(n_rec_chunked, n_rec)

def run_after(self, rtc, output_dir):
    json_file = rtc.task.output_files[0]
    chunks = load_pipeline_chunks_from_json(json_file)
    n_rec = 0
    with ContigSet(self.INPUT_FILES[0]) as f:
        n_rec = len(f)
    n_rec_chunked = 0
    for chunk in chunks:
        d = chunk.chunk_d
        with ContigSet(d['$chunk.contigset_id']) as cs:
            n_rec_chunked += len([r for r in cs])
        self._check_unchunked_files(d)
    self.assertEqual(n_rec_chunked, n_rec)

def chunk_contigset(in_file, n_chunks, out_dir, out_chunk_json):
    """
    Chunk the input contigset into n_chunks under out_dir, write chunk
    info to out_chunk_json, and return the chunked files.
    """
    log.info("Splitting %s into %s chunks", in_file, str(n_chunks))
    CU.write_contigset_chunks_to_file(out_chunk_json, in_file, n_chunks,
                                      out_dir, "scattered-nfl",
                                      "contigset.xml")
    # Return the chunked files recorded in out_chunk_json.
    out_chunks = load_pipeline_chunks_from_json(out_chunk_json)
    chunked_files = get_datum_from_chunks_by_chunk_key(out_chunks,
                                                       '$chunk.contigset_id')
    log.info("Split files are %s\n", "\n".join(chunked_files))
    return chunked_files

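# Hypothetical call to chunk_contigset; every path below is a placeholder.
chunked_files = chunk_contigset(
    in_file="isoseq_nfl.contigset.xml",
    n_chunks=8,
    out_dir="scatter_out",
    out_chunk_json="scatter_out/nfl.chunks.json")
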
def run_main(chunk_json, sam_output, chunk_key):
    """run main"""
    chunks = load_pipeline_chunks_from_json(chunk_json)
    # Allow looseness: accept chunk keys with or without the '$chunk.' prefix.
    if not chunk_key.startswith('$chunk.'):
        log.warning("Prepending '$chunk.' to chunk key '%s'", chunk_key)
        chunk_key = '$chunk.' + chunk_key
    sam_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked SAM files are %s.", ', '.join(sam_files))
    log.info("Concatenating chunked SAM files to %s.", sam_output)
    concatenate_sam(sam_files, sam_output)
    return 0

def run_after(self, rtc, output_dir):
    unchunked = self.INPUT_FILES[0]
    json_file = rtc.task.output_files[0]
    chunks = load_pipeline_chunks_from_json(json_file)
    if self.NCHUNKS_EXPECTED is not None:
        self.assertEqual(len(chunks), self.NCHUNKS_EXPECTED)
    n_rec = 0
    with self.READER_CLASS(unchunked, **self.READER_KWARGS) as f:
        n_rec = len([rec for rec in f])
    self.assertTrue(n_rec > 0)
    n_rec_chunked = 0
    for chunk in chunks:
        d = chunk.chunk_d
        chunked = d[self.CHUNK_KEYS[0]]
        with self.READER_CLASS(chunked, **self.READER_KWARGS) as cs:
            n_rec_chunk = len([rec for rec in cs])
            self.assertTrue(n_rec_chunk > 0)
            n_rec_chunked += n_rec_chunk
    self.assertEqual(n_rec_chunked, n_rec)

def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json_file -- chunk.json
    """
    # Check the size of fastq_file before scattering, so that a meaningful
    # error message is raised instead of 'float division by zero'.
    if os.stat(fastq_file).st_size == 0:
        raise IOError("Fastq file %s is empty, exiting." % fastq_file)

    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # Get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", ', '.join(fastq_files))

    # Write chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {Constants.CHUNK_KEYS[0]: fastq_file,
             Constants.CHUNK_KEYS[1]: gmap_ref_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0

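# A sketch of driving the scatter task above; the file names and chunk
# count are assumptions, and Constants.CHUNK_KEYS is supplied by the
# surrounding task module.
run_main(fastq_file="hq_isoforms.fastq",
         gmap_ref_file="gmap_referenceset.xml",
         output_json_file="map_isoforms.chunks.json",
         max_nchunks=24)
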
def run_main(chunk_json, sam_output, chunk_key):
    """run main"""
    chunks = load_pipeline_chunks_from_json(chunk_json)
    # Allow looseness: accept chunk keys with or without the '$chunk.' prefix.
    if not chunk_key.startswith('$chunk.'):
        log.warning("Prepending '$chunk.' to chunk key '%s'", chunk_key)
        chunk_key = '$chunk.' + chunk_key
    sam_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    log.debug("Chunked SAM files are %s.", ', '.join(sam_files))
    log.info("Concatenating chunked SAM files to %s.", sam_output)
    # Concatenate the chunked SAM files ...
    unsorted_sam_output = sam_output + ".unsorted.sam"
    concatenate_sam(sam_files, unsorted_sam_output)
    # ... then sort ...
    sort_sam(unsorted_sam_output, sam_output)
    # ... and remove the intermediate file.
    rmpath(unsorted_sam_output)
    return 0

def _validate_chunk_json_file(path):
    # Loading raises if the chunk JSON is malformed; the parsed chunks
    # themselves are not needed here.
    load_pipeline_chunks_from_json(path)
    return path

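# A validator with this shape (take a path, raise on bad input, return the
# path unchanged) is a natural fit for an argparse 'type' callable; the
# parser below is a hedged sketch, not taken from the source.
import argparse

parser = argparse.ArgumentParser()
# argparse invokes the validator on the raw argument and stores its return
# value, so a malformed chunk JSON fails at parse time with a clear error.
parser.add_argument("chunk_json", type=_validate_chunk_json_file)
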
def run(chunk_input_json, output_file, chunk_key):
    chunks = load_pipeline_chunks_from_json(chunk_input_json)
    chunked_files = get_datum_from_chunks_by_chunk_key(chunks, chunk_key)
    _ = combine_nfl_pickles(chunked_files, output_file)
    return 0