def _to_zmw_chunked_dataset_files(dataset_type, dataset_path,
                                  max_total_nchunks, chunk_key, dir_name,
                                  base_name, ext, extra_chunk_keys=None,
                                  extra_split_args=None):
    """
    Similar to to_chunked_subreadset_files, but chunks reads by ZMW ranges
    for input to pbccs or pbtranscript.
    """
    dset = dataset_type(dataset_path, strict=True)
    kwargs = {"chunks": max_total_nchunks, "zmws": True}
    if extra_split_args is not None:
        kwargs.update(extra_split_args)
    dset_chunks = dset.split(**kwargs)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        _add_chunked_tag_if_missing(dset)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        if extra_chunk_keys is not None:
            d.update(extra_chunk_keys)
        c = PipelineChunk(chunk_id, **d)
        yield c
def __to_chunked_fastx_files(write_records_func, pbcore_reader_class,
                             pbcore_writer_class, chunk_key, input_file,
                             max_total_nchunks, dir_name, base_name, ext,
                             extra_chunk_keys=None):
    """Convert a Fasta/Fastq file to a chunked list of files

    :param write_records_func: Func(writer_class, records, file_name)
    :param pbcore_reader_class: Pbcore IO Reader
    :param pbcore_writer_class: Pbcore IO Writer
    :param chunk_key: Chunk key to assign to PipelineChunk
    :param input_file: Path to input file
    """
    # grab the number of records so we can chunk it
    with pbcore_reader_class(input_file) as f:
        nrecords = __get_nrecords_from_reader(f)

    max_total_nchunks = max(1, min(nrecords, max_total_nchunks))
    n_per_chunk = int(math.ceil(float(nrecords) / max_total_nchunks))

    log.info("Found {n} total records. Max total chunks {m}. "
             "Splitting into chunks of approximately {x} records each"
             .format(n=nrecords, x=n_per_chunk, m=max_total_nchunks))

    nchunks = 0
    with pbcore_reader_class(input_file) as r:
        it = iter(r)
        for i in range(max_total_nchunks):
            records = []

            chunk_id = "_".join([base_name, str(nchunks)])
            chunk_name = ".".join([chunk_id, ext])
            nchunks += 1
            fasta_chunk_path = os.path.join(dir_name, chunk_name)

            # i never reaches max_total_nchunks, so the else branch is not
            # taken; the n_left bookkeeping below sizes the final chunk
            if i != max_total_nchunks:
                n_left = nrecords - (n_per_chunk * i)
                if n_left < 0 or (n_left == 0 and nchunks != 1):
                    break
                for _ in range(min(n_per_chunk, n_left)):
                    records.append(next(it))
            else:
                for x in it:
                    records.append(x)

            write_records_func(pbcore_writer_class, records, fasta_chunk_path)

            total_bases = sum(len(r.sequence) for r in records)
            d = dict(total_bases=total_bases, nrecords=len(records))
            d[chunk_key] = os.path.abspath(fasta_chunk_path)
            if extra_chunk_keys is not None:
                d.update(extra_chunk_keys)
            c = PipelineChunk(chunk_id, **d)
            yield c
def _to_barcode_chunked_dataset_files(dataset_type, dataset_path,
                                      max_total_nchunks, chunk_key, dir_name,
                                      base_name, ext, extra_chunk_keys=None):
    """
    Similar to to_chunked_subreadset_files, but chunks reads by barcode lists.
    """
    dset = dataset_type(dataset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, barcodes=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        _add_chunked_tag_if_missing(dset)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        if extra_chunk_keys is not None:
            for key, value in extra_chunk_keys.items():
                d[key] = value
        c = PipelineChunk(chunk_id, **d)
        yield c
def _to_bam_chunked_dataset_files(dataset_type, dataset_path,
                                  max_total_nchunks, chunk_key, dir_name,
                                  base_name, ext, extra_chunk_keys=None):
    """
    Similar to to_chunked_subreadset_files, but splits the dataset by its
    underlying BAM files rather than by ZMW ranges, for input to pbccs or
    pbtranscript.
    """
    dset = dataset_type(dataset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, zmws=False,
                             ignoreSubDatasets=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        if extra_chunk_keys is not None:
            d.update(extra_chunk_keys)
        c = PipelineChunk(chunk_id, **d)
        yield c
def __to_chunked_fastx_files(fastx_reader_klass, fastax_writer_klass,
                             chunk_key, fastx_path, max_total_nchunks,
                             dir_name, base_name, ext):
    """Convert a Fasta/Fastq file to a chunked list of files"""

    # grab the number of records so we can chunk it
    with fastx_reader_klass(fastx_path) as f:
        nrecords = __get_nrecords_from_reader(f)

    max_total_nchunks = min(nrecords, max_total_nchunks)

    # number of records in each of the first (max_total_nchunks - 1) chunks;
    # the final chunk picks up any remainder
    n = int(math.floor(float(nrecords) / max_total_nchunks))

    nchunks = 0
    with fastx_reader_klass(fastx_path) as r:
        it = iter(r)
        for i in xrange(max_total_nchunks):
            records = []

            chunk_id = "_".join([base_name, str(nchunks)])
            chunk_name = ".".join([chunk_id, ext])
            nchunks += 1
            fasta_chunk_path = os.path.join(dir_name, chunk_name)

            if i != max_total_nchunks - 1:
                for _ in xrange(n):
                    records.append(it.next())
            else:
                # last chunk drains whatever records remain
                for x in it:
                    records.append(x)

            write_fasta_records(fastax_writer_klass, records, fasta_chunk_path)
            total_bases = sum(len(r.sequence) for r in records)
            d = dict(total_bases=total_bases, nrecords=len(records))
            d[chunk_key] = os.path.abspath(fasta_chunk_path)
            c = PipelineChunk(chunk_id, **d)
            yield c
def run_main(subreads_file, isoforms_file, cluster_pickle_file,
             nfl_pickle_file, output_json, max_nchunks):
    log.info("Splitting {f} into {n} chunks".format(
        f=cluster_pickle_file, n=max_nchunks))
    uc = {}
    with open(cluster_pickle_file, 'rb') as f:
        a = cPickle.load(f)
        uc = a['uc']
    assert len(uc) > 0

    n_chunks = min(len(uc), max_nchunks)
    base_name = "cluster_chunk"
    dir_name = os.path.dirname(output_json)

    chunks = []
    for i in range(n_chunks):
        chunk_id = "_".join([base_name, str(i)])
        chunk_name = ".".join([chunk_id, "pickle"])
        chunk_pickle_file = os.path.join(dir_name, chunk_name)
        with open(chunk_pickle_file, 'wb') as f:
            cPickle.dump({
                '__chunk_i': i,
                '__chunk_n': n_chunks,
                'pickle_file': cluster_pickle_file,
            }, f)
        d = {
            '$chunk.subreadset_id': subreads_file,
            '$chunk.contigset_id': isoforms_file,
            '$chunk.nfl_pickle_id': nfl_pickle_file,
            '$chunk.pickle_id': chunk_pickle_file,
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    write_pipeline_chunks(chunks, output_json,
                          "created by pbtranscript.tasks.scatter_clusters")
    return 0
def to_chunked_grouped_fofn(fofn_groups, chunk_id_prefix, fofn_chunk_key,
                            report_chunk_key, chunk_dir_name):
    """
    :param fofn_groups: A list of FofnGroups
    :param chunk_id_prefix: Prefix used to create the chunk key and grouped
        Fofn files
    :param fofn_chunk_key: Value of the chunk key to write to the chunk file
        (e.g., $chunk.my_key)
    :param report_chunk_key: Chunk key under which the companion Fofn report
        path is stored
    :param chunk_dir_name: Directory where the grouped Fofn files will be
        written to

    :return: list of pipeline chunks
    """
    chunks = []
    for i, fofn_group in enumerate(fofn_groups):
        chunk_id = "_".join([chunk_id_prefix, str(i)])
        fofn_group_name = "".join([chunk_id, ".fofn"])
        fofn_group_path = os.path.join(chunk_dir_name, fofn_group_name)
        write_fofn(fofn_group, fofn_group_path)

        # Write the companion fofn metadata report
        fofn_report_name = "".join([chunk_id, "_report", '.json'])
        fofn_report_path = os.path.join(chunk_dir_name, fofn_report_name)
        fofn_report = fofn_to_report(len(fofn_group))
        fofn_report.write_json(fofn_report_path)

        d = dict(nfofns=len(fofn_group))
        d[fofn_chunk_key] = fofn_group_path
        d[report_chunk_key] = fofn_report_path

        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    return chunks
def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", ', '.join(fastq_files))

    # Writing chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {
            Constants.CHUNK_KEYS[0]: fastq_file,
            Constants.CHUNK_KEYS[1]: gmap_ref_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
def setUp(self):
    data_files = [op.join(DATA, fn) for fn in os.listdir(DATA)
                  if fn.startswith("summary")]
    chunks = [
        PipelineChunk(chunk_id="chunk_data_{i}".format(i=i),
                      **({self.CHUNK_KEY: fn}))
        for i, fn in enumerate(data_files)
    ]
    write_chunks_to_json(chunks, self.INPUT_FILES[0])
def _generate_chunk_json(self, data_files):
    chunks = [
        PipelineChunk(chunk_id="chunk_data_{i}".format(i=i),
                      **({self.CHUNK_KEY: fn}))
        for i, fn in enumerate(data_files)
    ]
    write_pipeline_chunks(chunks, self.INPUT_FILES[0], None)
def run_main(json_file, output_json_file, max_nchunks):
    """
    Split a json of scripts into multiple json files, each containing a
    subset of the scripts.

    Parameters:
      json_file -- json <- dict{p_id: args}, where
                   args <- dict{'script_fn': script_fn, ...}
      output_json_file -- chunk.json
    """
    a = json.load(open(json_file, 'r'))
    if len(a) == 0:
        raise ValueError("script json %s is empty" % json_file)

    out_dir = op.dirname(output_json_file)
    num_chunks = min(max_nchunks, len(a))
    num_scripts_in_chunks = num_items_in_chunks(num_items=len(a),
                                                num_chunks=num_chunks)

    # Writing chunk.json
    base_name = "spawned_json_w_scripts_chunk"
    chunks = []
    spawned_jsons = []
    p_ids = sorted(a.keys())

    for chunk_idx in range(0, num_chunks):
        chunk_id = "_".join([base_name, str(chunk_idx)])
        spawned_json_file = op.join(out_dir, chunk_id + ".json")

        # make a chunk
        d = {Constants.CHUNK_KEYS[0]: spawned_json_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

        # make content for the spawned json
        scripts_dict = dict()
        num_scripts = num_scripts_in_chunks[chunk_idx]
        for script_idx in range(0, num_scripts):
            p_id = p_ids[script_idx]
            scripts_dict[p_id] = a[p_id]

        # delete p_ids[0: num_scripts]
        p_ids = p_ids[num_scripts:]

        # Write scripts_dict, which is a dict of {p_id: args}, to the spawned json
        with open(spawned_json_file, 'w') as writer:
            writer.write(json.dumps(scripts_dict) + "\n")
        spawned_jsons.append(spawned_json_file)

    if len(p_ids) != 0:
        raise AssertionError("Scripts of p_ids %s are not scattered." %
                             repr(p_ids))

    log.info("Spawning %s into %d files", json_file, num_chunks)
    log.debug("Spawned files: %s.", ", ".join(spawned_jsons))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
def fofn_to_chunks(fofn):
    files = fofn_to_files(fofn)
    chunks = []
    for i, f in enumerate(files):
        chunk_id = "chunk-{i}".format(i=i)
        _d = {Constants.CHUNK_KEY_FOFN: f}
        p = PipelineChunk(chunk_id, **_d)
        chunks.append(p)
    return chunks
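# Usage sketch (hypothetical paths; assumes write_pipeline_chunks is importable
# in the caller's module, as it is elsewhere in this code): fofn_to_chunks gives
# one PipelineChunk per file in the FOFN, and a scatter task would then persist
# those chunks to chunk.json.
def example_scatter_fofn(fofn_path, output_json):
    chunks = fofn_to_chunks(fofn_path)
    write_pipeline_chunks(chunks, output_json,
                          "example scatter of %s" % fofn_path)
    return 0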
def setUpClass(cls):
    super(TestGatherH5ToolContract, cls).setUpClass()
    cls.makeInputs()
    chunks = [
        PipelineChunk(chunk_id="chunk_data_{i}".format(i=i),
                      **({cls.CHUNK_KEY: fn}))
        for i, fn in enumerate(cls.CHUNKED_FILES)
    ]
    write_pipeline_chunks(chunks, cls.INPUT_FILES[0], None)
def run_main(cluster_chunks_pickle_file, ccs_file, output_json_file,
             max_nchunks):
    """Scatter items in cluster_chunks_pickle

    Parameters:
      cluster_chunks_pickle_file -- ChunkTasksPickle of ClusterChunkTask objects
      ccs_file -- ccs.consensusreadset.xml
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(cluster_chunks_pickle_file)
    assert all([isinstance(r, ClusterChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_cluster_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {
            Constants.CHUNK_KEYS[0]: spawned_pickle_file,
            Constants.CHUNK_KEYS[1]: ccs_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", cluster_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups, spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    # n_chunks = len(p)
    # for i in range(0, n_chunks):
    #     chunk_id = "_".join([base_name, str(i)])
    #     spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
    #     d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
    #          Constants.CHUNK_KEYS[1]: ccs_file}
    #     c = PipelineChunk(chunk_id, **d)
    #     chunks.append(c)
    #     spawned_pickles.append(spawned_pickle_file)
    #
    # log.info("Spawning %s into %s files", cluster_chunks_pickle_file,
    #          str(n_chunks))
    # p.spawn_pickles(spawned_pickles)
    # log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
def to_chunked_hdfsubreadset_files(hdfsubreadset_path, max_total_nchunks,
                                   chunk_key, dir_name, base_name, ext):
    dset = HdfSubreadSet(hdfsubreadset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, ignoreSubDatasets=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        c = PipelineChunk(chunk_id, **d)
        yield c
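# Usage sketch (hypothetical paths, chunk key, and file extension; not part of
# the library code above): the to_chunked_*_files helpers are generators, so a
# scatter task typically drains one and hands the resulting PipelineChunk
# objects to write_pipeline_chunks.
def example_scatter_hdfsubreads(hdfsubreadset_path, output_json, nchunks=24):
    chunks = list(to_chunked_hdfsubreadset_files(
        hdfsubreadset_path, nchunks, "$chunk.hdf5subreadset_id",
        os.path.dirname(output_json), "chunk_hdfsubreadset",
        "hdfsubreadset.xml"))
    write_pipeline_chunks(chunks, output_json,
                          "example scatter of an HdfSubreadSet")
    return 0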
def setUpClass(cls):
    super(TextRecordsGatherBase, cls).setUpClass()
    json_file = cls.INPUT_FILES[0]
    base = ".".join(json_file.split(".")[:-2])
    chunks = []
    for i in range(2):
        file_name = "%s.%d.%s" % (base, i + 1, cls.EXTENSION)
        with open(file_name, 'w') as f:
            if cls.RECORD_HEADER is not None:
                f.write(cls.RECORD_HEADER)
            f.write("\n".join(cls.RECORDS[i * 2:(i + 1) * 2]))
            f.write("\n")  # XXX we need this for CSV gather
        d = {cls.CHUNK_KEY: op.abspath(file_name)}
        c = PipelineChunk("%s_%i" % (cls.EXTENSION, i + 1), **d)
        chunks.append(c)
    write_pipeline_chunks(chunks, json_file, None)
def _to_barcode_chunked_dataset_files(dataset_type, dataset_path,
                                      max_total_nchunks, chunk_key, dir_name,
                                      base_name, ext):
    """
    Similar to to_chunked_subreadset_files, but chunks reads by barcode lists.
    """
    dset = dataset_type(dataset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, barcodes=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        c = PipelineChunk(chunk_id, **d)
        yield c
def _to_chunked_dataset_files(dataset_type, dataset_path, reference_path,
                              max_total_nchunks, chunk_key, dir_name,
                              base_name, ext):
    dset = dataset_type(dataset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, ignoreSubDatasets=True)
    d = {}
    # sanity checking
    reference_set = ReferenceSet(reference_path)
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        d['$chunk.reference_id'] = reference_path
        c = PipelineChunk(chunk_id, **d)
        yield c
def chunk():
    # cmds is actually a list of small bash scripts, including linefeeds.
    cmds = get_daligner_job_descriptions(open(run_jobs_fn), db_prefix).values()
    if max_total_nchunks < len(cmds):
        log.debug("max_total_nchunks < # daligner cmds: %d < %d" % (
            max_total_nchunks, len(cmds)))
        cmds = joined_strs(cmds, max_total_nchunks)
    symlink_dazzdb(os.path.dirname(run_jobs_fn), db_prefix)
    for i, script in enumerate(cmds):
        chunk_id = '_'.join([chunk_base_name, str(i)])
        chunk_name = '.'.join([chunk_id, chunk_ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        script = xform_script(script)
        with open(chunk_path, 'w') as f:
            f.write(script)
        d = {}
        d[chunk_keys[1]] = os.path.abspath(chunk_path)
        d[chunk_keys[0]] = config_json_fn
        c = PipelineChunk(chunk_id, **d)
        yield c
def write_chunked_csv(chunk_key, csv_path, max_total_nchunks, dir_name,
                      base_name, ext):
    # This needs to have an ignore empty file mode
    with open(csv_path, 'r') as csv_fh:
        reader = csv.DictReader(csv_fh)
        field_names = reader.fieldnames
        nrecords = __get_nrecords_from_reader(reader)

    max_total_nchunks = min(nrecords, max_total_nchunks)

    # number of records in each of the first (max_total_nchunks - 1) chunks;
    # the final chunk picks up any remainder
    n = int(math.floor(float(nrecords) / max_total_nchunks))

    nchunks = 0
    with open(csv_path, 'r') as csv_fh:
        reader = csv.DictReader(csv_fh)
        it = iter(reader)
        for i in xrange(max_total_nchunks):
            chunk_id = "_".join([base_name, str(nchunks)])
            chunk_name = ".".join([chunk_id, ext])
            nchunks += 1
            nchunk_records = 0
            csv_chunk_path = os.path.join(dir_name, chunk_name)
            with open(csv_chunk_path, 'w+') as csv_chunk_fh:
                writer = csv.DictWriter(csv_chunk_fh, field_names)
                writer.writeheader()
                if i != max_total_nchunks - 1:
                    for _ in xrange(n):
                        nchunk_records += 1
                        writer.writerow(it.next())
                else:
                    # last chunk drains whatever rows remain
                    for x in it:
                        nchunk_records += 1
                        writer.writerow(x)
            d = dict(nrecords=nchunk_records)
            d[chunk_key] = os.path.abspath(csv_chunk_path)
            c = PipelineChunk(chunk_id, **d)
            yield c
def run_main(partial_chunks_pickle_file, sentinel_file, ccs_file,
             output_json_file, max_nchunks):
    """
    Spawn partial chunk tasks from a pickle.

    Parameters:
      partial_chunks_pickle_file -- ChunkTasksPickle of PartialChunkTask objects
      ccs_file -- ccs dataset
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(partial_chunks_pickle_file)
    assert all([isinstance(r, PartialChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_partial_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {
            Constants.CHUNK_KEYS[0]: spawned_pickle_file,
            Constants.CHUNK_KEYS[1]: sentinel_file,
            Constants.CHUNK_KEYS[2]: ccs_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", partial_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
def to_chunked_alignmentset_files(alignmentset_path, reference_path,
                                  max_total_nchunks, chunk_key, dir_name,
                                  base_name, ext):
    dset = AlignmentSet(alignmentset_path, strict=True)
    dset_chunks = dset.split(contigs=True, maxChunks=max_total_nchunks,
                             breakContigs=True)
    # sanity checking
    reference_set = ReferenceSet(reference_path, strict=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        d['$chunk.reference_id'] = reference_path
        c = PipelineChunk(chunk_id, **d)
        yield c
def load_pipeline_chunks_from_json(path):
    """Returns a list of Pipeline Chunks

    :rtype: list[PipelineChunk]
    """
    try:
        with open(path, 'r') as f:
            d = json.loads(f.read())

        chunks = []
        for cs in d['chunks']:
            chunk_id = cs['chunk_id']
            chunk_datum = cs['chunk']
            c = PipelineChunk(chunk_id, **chunk_datum)
            chunks.append(c)
        return chunks
    except Exception:
        msg = "Unable to load pipeline chunks from {f}".format(f=path)
        sys.stderr.write(msg + "\n")
        raise
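# Usage sketch (hypothetical file name and chunk key): the gather side reads
# chunk.json back with load_pipeline_chunks_from_json and pulls one datum per
# chunk by its chunk key, mirroring the pattern used in the run_main functions
# in this collection.
def example_gather_fastq_paths(chunk_json):
    chunks = load_pipeline_chunks_from_json(chunk_json)
    return get_datum_from_chunks_by_chunk_key(chunks, "$chunk.fastq_id")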
def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    # Check size of fastq_file before scattering, so that a meaningful
    # error message can be displayed instead of 'float division by zero'
    if os.stat(fastq_file).st_size == 0:
        raise IOError("Fastq file %s is empty, exiting." % fastq_file)

    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", ', '.join(fastq_files))

    # Writing chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {
            Constants.CHUNK_KEYS[0]: fastq_file,
            Constants.CHUNK_KEYS[1]: gmap_ref_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
def test_write_chunks(self):

    def f(i):
        return {
            "{c}movie_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX):
                "/path/to_movie-{i}.fofn".format(i=i),
            "{c}region_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX):
                "/path/rgn_{i}.fofn".format(i=i)
        }

    to_i = lambda i: "chunk-id-{i}".format(i=i)
    to_p = lambda i: PipelineChunk(to_i(i), **f(i))

    nchunks = 5
    pipeline_chunks = [to_p(i) for i in xrange(nchunks)]
    log.debug(pipeline_chunks)
    tmp_name = get_temp_file("_chunk.json")

    IO.write_pipeline_chunks(pipeline_chunks, tmp_name, "Example chunk file")

    pchunks = IO.load_pipeline_chunks_from_json(tmp_name)
    self.assertEquals(len(pchunks), nchunks)
def to_zmw_chunked_datastore_files(datastore_path, reference_path,
                                   max_total_nchunks, chunk_key, dir_name,
                                   base_name, ext):
    """
    datastore_path --- datastore.json file
    """
    datastorefile_objs, dataset_type_id, cls, dataset_ext = \
        datastore_to_datastorefile_objs(datastore_path)
    dset = cls(*[f.path for f in datastorefile_objs], strict=True)
    dset.newUuid()
    merged_dataset_xml = os.path.join(dir_name,
                                      base_name + '.merged.' + dataset_ext)
    dset.write(merged_dataset_xml)
    dset = cls(merged_dataset_xml, strict=True)

    kwargs = {"chunks": max_total_nchunks, "zmws": True}
    if cls == TranscriptSet:
        kwargs.update(TRANSCRIPTSET_EXTRA_SPLIT_ARGS)
    dset_chunks = dset.split(**kwargs)

    d = {}
    for i, _dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        # write chunk xml file, e.g., chunk_1.subreadset.xml
        chunk_dataset_path = os.path.abspath(
            os.path.join(dir_name, chunk_id + '.' + dataset_ext))
        _add_chunked_tag_if_missing(_dset)
        _dset.write(chunk_dataset_path)
        # write chunk datastore.json file
        chunk_datastore_path = os.path.abspath(
            os.path.join(dir_name, chunk_id + '.' + ext))
        dataset_to_datastore(chunk_dataset_path, chunk_datastore_path)
        d[chunk_key] = chunk_datastore_path
        d['$chunk.reference_id'] = reference_path
        c = PipelineChunk(chunk_id, **d)
        yield c
def to_p(i):
    return PipelineChunk(to_i(i), **f(i))