Code example #1
File: chunk_utils.py Project: MShaffar19/pbcoretools
def _to_zmw_chunked_dataset_files(dataset_type,
                                  dataset_path,
                                  max_total_nchunks,
                                  chunk_key,
                                  dir_name,
                                  base_name,
                                  ext,
                                  extra_chunk_keys=None,
                                  extra_split_args=None):
    """
    Similar to to_chunked_subreadset_files, but chunks reads by ZMW ranges
    for input to pbccs or pbtranscript.
    """
    dset = dataset_type(dataset_path, strict=True)
    kwargs = {"chunks": max_total_nchunks, "zmws": True}
    if extra_split_args is not None:
        kwargs.update(extra_split_args)
    dset_chunks = dset.split(**kwargs)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        _add_chunked_tag_if_missing(dset)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        if extra_chunk_keys is not None:
            d.update(extra_chunk_keys)
        c = PipelineChunk(chunk_id, **d)
        yield c
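The generators above only yield PipelineChunk objects; the caller is expected to collect them and serialize them into a chunk.json. A minimal, hypothetical driver sketch (the dataset class, file paths, chunk key, and the import location of write_pipeline_chunks are assumptions for illustration, not taken from the project):

from pbcore.io import SubreadSet
from pbcommand.pb_io.common import write_pipeline_chunks  # import path is an assumption

# Drain the generator into a list of PipelineChunk objects, then write chunk.json.
chunks = list(_to_zmw_chunked_dataset_files(
    dataset_type=SubreadSet,
    dataset_path="movie.subreadset.xml",   # made-up input path
    max_total_nchunks=8,
    chunk_key="$chunk.subreadset_id",
    dir_name=".",
    base_name="subreadset_chunk",
    ext="subreadset.xml"))
write_pipeline_chunks(chunks, "chunk.json", "written by a hypothetical driver")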
Code example #2
File: chunk_utils.py Project: MShaffar19/pbcoretools
def __to_chunked_fastx_files(write_records_func,
                             pbcore_reader_class,
                             pbcore_writer_class,
                             chunk_key,
                             input_file,
                             max_total_nchunks,
                             dir_name,
                             base_name,
                             ext,
                             extra_chunk_keys=None):
    """Convert a Fasta/Fasta file to a chunked list of files

    :param write_records_func: Func(writer_class, records, file_name)
    :param pbcore_reader_class Pbcore IO Reader
    :param pbcore_writer_class Pbcore IO Writer
    :param chunk_key: Chunk key to assign to PipelineChunk
    :param input_file: Path to input file


    """
    # grab the number of records so we can chunk it
    with pbcore_reader_class(input_file) as f:
        nrecords = __get_nrecords_from_reader(f)

    max_total_nchunks = max(1, min(nrecords, max_total_nchunks))

    n_per_chunk = int(math.ceil(float(nrecords) / max_total_nchunks))

    log.info(
        "Found {n} total records. Max total chunks {m}. Splitting into chunks of approximately {x} records each"
        .format(n=nrecords, x=n_per_chunk, m=max_total_nchunks))
    nchunks = 0
    with pbcore_reader_class(input_file) as r:
        it = iter(r)
        for i in range(max_total_nchunks):
            records = []

            chunk_id = "_".join([base_name, str(nchunks)])
            chunk_name = ".".join([chunk_id, ext])
            nchunks += 1
            fasta_chunk_path = os.path.join(dir_name, chunk_name)

            if i != max_total_nchunks:
                n_left = nrecords - (n_per_chunk * i)
                if n_left < 0 or (n_left == 0 and nchunks != 1):
                    break
                for _ in range(min(n_per_chunk, n_left)):
                    records.append(next(it))
            else:
                for x in it:
                    records.append(x)

            write_records_func(pbcore_writer_class, records, fasta_chunk_path)
            total_bases = sum(len(r.sequence) for r in records)
            d = dict(total_bases=total_bases, nrecords=len(records))
            d[chunk_key] = os.path.abspath(fasta_chunk_path)
            if extra_chunk_keys is not None:
                d.update(extra_chunk_keys)
            c = PipelineChunk(chunk_id, **d)
            yield c
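For reference, the arithmetic above puts ceil(nrecords / max_total_nchunks) records into each chunk, with the final chunk absorbing whatever remains. A small self-contained sketch of just that calculation (not project code):

import math

def chunk_sizes(nrecords, max_total_nchunks):
    # Reproduces the per-chunk record counts computed by __to_chunked_fastx_files.
    max_total_nchunks = max(1, min(nrecords, max_total_nchunks))
    n_per_chunk = int(math.ceil(float(nrecords) / max_total_nchunks))
    sizes = []
    for i in range(max_total_nchunks):
        n_left = nrecords - n_per_chunk * i
        if n_left <= 0:
            break
        sizes.append(min(n_per_chunk, n_left))
    return sizes

# 10 records into at most 3 chunks -> chunks of 4, 4 and 2 records
assert chunk_sizes(10, 3) == [4, 4, 2]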
Code example #3
File: chunk_utils.py Project: MShaffar19/pbcoretools
def _to_barcode_chunked_dataset_files(dataset_type,
                                      dataset_path,
                                      max_total_nchunks,
                                      chunk_key,
                                      dir_name,
                                      base_name,
                                      ext,
                                      extra_chunk_keys=None):
    """
    Similar to to_chunked_subreadset_files, but chunks reads by barcode lists.
    """
    dset = dataset_type(dataset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, barcodes=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        _add_chunked_tag_if_missing(dset)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        if extra_chunk_keys is not None:
            for key, value in extra_chunk_keys.items():
                d[key] = value
        c = PipelineChunk(chunk_id, **d)
        yield c
Code example #4
def _to_bam_chunked_dataset_files(dataset_type,
                                  dataset_path,
                                  max_total_nchunks,
                                  chunk_key,
                                  dir_name,
                                  base_name,
                                  ext,
                                  extra_chunk_keys=None):
    """
    Similar to to_chunked_subreadset_files, but chunks reads by ZMW ranges
    for input to pbccs or pbtranscript.
    """
    dset = dataset_type(dataset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks,
                             zmws=False,
                             ignoreSubDatasets=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        if extra_chunk_keys is not None:
            d.update(extra_chunk_keys)
        c = PipelineChunk(chunk_id, **d)
        yield c
Code example #5
def __to_chunked_fastx_files(fastx_reader_klass, fastax_writer_klass, chunk_key, fastx_path, max_total_nchunks, dir_name, base_name, ext):
    """Convert a Fasta/Fasta file to a chunked list of files"""

    # grab the number of records so we can chunk it
    with fastx_reader_klass(fastx_path) as f:
        nrecords = __get_nrecords_from_reader(f)

    max_total_nchunks = min(nrecords, max_total_nchunks)

    n = int(math.ceil(float(nrecords) / max_total_nchunks))

    nchunks = 0
    with fastx_reader_klass(fastx_path) as r:
        it = iter(r)
        for i in xrange(max_total_nchunks):
            records = []

            chunk_id = "_".join([base_name, str(nchunks)])
            chunk_name = ".".join([chunk_id, ext])
            nchunks += 1
            fasta_chunk_path = os.path.join(dir_name, chunk_name)

            if i != max_total_nchunks - 1:
                for _ in xrange(n):
                    records.append(it.next())
            else:
                for x in it:
                    records.append(x)

            write_fasta_records(fastax_writer_klass, records, fasta_chunk_path)
            total_bases = sum(len(r.sequence) for r in records)
            d = dict(total_bases=total_bases, nrecords=len(records))
            d[chunk_key] = os.path.abspath(fasta_chunk_path)
            c = PipelineChunk(chunk_id, **d)
            yield c
Code example #6
def run_main(subreads_file, isoforms_file, cluster_pickle_file,
             nfl_pickle_file, output_json, max_nchunks):
    log.info("Running {f} into {n} chunks".format(f=cluster_pickle_file,
                                                  n=max_nchunks))
    uc = {}
    with open(cluster_pickle_file, 'rb') as f:
        a = cPickle.load(f)
        uc = a['uc']
    assert len(uc) > 0
    n_chunks = min(len(uc), max_nchunks)
    base_name = "cluster_chunk"
    dir_name = os.path.dirname(output_json)
    chunks = []
    for i in range(n_chunks):
        chunk_id = "_".join([base_name, str(i)])
        chunk_name = ".".join([chunk_id, "pickle"])
        chunk_pickle_file = os.path.join(dir_name, chunk_name)
        with open(chunk_pickle_file, 'wb') as f:
            cPickle.dump({
                '__chunk_i': i,
                '__chunk_n': n_chunks,
                'pickle_file': cluster_pickle_file,
            }, f)
        d = {
            '$chunk.subreadset_id': subreads_file,
            '$chunk.contigset_id': isoforms_file,
            '$chunk.nfl_pickle_id': nfl_pickle_file,
            '$chunk.pickle_id': chunk_pickle_file,
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
    write_pipeline_chunks(chunks, output_json,
        "created by pbtranscript.tasks.scatter_clusters")
    return 0
Code example #7
def to_chunked_grouped_fofn(fofn_groups, chunk_id_prefix, fofn_chunk_key,
                            report_chunk_key, chunk_dir_name):
    """

    :param fofn_groups: A list of FofnGroups
    :param chunk_id_prefix: Prefix used to create the chunk key and grouped
    Fofn files
    :param fofn_chunk_key: Value of the chunk key to write to the chunk file (e.g., $chunk.my_key)
    :param chunk_dir_name: Directory where the Grouped Fofn files will be
    written to
    :return: list of pipeline chunks
    """

    chunks = []
    for i, fofn_group in enumerate(fofn_groups):
        chunk_id = "_".join([chunk_id_prefix, str(i)])
        fofn_group_name = "".join([chunk_id, ".fofn"])
        fofn_group_path = os.path.join(chunk_dir_name, fofn_group_name)

        write_fofn(fofn_group, fofn_group_path)

        # Write the companion fofn metadata report
        fofn_report_name = "".join([chunk_id, "_report", '.json'])
        fofn_report_path = os.path.join(chunk_dir_name, fofn_report_name)
        fofn_report = fofn_to_report(len(fofn_group))
        fofn_report.write_json(fofn_report_path)

        d = dict(nfofns=len(fofn_group))
        d[fofn_chunk_key] = fofn_group_path
        d[report_chunk_key] = fofn_report_path

        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    return chunks
Code example #8
def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json -- chunk.json
    """
    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", (', '.join(fastq_files)))

    # Writing chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {
            Constants.CHUNK_KEYS[0]: fastq_file,
            Constants.CHUNK_KEYS[1]: gmap_ref_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Code example #9
 def setUp(self):
     data_files = [op.join(DATA, fn) for fn in os.listdir(DATA)
                   if fn.startswith("summary")]
     chunks = [PipelineChunk(chunk_id="chunk_data_{i}".format(i=i),
                             **({self.CHUNK_KEY:fn}))
               for i, fn in enumerate(data_files)]
     write_chunks_to_json(chunks, self.INPUT_FILES[0])
Code example #10
 def _generate_chunk_json(self, data_files):
     chunks = [
         PipelineChunk(chunk_id="chunk_data_{i}".format(i=i),
                       **({
                           self.CHUNK_KEY: fn
                       })) for i, fn in enumerate(data_files)
     ]
     write_pipeline_chunks(chunks, self.INPUT_FILES[0], None)
Code example #11
def run_main(json_file, output_json_file, max_nchunks):
    """
    Spawn a json with scripts into multiple json files each containing a script.
    Parameters:
      json_file -- json <- dict{p_id: args}, where args <- dict{'script_fn': script_fn, ...}
      output_json -- chunk.json
    """
    a = json.load(open(json_file, 'r'))

    if len(a) == 0:
        raise ValueError("script json %s is empty" % json_file)
    out_dir = op.dirname(output_json_file)

    num_chunks = min(max_nchunks, len(a))
    num_scripts_in_chunks = num_items_in_chunks(num_items=len(a),
                                                num_chunks=num_chunks)

    # Writing chunk.json
    base_name = "spawned_json_w_scripts_chunk"
    chunks = []
    spawned_jsons = []

    p_ids = sorted(a.keys())
    for chunk_idx in range(0, num_chunks):
        chunk_id = "_".join([base_name, str(chunk_idx)])
        spawned_json_file = op.join(out_dir, chunk_id + ".json")
        # make a chunk
        d = {Constants.CHUNK_KEYS[0]: spawned_json_file}
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

        # make content for the spawned json
        scripts_dict = dict()
        num_scripts = num_scripts_in_chunks[chunk_idx]
        for script_idx in range(0, num_scripts):
            p_id = p_ids[script_idx]
            scripts_dict[p_id] = a[p_id]

        # delete p_ids[0: num_scripts]
        p_ids = p_ids[num_scripts:]

        # Write script_dict, which is a dict of {p_id: args} to spawned json
        with open(spawned_json_file, 'w') as writer:
            writer.write(json.dumps(scripts_dict) + "\n")

        spawned_jsons.append(spawned_json_file)

    if len(p_ids) != 0:
        raise AssertionError("Scripts of p_ids %s are not scattered." %
                             repr(p_ids))

    log.info("Spawning %s into %d files", json_file, num_chunks)
    log.debug("Spawned files: %s.", ", ".join(spawned_jsons))
    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
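The helper num_items_in_chunks is not shown on this page; from the way it is called above, it should return a list of length num_chunks whose entries sum to num_items. A plausible stand-in under that assumption (the real helper may distribute items differently):

def num_items_in_chunks(num_items, num_chunks):
    # Assumed contract, inferred from the caller above: spread num_items over
    # num_chunks as evenly as possible, front-loading the remainder.
    base, extra = divmod(num_items, num_chunks)
    return [base + 1 if i < extra else base for i in range(num_chunks)]

assert sum(num_items_in_chunks(7, 3)) == 7  # e.g. [3, 2, 2]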
Code example #12
def fofn_to_chunks(fofn):
    files = fofn_to_files(fofn)
    chunks = []
    for i, f in enumerate(files):
        chunk_id = "chunk-{i}".format(i=i)
        _d = {Constants.CHUNK_KEY_FOFN: f}
        p = PipelineChunk(chunk_id, **_d)
        chunks.append(p)
    return chunks
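fofn_to_files is also not shown here; a FOFN is a plain file of file names, so a minimal stand-in (the real helper may differ in details) could be:

def fofn_to_files(fofn):
    # Assumed behavior: one file path per non-empty line of the FOFN.
    with open(fofn) as f:
        return [line.strip() for line in f if line.strip()]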
Code example #13
File: test_gather_h5.py Project: khjia/kineticsTools
 def setUpClass(cls):
     super(TestGatherH5ToolContract, cls).setUpClass()
     cls.makeInputs()
     chunks = [
         PipelineChunk(chunk_id="chunk_data_{i}".format(i=i),
                       **({
                           cls.CHUNK_KEY: fn
                       })) for i, fn in enumerate(cls.CHUNKED_FILES)
     ]
     write_pipeline_chunks(chunks, cls.INPUT_FILES[0], None)
Code example #14
def run_main(cluster_chunks_pickle_file, ccs_file, output_json_file,
             max_nchunks):
    """Scatter items in cluster_chunks_pickle
    Parameters:
      cluster_chunks_pickle_file -- ChunkTasksPickle of ClusterChunkTask objects.
      ccs_file -- ccs.consensusreadset.xml
      output_json_file -- chunk.json
      max_nchunks -- maximum # of chunks
    """
    p = ChunkTasksPickle.read(cluster_chunks_pickle_file)
    assert all([isinstance(r, ClusterChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_cluster_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {
            Constants.CHUNK_KEYS[0]: spawned_pickle_file,
            Constants.CHUNK_KEYS[1]: ccs_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", cluster_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups, spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    #    n_chunks = len(p)
    #    for i in range(0, n_chunks):
    #        chunk_id = "_".join([base_name, str(i)])
    #        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
    #        d = {Constants.CHUNK_KEYS[0]: spawned_pickle_file,
    #             Constants.CHUNK_KEYS[1]: ccs_file}
    #        c = PipelineChunk(chunk_id, **d)
    #        chunks.append(c)
    #        spawned_pickles.append(spawned_pickle_file)
    #
    #    log.info("Spawning %s into %s files", cluster_chunks_pickle_file, str(n_chunks))
    #    p.spawn_pickles(spawned_pickles)
    #    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Code example #15
def to_chunked_hdfsubreadset_files(hdfsubreadset_path, max_total_nchunks,
                                   chunk_key, dir_name, base_name, ext):
    dset = HdfSubreadSet(hdfsubreadset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, ignoreSubDatasets=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        c = PipelineChunk(chunk_id, **d)
        yield c
Code example #16
 def setUpClass(cls):
     super(TextRecordsGatherBase, cls).setUpClass()
     json_file = cls.INPUT_FILES[0]
     base = ".".join(json_file.split(".")[:-2])
     chunks = []
     for i in range(2):
         file_name = "%s.%d.%s" % (base, i + 1, cls.EXTENSION)
         with open(file_name, 'w') as f:
             if cls.RECORD_HEADER is not None:
                 f.write(cls.RECORD_HEADER)
             f.write("\n".join(cls.RECORDS[i * 2:(i + 1) * 2]))
             f.write("\n")  # XXX we need this for CSV gather
         d = {cls.CHUNK_KEY: op.abspath(file_name)}
         c = PipelineChunk("%s_%i" % (cls.EXTENSION, i + 1), **d)
         chunks.append(c)
     write_pipeline_chunks(chunks, json_file, None)
Code example #17
def _to_barcode_chunked_dataset_files(dataset_type, dataset_path,
                                      max_total_nchunks, chunk_key, dir_name,
                                      base_name, ext):
    """
    Similar to to_chunked_subreadset_files, but chunks reads by barcode lists.
    """
    dset = dataset_type(dataset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, barcodes=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        c = PipelineChunk(chunk_id, **d)
        yield c
Code example #18
def _to_chunked_dataset_files(dataset_type, dataset_path, reference_path,
                              max_total_nchunks, chunk_key, dir_name,
                              base_name, ext):
    dset = dataset_type(dataset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, ignoreSubDatasets=True)
    d = {}

    # sanity checking
    reference_set = ReferenceSet(reference_path)
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        d['$chunk.reference_id'] = reference_path
        c = PipelineChunk(chunk_id, **d)
        yield c
Code example #19
File: chunk.py Project: dayedepps/FALCON-pbsmrtpipe
 def chunk():
     # cmds is actually a list of small bash scripts, including linefeeds.
     cmds = get_daligner_job_descriptions(open(run_jobs_fn), db_prefix).values()
     if max_total_nchunks < len(cmds):
         log.debug("max_total_nchunks < # daligner cmds: %d < %d" %(
             max_total_nchunks, len(cmds)))
         cmds = joined_strs(cmds, max_total_nchunks)
     symlink_dazzdb(os.path.dirname(run_jobs_fn), db_prefix)
     for i, script in enumerate(cmds):
         chunk_id = '_'.join([chunk_base_name, str(i)])
         chunk_name = '.'.join([chunk_id, chunk_ext])
         chunk_path = os.path.join(dir_name, chunk_name)
         script = xform_script(script)
         open(chunk_path, 'w').write(script)
         d = {}
         d[chunk_keys[1]] = os.path.abspath(chunk_path)
         d[chunk_keys[0]] = config_json_fn
         c = PipelineChunk(chunk_id, **d)
         yield c
Code example #20
def write_chunked_csv(chunk_key, csv_path, max_total_nchunks, dir_name,
                      base_name, ext):
    # This needs to have an ignore-empty-file mode

    with open(csv_path, 'r') as csv_fh:
        reader = csv.DictReader(csv_fh)
        field_names = reader.fieldnames
        nrecords = __get_nrecords_from_reader(reader)

    max_total_nchunks = min(nrecords, max_total_nchunks)

    n = int(math.ceil(float(nrecords) / max_total_nchunks))

    nchunks = 0
    with open(csv_path, 'r') as csv_fh:
        reader = csv.DictReader(csv_fh)

        it = iter(reader)
        for i in xrange(max_total_nchunks):

            chunk_id = "_".join([base_name, str(nchunks)])
            chunk_name = ".".join([chunk_id, ext])
            nchunks += 1
            nchunk_records = 0
            csv_chunk_path = os.path.join(dir_name, chunk_name)

            with open(csv_chunk_path, 'w+') as csv_chunk_fh:
                writer = csv.DictWriter(csv_chunk_fh, field_names)
                writer.writeheader()
                if i != max_total_nchunks - 1:
                    for _ in xrange(n):
                        nchunk_records += 1
                        writer.writerow(it.next())
                else:
                    for x in it:
                        nchunk_records += 1
                        writer.writerow(x)

            d = dict(nrecords=nchunk_records)
            d[chunk_key] = os.path.abspath(csv_chunk_path)
            c = PipelineChunk(chunk_id, **d)
            yield c
Code example #21
def run_main(partial_chunks_pickle_file, sentinel_file, ccs_file,
             output_json_file, max_nchunks):
    """
    Spawn partial Chunk Tasks in pickle.
    Parameters:
      partial_chunks_pickle_file -- ChunkTasksPickle of PartialChunkTask objects
      ccs_file -- ccs dataset
      sentinel_file -- sentinel file to connect pbsmrtpipe tasks
      output_json -- chunk.json
    """
    p = ChunkTasksPickle.read(partial_chunks_pickle_file)
    assert all([isinstance(r, PartialChunkTask) for r in p])
    out_dir = op.dirname(output_json_file)

    # sort and group tasks
    groups = p.sort_and_group_tasks(max_nchunks=max_nchunks)

    # Writing chunk.json
    base_name = "spawned_partial_chunk"
    chunks = []
    spawned_pickles = []
    for group_index in range(0, len(groups)):
        chunk_id = "_".join([base_name, 'group', str(group_index)])
        spawned_pickle_file = op.join(out_dir, chunk_id + ".pickle")
        d = {
            Constants.CHUNK_KEYS[0]: spawned_pickle_file,
            Constants.CHUNK_KEYS[1]: sentinel_file,
            Constants.CHUNK_KEYS[2]: ccs_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)
        spawned_pickles.append(spawned_pickle_file)

    log.info("Spawning %s into %d files", partial_chunks_pickle_file,
             len(groups))
    p.spawn_pickles_by_groups(groups=groups, out_pickle_fns=spawned_pickles)
    log.debug("Spawned files: %s.", ", ".join(spawned_pickles))

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Code example #22
def to_chunked_alignmentset_files(alignmentset_path, reference_path,
                                  max_total_nchunks, chunk_key, dir_name,
                                  base_name, ext):
    dset = AlignmentSet(alignmentset_path, strict=True)
    dset_chunks = dset.split(contigs=True,
                             maxChunks=max_total_nchunks,
                             breakContigs=True)

    # sanity checking
    reference_set = ReferenceSet(reference_path, strict=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        d['$chunk.reference_id'] = reference_path
        c = PipelineChunk(chunk_id, **d)
        yield c
Code example #23
File: common.py Project: mpkocher/pbcommand
def load_pipeline_chunks_from_json(path):
    """Returns a list of Pipeline Chunks


    :rtype: list[PipelineChunk]
    """

    try:
        with open(path, 'r') as f:
            d = json.loads(f.read())

        chunks = []
        for cs in d['chunks']:
            chunk_id = cs['chunk_id']
            chunk_datum = cs['chunk']
            c = PipelineChunk(chunk_id, **chunk_datum)
            chunks.append(c)
        return chunks
    except Exception:
        msg = "Unable to load pipeline chunks from {f}".format(f=path)
        sys.stderr.write(msg + "\n")
        raise
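The JSON layout this loader expects can be read off the keys it accesses: a top-level "chunks" list whose entries carry a "chunk_id" and a "chunk" mapping of chunk keys to datum values. A minimal hand-written example (file names are made up):

import json

example = {
    "chunks": [
        {"chunk_id": "chunk_0",
         "chunk": {"$chunk.fastq_id": "/tmp/scattered-fastq_0.fastq"}},
        {"chunk_id": "chunk_1",
         "chunk": {"$chunk.fastq_id": "/tmp/scattered-fastq_1.fastq"}},
    ]
}

# Only the keys read by load_pipeline_chunks_from_json are included here; a
# chunk.json written by write_pipeline_chunks may carry additional metadata.
with open("example_chunk.json", "w") as f:
    json.dump(example, f, indent=2)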
Code example #24
def run_main(fastq_file, gmap_ref_file, output_json_file, max_nchunks):
    """
    Parameters:
      fastq_file -- HQ isoforms in FASTQ
      gmap_ref_file -- GMAP reference set xml
      output_json -- chunk.json
    """
    # Check size of fastq_file before scattering, so that a meaningful
    # error message can be displayed instead of 'float division by zero'
    if os.stat(fastq_file).st_size == 0:
        raise IOError("Fastq file %s is empty, exiting." % fastq_file)

    # Chunk FASTQ
    output_fastq_json = output_json_file + ".fastq.json"
    output_dir = op.dirname(output_json_file)
    CU.write_fastq_chunks_to_file(output_fastq_json, fastq_file, max_nchunks,
                                  output_dir, "scattered-fastq", "fastq")

    # get fastq_ids from output_fastq_json
    fastq_chunks = load_pipeline_chunks_from_json(output_fastq_json)
    fastq_files = get_datum_from_chunks_by_chunk_key(fastq_chunks,
                                                     "$chunk.fastq_id")
    log.debug("Chunked FASTQ files are %s.", (', '.join(fastq_files)))

    # Writing chunk.json
    chunks = []
    for i, fastq_file in enumerate(fastq_files):
        chunk_id = "_".join(["map_isoforms_to_genome_chunk", str(i)])
        d = {
            Constants.CHUNK_KEYS[0]: fastq_file,
            Constants.CHUNK_KEYS[1]: gmap_ref_file
        }
        c = PipelineChunk(chunk_id, **d)
        chunks.append(c)

    log.info("Writing chunk.json to %s", output_json_file)
    write_pipeline_chunks(chunks, output_json_file,
                          "created by %s" % Constants.TOOL_ID)
    return 0
Code example #25
File: test_pb_io.py Project: yqin22/pbsmrtpipe
    def test_write_chunks(self):
        def f(i):
            return {
                "{c}movie_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX):
                "/path/to_movie-{i}.fofn".format(i=i),
                "{c}region_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX):
                "/path/rgn_{i}.fofn".format(i=i)
            }

        to_i = lambda i: "chunk-id-{i}".format(i=i)
        to_p = lambda i: PipelineChunk(to_i(i), **f(i))

        nchunks = 5
        pipeline_chunks = [to_p(i) for i in xrange(nchunks)]
        log.debug(pipeline_chunks)
        tmp_name = get_temp_file("_chunk.json")

        IO.write_pipeline_chunks(pipeline_chunks, tmp_name,
                                 "Example chunk file")

        pchunks = IO.load_pipeline_chunks_from_json(tmp_name)
        self.assertEquals(len(pchunks), nchunks)
Code example #26
File: chunk_utils.py Project: MShaffar19/pbcoretools
def to_zmw_chunked_datastore_files(datastore_path, reference_path,
                                   max_total_nchunks, chunk_key, dir_name,
                                   base_name, ext):
    """
    dataset_path --- datastore.json file
    """
    datastorefile_objs, dataset_type_id, cls, dataset_ext = datastore_to_datastorefile_objs(
        datastore_path)

    dset = cls(*[f.path for f in datastorefile_objs], strict=True)
    dset.newUuid()
    merged_dataset_xml = os.path.join(dir_name,
                                      base_name + '.merged.' + dataset_ext)
    dset.write(merged_dataset_xml)

    dset = cls(merged_dataset_xml, strict=True)
    kwargs = {"chunks": max_total_nchunks, "zmws": True}
    if cls == TranscriptSet:
        kwargs.update(TRANSCRIPTSET_EXTRA_SPLIT_ARGS)

    dset_chunks = dset.split(**kwargs)

    d = {}
    for i, _dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        # write chunk xml file, e.g., chunk_1.subreadset.xml
        chunk_dataset_path = os.path.abspath(
            os.path.join(dir_name, chunk_id + '.' + dataset_ext))
        _add_chunked_tag_if_missing(_dset)
        _dset.write(chunk_dataset_path)

        # write chunk datastore.json file.
        chunk_datastore_path = os.path.abspath(
            os.path.join(dir_name, chunk_id + '.' + ext))
        dataset_to_datastore(chunk_dataset_path, chunk_datastore_path)
        d[chunk_key] = chunk_datastore_path
        d['$chunk.reference_id'] = reference_path
        c = PipelineChunk(chunk_id, **d)
        yield c
Code example #27
 def to_p(i):
     return PipelineChunk(to_i(i), **f(i))