Example no. 1
    def make_graph(self, in_queue, args):
        parallel_key_dequeue = tuple(in_queue.dequeue()
                                     for _ in range(args.enqueue))
        # read_files: [(file_path, (mmaped_file_handles, a gen)) x N]
        read_files = tuple((path_base, ) + tuple(read_gen)
                           for path_base, read_gen in zip(
                               parallel_key_dequeue,
                               pipeline.local_read_pipeline(
                                   upstream_tensors=parallel_key_dequeue,
                                   columns=self.columns)))
        # need to use tf.tuple to make sure that these are both made ready at the same time
        to_central_gen = (a[1:] for a in read_files)
        pass_around_gen = ((a[0], ) for a in read_files)

        aligner_results, run_first = tuple(
            self.make_central_pipeline(args=args,
                                       input_gen=to_central_gen,
                                       pass_around_gen=pass_around_gen))

        to_writer_gen = tuple(
            (buffer_list_handle, record_id, first_ordinal, num_records,
             file_basename) for buffer_list_handle, num_records, first_ordinal,
            record_id, file_basename in aligner_results)
        written_records = (tuple(a) for a in pipeline.local_write_pipeline(
            upstream_tensors=to_writer_gen,
            compressed=(args.compress_parallel > 0),
            record_types=self.write_columns))
        final_output_gen = zip(
            written_records,
            ((record_id, num_records, first_ordinal, file_basename)
             for _, num_records, first_ordinal, record_id, file_basename in
             aligner_results))
        output = (b + (a, ) for a, b in final_output_gen)
        return output, run_first
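
A minimal driver sketch for a stage like this one, assuming the TF 1.x queue-runner model that the in_queue.dequeue() calls imply; the stage instance, queue capacity and args fields are placeholders rather than anything taken from the source. The main point is that make_graph returns lazy generators, so they have to be forced before the session runs.

# Hypothetical driver, not part of the example above.
import argparse
import tensorflow as tf

args = argparse.Namespace(enqueue=2, compress_parallel=1)   # placeholder values
in_queue = tf.FIFOQueue(capacity=32, dtypes=[tf.string], shapes=[()])
stage = ...   # an instance of the class that defines make_graph above

output, run_first = stage.make_graph(in_queue=in_queue, args=args)
fetches = list(output)      # force the generator so the ops are actually built

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    sess.run(run_first)     # setup ops the stage expects to be run first
    results = sess.run(fetches)
    coord.request_stop()
    coord.join(threads)
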
Example no. 2
def import_sga_local(in_queue,
                     args,
                     outdir=None,
                     parallel_parse=1,
                     feature="NFAT",
                     path="."):
    """
    key: tensor with chunk key string
    local_directory: the "base path" from which these should be read
    column_grouping_factor: the number of keys to put together
    parallel_parse: the parallelism for processing records (decomp)
    """
    manifest = args.dataset
    if 'reference' not in manifest:
        raise Exception(
            "No reference data in manifest {}. Unaligned BAM not yet supported. Please align dataset first."
            .format(args.dataset))

    ref_lens = []
    ref_seqs = []
    for contig in manifest['reference_contigs']:
        ref_lens.append(contig['length'])
        ref_seqs.append(contig['name'])

    parallel_key_dequeue = tuple(in_queue.dequeue()
                                 for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue, columns=['results'])

    result_chunk_list = [list(c) for c in result_chunks]

    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)

    parsed_result = pipeline.join(parsed_results_list,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]

    result = persona_ops.import_sga(results_handle=result_buf,
                                    num_records=num_results,
                                    ref_sequences=ref_seqs,
                                    ref_seq_sizes=ref_lens,
                                    feature=feature,
                                    path=path,
                                    name="importsgaop")

    return result
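
A sketch of how the input queue for a function like this might be fed. The manifest layout (records entries with a 'path' field) is borrowed from Example no. 8 and the file paths are placeholders, so treat the exact field names as assumptions.

# Hypothetical feeding code, not part of the example above.
import argparse
import json
import tensorflow as tf

with open("metadata.json") as f:          # placeholder manifest path
    manifest = json.load(f)

args = argparse.Namespace(dataset=manifest)
chunk_keys = [rec['path'] for rec in manifest['records']]   # field names assumed

in_queue = tf.FIFOQueue(capacity=len(chunk_keys), dtypes=[tf.string], shapes=[()])
enqueue_all = in_queue.enqueue_many([chunk_keys])

import_op = import_sga_local(in_queue, args, feature="NFAT", path=".")

with tf.Session() as sess:
    sess.run(enqueue_all)
    sess.run(import_op)       # with parallel_parse=1, one chunk group per run call
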
Example no. 3
def agd_mark_duplicates_local(in_queue,
                              outdir=None,
                              parallel_parse=1,
                              parallel_write=1,
                              parallel_compress=1):
    """
    key: tensor with chunk key string
    local_directory: the "base path" from which these should be read
    column_grouping_factor: the number of keys to put together
    parallel_parse: the parallelism for processing records (decomp)
    """
  
    parallel_key_dequeue = tuple(in_queue.dequeue()
                                 for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue, columns=['results'])

    result_chunk_list = [list(c) for c in result_chunks]

    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)

    parsed_result = pipeline.join(parsed_results_list,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    # result_buf, num_recs, first_ord, record_id
    # parsed_results = tf.contrib.persona.persona_in_pipe(
    #     key=key, dataset_dir=local_directory, columns=["results"],
    #     parse_parallel=parallel_parse, process_parallel=1)

    print(parsed_result)
    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]
    print(result_buf)

    bpp = persona_ops.buffer_pair_pool(size=0,
                                       bound=False,
                                       name="output_buffer_pair_pool")
    result_out = persona_ops.agd_mark_duplicates(results_handle=result_buf,
                                                 num_records=num_results,
                                                 buffer_pair_pool=bpp,
                                                 name="markdupsop")

    result_to_write = pipeline.join(
        [result_out, num_results, first_ord, record_id],
        parallel=parallel_write,
        capacity=8,
        multi=False)

    compressed = compress_pipeline(result_to_write, parallel_compress)

    written = _make_writers(compressed_batch=list(compressed),
                            output_dir=outdir,
                            write_parallelism=parallel_write)

    recs = list(written)
    all_written_keys = pipeline.join(recs, parallel=1, capacity=8, multi=False)

    return all_written_keys
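
One way to drain a graph like this to completion, assuming an in_queue of chunk keys built elsewhere (for instance as in the sketch after Example no. 2) that is closed once every key has been enqueued; the output directory is a placeholder.

# Hypothetical run loop, not part of the example above.
import tensorflow as tf

all_written_keys = agd_mark_duplicates_local(in_queue,
                                             outdir="/tmp/agd_out",
                                             parallel_parse=1)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        while not coord.should_stop():
            print(sess.run(all_written_keys))   # key of each written chunk
    except tf.errors.OutOfRangeError:
        pass                                    # input queue closed and empty
    finally:
        coord.request_stop()
        coord.join(threads)
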
Example no. 4
    def make_read_stage(self, gate):
        """
        :param gate:
        :return: a generator of [ id_and_count, [ filename ], [ a list of handles in the order of the columns, NOT STACKED ] ]
        """
        # each item in dequeue_ops' components is a single filename
        dequeue_ops = tuple(gate.dequeue() for _ in range(self.read_parallel))
        filenames = (components[0] for _, components in dequeue_ops)
        path_prefix = self.path_prefix
        if path_prefix != "":
            if path_prefix[-1] != "/":
                path_prefix = "{}/".format(path_prefix)
            path_prefix = tf.constant(path_prefix)
            filenames = (tf.string_join((path_prefix, fname))
                         for fname in filenames)
        # zip each dequeued (id_and_count, components) pair with the mmaped
        # handles for its columns; the components entry is just the filename
        read_file_gen = zip(
            dequeue_ops,
            pipeline.local_read_pipeline(upstream_tensors=filenames,
                                         columns=self.columns))
        for a, b in read_file_gen:
            yield tuple(a) + tuple(b)
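
A sketch of a downstream consumer for this stage, assuming only the yield shape documented above: the dequeued (id_and_count, components) pair followed by one unstacked handle per column. The method name is hypothetical.

    def make_process_stage(self, gate):
        # Hypothetical consumer, not part of the source.
        for item in self.make_read_stage(gate):
            id_and_count = item[0]
            filename_components = item[1]   # a single filename, per the docstring
            column_handles = item[2:]       # NOT stacked, one handle per column
            # downstream ops would consume column_handles here
            yield id_and_count, filename_components, column_handles
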
Example no. 5
    def make_read_stage(self, local_gate):
        """

        :param local_gate: components in local_gate: just the basename of intermediate files
        :return: a generator of type (id_and_count, [ handles, to, mmaped, columns ], filenames)
        """
        def gen_filenames():
            for i in range(self.read_parallel):
                idc, comp = local_gate.dequeue()
                assert len(comp) == 1
                filename = comp[0]
                yield idc, filename

        ids_and_counts, filenames = zip(*gen_filenames())
        assert len(filenames) > 0
        read_groups = (tuple(a) for a in pipeline.local_read_pipeline(
            delete_after_use=True,
            upstream_tensors=filenames,
            columns=self.columns,
            name="local_read_merge_pipeline"))

        pool = persona_ops.raw_file_system_column_pool(bound=False, size=0)
        convert = partial(persona_ops.raw_file_converter, column_pool=pool)

        def gen_conversion():
            for read_group in read_groups:
                values = tuple(
                    convert(data=file_handle) for file_handle in read_group)
                handles, record_ids = zip(*values)
                assert len(record_ids) > 0
                yield tf.stack(
                    handles,
                    name="stack_raw_filesystem_columns"), record_ids[0]

        for idc, (handles, record_id), filename in zip(ids_and_counts,
                                                       gen_conversion(),
                                                       filenames):
            yield idc, (handles, record_id, filename)
Example no. 6
def agd_flagstat_local(in_queue,
                       outdir=None,
                       parallel_parse=1,
                       parallel_write=1,
                       parallel_compress=1):
    """
    key: tensor with chunk key string
    local_directory: the "base path" from which these should be read
    column_grouping_factor: the number of keys to put together
    parallel_parse: the parallelism for processing records (decomp)
    """

    parallel_key_dequeue = tuple(in_queue.dequeue()
                                 for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue, columns=['results'])

    result_chunk_list = [list(c) for c in result_chunks]

    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)

    parsed_result = pipeline.join(parsed_results_list,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    # print(parsed_result)
    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]
    # print(result_buf)

    result_out = persona_ops.agd_flagstat(results_handle=result_buf,
                                          num_records=num_results,
                                          name="flagstat")

    return result_out
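
The in_queue these functions expect can also come from tf.train.string_input_producer, which builds a string FIFOQueue together with its own queue runner; the chunk key values below are placeholders.

# Hypothetical input wiring, not part of the example above.
import tensorflow as tf

chunk_keys = ["chunk_file_0", "chunk_file_1"]      # placeholder chunk keys
in_queue = tf.train.string_input_producer(chunk_keys, shuffle=False, num_epochs=1)
flagstat_op = agd_flagstat_local(in_queue, parallel_parse=1)
# num_epochs is backed by a local variable, so run tf.local_variables_initializer()
# before starting the queue runners.
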
Example no. 7
def export_bam(in_queue, args):
    manifest = args.dataset

    if 'reference' not in manifest:
        raise Exception(
            "No reference data in manifest {}. Unaligned BAM not yet supported. Please align dataset first."
            .format(args.dataset))

    #bp_handle = persona_ops.buffer_pool(size=10, bound=False, name="buf_pool")
    #mmap_pool = persona_ops.m_map_pool(size=10,  bound=False, name="file_mmap_buffer_pool")

    columns = ["base", "qual", "metadata", "results"]
    num_secondary = 0
    for column in manifest['columns']:
        if 'secondary' in column:
            columns.append(column)
            num_secondary += 1

    print("BAM output using columns: {}".format(columns))
    # TODO  provide option for reading from Ceph

    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=[in_queue.dequeue()], columns=columns)

    result_chunk_list = [list(c) for c in result_chunks]

    to_parse = pipeline.join(upstream_tensors=result_chunk_list,
                             parallel=args.parallel_parse,
                             multi=True,
                             capacity=8)

    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=to_parse)

    parsed_results_list = list(parsed_results)

    parsed_result = pipeline.join(parsed_results_list,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    # base, qual, meta, result, [secondary], num_recs, first_ord, record_id

    handles = parsed_result[0]
    bases = handles[0]
    quals = handles[1]
    meta = handles[2]
    # give a matrix of all the result columns
    results = tf.stack(handles[3:])
    num_recs = parsed_result[1]
    first_ord = parsed_result[2]

    if args.output_path == "":
        output_path = manifest['name'] + ".bam"
    else:
        output_path = args.output_path

    ref_lens = []
    ref_seqs = []

    for contig in manifest['reference_contigs']:
        ref_lens.append(contig['length'])
        ref_seqs.append(contig['name'])

    sort = manifest['sort'] if 'sort' in manifest else 'unsorted'

    pg_id = "personaAGD"  # TODO get from manifest
    read_group = manifest['name']
    agd_to_bam = persona_ops.agd_output_bam(results_handle=results,
                                            bases_handle=bases,
                                            qualities_handle=quals,
                                            metadata_handle=meta,
                                            num_records=num_recs,
                                            path=output_path,
                                            ref_sequences=ref_seqs,
                                            ref_seq_sizes=ref_lens,
                                            pg_id=pg_id,
                                            read_group=read_group,
                                            sort_order=sort,
                                            num_threads=args.threads)

    return [agd_to_bam], []
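
The args fields read by export_bam are dataset, parallel_parse, output_path and threads. A wiring sketch with placeholder values and a single-slot input queue:

# Hypothetical wiring, not part of the example above.
import argparse
import json
import tensorflow as tf

with open("metadata.json") as f:          # placeholder manifest path
    manifest = json.load(f)               # needs 'name', 'columns', 'reference_contigs'

args = argparse.Namespace(dataset=manifest,
                          parallel_parse=2,
                          output_path="",     # "" falls back to "<dataset name>.bam"
                          threads=4)

in_queue = tf.FIFOQueue(capacity=1, dtypes=[tf.string], shapes=[()])
bam_ops, run_first = export_bam(in_queue, args)   # returns ([agd_to_bam], [])
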
Example no. 8
    def make_graph(self, in_queue, args):

        # TODO remove the _out when we are satisfied it works correctly
        rec_name = args.dataset['records'][0][
            'path'][:-1]  # assuming path name is chunk_file_{ordinal}
        #print("Sorting {} chunks".format(len(args.dataset['records'])))

        parallel_key_dequeue = tuple(in_queue.dequeue()
                                     for _ in range(args.sort_read_parallel))

        # read_files: [(file_path, (mmaped_file_handles, a gen)) x N]
        mmap_pool = persona_ops.m_map_pool(name="mmap_pool",
                                           size=10,
                                           bound=False)

        read_files = list(
            list(a) for a in pipeline.local_read_pipeline(
                upstream_tensors=parallel_key_dequeue,
                columns=self.inter_columns,
                mmap_pool=mmap_pool))
        # need to use tf.tuple to make sure that these are both made ready at the same time

        buf_pool = persona_ops.buffer_pool(size=0, bound=False, name="bufpool")
        bpp = persona_ops.buffer_pair_pool(
            size=0, bound=False, name="local_read_merge_buffer_list_pool")

        sorters = self.make_sort_pipeline(args=args,
                                          input_gen=read_files,
                                          buf_pool=buf_pool,
                                          bufpair_pool=bpp)

        writers = self.make_inter_writers(sorters, args.dataset_dir,
                                          args.write_parallel)

        inter_file_paths = pipeline.join(writers,
                                         parallel=1,
                                         capacity=3,
                                         multi=True,
                                         name="writer_queue")[0]
        inter_file_name = inter_file_paths[-1]

        num_inter_files = int(
            math.ceil(len(args.dataset['records']) / args.column_grouping))

        # these two queues form a barrier, to force downstream to wait until all intermediate superchunks are ready for merge
        # wait for num_inter_files
        f = tf.train.batch([inter_file_name],
                           batch_size=num_inter_files,
                           name="inter_file_batcher")
        # now output them one by one
        files = tf.train.batch([f],
                               enqueue_many=True,
                               batch_size=1,
                               name="inter_file_output")
        full_path = tf.string_join([args.dataset_dir, "/", files])
        # needs to be scalar not shape [1] which seems pretty stupid ...
        full_path_scalar = tf.reshape(full_path, [])

        # may need to add disk read parallelism here
        merge_cols = self.inter_columns
        #if args.order_by == location_value:
        #merge_cols = self.merge_result_columns
        #else:
        #merge_cols = self.merge_meta_columns

        merge_files = list(
            list(a) for a in pipeline.local_read_pipeline(
                upstream_tensors=[full_path_scalar],
                sync=False,
                columns=merge_cols,
                mmap_pool=mmap_pool))
        stacked_chunks = []
        for f in merge_files:
            stacked_chunks.append(tf.stack(f))

        # batch all the intermediate superchunks that are now mmap'd
        chunks_to_merge = tf.train.batch(stacked_chunks,
                                         batch_size=num_inter_files,
                                         name="mapped_inter_files_to_merge")
        merge_tuple = self.make_merge_pipeline(args=args,
                                               chunks_to_merge=chunks_to_merge,
                                               record_name=rec_name,
                                               bpp=bpp)
        # out_tuple = [results, base, qual, meta, record_name, first_ord, num_recs, file_name]

        compress_queue = pipeline.join(merge_tuple,
                                       capacity=4,
                                       parallel=args.compress_parallel,
                                       multi=False,
                                       name="to_compress")

        compressed_bufs = list(self.make_compressors(compress_queue, buf_pool))
        #print(compressed_bufs)
        writers = list(
            list(a) for a in self.make_writers(args, compressed_bufs))

        return writers, []
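
The two tf.train.batch calls above form a barrier: the first blocks until all num_inter_files intermediate file names exist, the second re-emits them one at a time for the merge stage. A standalone illustration of that pattern, using a stand-in name source:

# Minimal, self-contained illustration of the batch-based barrier (assumed
# example data, not project code).
import tensorflow as tf

num_inter_files = 4
names = tf.train.string_input_producer(
    ["intermediate_{}".format(i) for i in range(num_inter_files)], shuffle=False)
inter_file_name = names.dequeue()

# Block until num_inter_files names have been produced ...
f = tf.train.batch([inter_file_name], batch_size=num_inter_files,
                   name="inter_file_batcher")
# ... then hand them downstream one at a time.
files = tf.train.batch([f], enqueue_many=True, batch_size=1,
                       name="inter_file_output")
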