Example #1
def read_pipeline(fastq_file, args):
    mapped_file_pool = persona_ops.m_map_pool(size=0,
                                              bound=False,
                                              name="mmap_pool")
    if args.paired:
        assert (fastq_file.get_shape() == tensor_shape.vector(2))
        files = tf.unstack(fastq_file)
        reader_0 = persona_ops.file_m_map(filename=files[0],
                                          pool_handle=mapped_file_pool,
                                          synchronous=False,
                                          name="file_map_0")
        reader_1 = persona_ops.file_m_map(filename=files[1],
                                          pool_handle=mapped_file_pool,
                                          synchronous=False,
                                          name="file_map_1")
        queued_results = pipeline.join([reader_0, reader_1],
                                       parallel=1,
                                       capacity=2,
                                       name="read_out")
    else:
        reader = persona_ops.file_m_map(filename=fastq_file,
                                        pool_handle=mapped_file_pool,
                                        synchronous=False,
                                        name="file_map")
        queued_results = pipeline.join([reader],
                                       parallel=1,
                                       capacity=2,
                                       name="read_out")
    return queued_results[0]
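
The return convention visible here and in the later snippets: pipeline.join(..., parallel=N) hands back a list of N dequeue results, so callers index [0] when parallel=1 and iterate when N > 1. A minimal hedged sketch of that convention, assuming `pipeline` and `tf` are the modules imported by these examples:

# Sketch only; the return shape of pipeline.join is inferred from the
# surrounding examples, not from its documentation.
upstream = tf.constant("placeholder")      # stand-in for a real upstream tensor

joined = pipeline.join([upstream],         # tensors to enqueue
                       parallel=1,         # number of dequeue results returned
                       capacity=2,
                       name="sketch_join")
single_dequeue = joined[0]                 # parallel=1 -> index the only result

fan_out = pipeline.join([upstream],
                        parallel=4,        # one dequeue per downstream consumer
                        capacity=8,
                        name="sketch_fan_out")
for dequeued in fan_out:                   # len(fan_out) == parallel
    pass                                   # build one downstream stage per result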
Example #2
    def _make_graph(self, upstream_gate):
        # NOTE: this early return short-circuits the graph to even_simpler();
        # the chain-building code below is currently unreachable.
        return self.even_simpler(upstream_gate=upstream_gate)
        increment_constant = tf.constant(self.increment)

        def make_chain(idc, comp, chain_id):
            for idx in range(self.queue_chain_length):
                comp = comp + increment_constant
                idc, comp = pipeline.join(upstream_tensors=(idc, comp),
                                          capacity=2,
                                          parallel=1,
                                          name="chain_{cid}_join_{idx}".format(
                                              idx=idx, cid=chain_id))[0]
            return idc, comp

        local_gate = self.make_local_gate(upstream_gate=upstream_gate)
        idc, comp = local_gate.dequeue(
        )  # has to be split up or join() complains :/
        idcs_and_comps = pipeline.join(upstream_tensors=(idc, comp),
                                       parallel=self.parallel,
                                       capacity=self.parallel * 2,
                                       name="local_head_gate")

        chains = (make_chain(idc=idc, comp=comp, chain_id=idx)
                  for idx, (idc, comp) in enumerate(idcs_and_comps))

        final = pipeline.join(upstream_tensors=chains,
                              parallel=1,
                              multi=True,
                              capacity=self.parallel * 2,
                              name="local_tail_gate")
        return final[0]  # just return idc, comp
Example #3
    def make_central_pipeline(self, inputs, local_head_gate):
        """
        :param inputs:
        :param local_head_gate:
        :return: (id_and_count, record_id, intermediate_name, superchunk_num_records, superchunk_matrix) + rest_of_input
        """
        inputs = sanitize_generator(inputs)
        queue_name = "sort_ready_to_decomp"
        ready_to_decomp = pipeline.join(upstream_tensors=inputs,
                                        parallel=self.decompress_parallel,
                                        capacity=self.pre_decomp_capacity,
                                        multi=True,
                                        name=queue_name,
                                        shared_name=queue_name)
        with tf.name_scope("decompression_stage"):
            ready_to_sort_items = sanitize_generator(
                self.make_decomp_stage(ready_to_decomp=ready_to_decomp))
        assert len(ready_to_sort_items) > 0

        queue_name = "pre_sort_gate"
        example_item = ready_to_sort_items[0]
        pre_sort_gate = gate.StreamingGate(
            name=queue_name,
            shared_name=queue_name,
            id_and_count_upstream=example_item[0],
            sample_tensors=example_item[1:],
            capacity=self.pre_sort_gate_capacity,
            limit_upstream=True,
            limit_downstream=False)
        gate.add_credit_supplier_from_gates(upstream_gate=local_head_gate,
                                            downstream_gate=pre_sort_gate)

        enqueue_ops = tuple(
            pre_sort_gate.enqueue(id_and_count=a[0], components=a[1:])
            for a in ready_to_sort_items)
        gate.add_gate_runner(gate_runner=gate.GateRunner(
            gate=pre_sort_gate, enqueue_ops=enqueue_ops))

        to_sort_ops = tuple(
            pre_sort_gate.dequeue_many(count=self.sort_batch)
            for _ in range(self.sort_parallel))

        with tf.name_scope("sort_stage"):
            sorted = tuple(self.make_sort_stage(ready_to_sort=to_sort_ops))
        sorted_chunks, control_deps = zip(*sorted)

        queue_name = "sort_ready_to_write"
        ready_to_write = pipeline.join(upstream_tensors=sorted_chunks,
                                       control_dependencies=control_deps,
                                       parallel=self.write_parallel,
                                       multi=True,
                                       capacity=self.pre_write_capacity,
                                       name=queue_name,
                                       shared_name=queue_name)

        return ready_to_write
Example #4
    def make_sort_pipeline(self, args, input_gen, buf_pool, bufpair_pool):

        ready_to_process = pipeline.join(
            upstream_tensors=input_gen,
            parallel=args.sort_process_parallel,
            capacity=4,  # multiplied by some factor?
            multi=True,
            name="ready_to_process")
        # need to unpack better here
        multi_column_gen = list(
            pipeline.agd_reader_multi_column_pipeline(
                upstream_tensorz=ready_to_process, buffer_pool=buf_pool))
        # [ [base qual meta result], num_recs, first_ord, record_id ]
        chunks_and_recs = []
        for chunks, num_recs, first_ord, record_id in multi_column_gen:
            entry = []
            for chunk in chunks:
                entry.append(chunk)
            entry.append(num_recs)
            chunks_and_recs.append(entry)

        ready = tf.train.batch_join(chunks_and_recs,
                                    batch_size=args.column_grouping,
                                    allow_smaller_final_batch=True,
                                    name="chunk_batcher")

        name_queue = pipeline.join([name_generator("intermediate_file")],
                                   parallel=args.sort_parallel,
                                   capacity=4,
                                   multi=False,
                                   name="inter_file_gen_q")

        #bpp = persona_ops.buffer_pair_pool(size=0, bound=False, name="local_read_buffer_pair_pool")

        if args.order_by == location_value:
            sorter = persona_ops.agd_sort
        else:
            sorter = persona_ops.agd_sort_metadata

        sorters = []
        for i in range(args.sort_parallel):
            #b, q, m, r, num = ready
            num = ready[-1]
            r = ready[0]  # the sort predicate column must be first
            cols = tf.stack(ready[1:-1])
            superchunk_matrix, num_recs = sorter(buffer_pair_pool=bufpair_pool,
                                                 results_handles=r,
                                                 column_handles=cols,
                                                 num_records=num,
                                                 name="local_read_agd_sort")
            # super chunk is r, b, q, m
            sorters.append([superchunk_matrix, num_recs, name_queue[i]])

        return sorters
Example #5
    def make_inter_writers(self, batch, output_dir, write_parallelism):
        single = pipeline.join(batch,
                               parallel=write_parallelism,
                               capacity=4,
                               multi=True,
                               name="writer_queue")
        types = get_types_for_columns(self.inter_columns)
        #print("inter col types {}".format(types))
        #types = [ "structured", "base_compact", "text", "text"]

        # no uncompressed buffer pair writer yet
        writers = []
        for buf, num_recs, record_id in single:
            w = []
            bufs = tf.unstack(buf)
            for i, b in enumerate(bufs):
                result_key = string_ops.string_join(
                    [output_dir, "/", record_id, ".", self.inter_columns[i]],
                    name="key_string")

                result = persona_ops.agd_file_system_buffer_pair_writer(
                    record_id=record_id,
                    record_type=types[i],
                    resource_handle=b,
                    path=result_key,
                    first_ordinal=0,
                    num_records=tf.to_int32(num_recs))
                w.append(result)
            w.append(record_id)
            writers.append(w)
        return writers
Example #6
    def make_graph_impl(self, local_gate):
        """
        :param local_gate:
        :param args:
        :return: a gen of [ id_and_count, record_id, first_ordinal, num_records, key, namespace, written_records]
        """
        with tf.name_scope("read_stage"):
            # read ops: a generator of [ id_and_count, (key, namespace, [ unstacked list of handles ]) ]
            read_ops = tuple(self.make_read_stage(gate=local_gate)) # tuple so that they're made in scope
        # same as read ops, but flattened for ease of queueing
        column_boundary = len(self.columns)
        read_ops_flattened = tuple((idc,)+tuple(comp[column_boundary])+tuple(comp[:column_boundary]) for idc, comp in read_ops)
        write_ready_inputs = self.make_central_pipeline(inputs=read_ops_flattened)

        with tf.name_scope("write_stage"):
            write_ops = self.make_write_stage(write_ready_inputs=write_ready_inputs)

        queue_name = "written_records"
        all_done = pipeline.join(upstream_tensors=write_ops,
                                 parallel=1, multi=True,
                                 capacity=self.final_sink_capacity,
                                 name=queue_name, shared_name=queue_name)

        assert len(all_done) == 1
        return all_done[0]
Example #7
def _make_writers(compressed_batch, output_dir, write_parallelism):

    compressed_single = pipeline.join(compressed_batch,
                                      parallel=write_parallelism,
                                      capacity=8,
                                      multi=True)

    for buf, num_recs, first_ordinal, record_id in compressed_single:

        first_ord_as_string = string_ops.as_string(first_ordinal,
                                                   name="first_ord_as_string")
        result_key = string_ops.string_join(
            [output_dir, "/", record_id, "_", first_ord_as_string, ".results"],
            name="base_key_string")

        result = persona_ops.agd_file_system_buffer_writer(
            record_id=record_id,
            record_type="structured",
            resource_handle=buf,
            path=result_key,
            compressed=True,
            first_ordinal=first_ordinal,
            num_records=tf.to_int32(num_recs))

        yield result  # writes out the file path key (full path)
Example #8
def writer_pipeline(compressors, write_parallelism, record_id, output_dir,
                    suffix, args):
    prefix_name = tf.constant("{}_".format(record_id), name="prefix_string")
    compressed_batch = pipeline.join(compressors,
                                     parallel=write_parallelism,
                                     capacity=8,
                                     multi=True,
                                     name="write_input")

    for base, meta, first_ordinal, num_recs in compressed_batch:
        first_ord_as_string = string_ops.as_string(first_ordinal,
                                                   name="first_ord_as_string")
        base_key = string_ops.string_join(
            [output_dir, prefix_name, first_ord_as_string, ".", suffix],
            name="base_key_string")
        meta_key = string_ops.string_join(
            [output_dir, prefix_name, first_ord_as_string, ".metadata"],
            name="metadata_key_string")
        base_path = persona_ops.agd_file_system_buffer_writer(
            record_id=record_id,
            record_type="text" if args.protein else "base_compact",
            resource_handle=base,
            path=base_key,
            compressed=True,
            first_ordinal=first_ordinal,
            num_records=tf.to_int32(num_recs))
        meta_path = persona_ops.agd_file_system_buffer_writer(
            record_id=record_id,
            record_type="text",
            resource_handle=meta,
            path=meta_key,
            compressed=True,
            first_ordinal=first_ordinal,
            num_records=tf.to_int32(num_recs))
        yield base_path, meta_path, first_ordinal, num_recs
Example #9
def writer_pipeline(compressors, write_parallelism, record_id, output_dir):
    prefix_name = tf.constant("{}_".format(record_id), name="prefix_string")
    compressed_batch = pipeline.join(compressors,
                                     parallel=write_parallelism,
                                     capacity=8,
                                     multi=True,
                                     name="write_input")

    types = ['base_compact', 'text', 'text', 'structured']
    exts = ['.base', '.qual', '.metadata', '.results']
    for chunk_stacked, first_ordinal, num_recs in compressed_batch:
        chunks = tf.unstack(chunk_stacked)
        first_ord_as_string = string_ops.as_string(first_ordinal,
                                                   name="first_ord_as_string")

        paths = []
        for i, chunk in enumerate(chunks):
            key = string_ops.string_join(
                [output_dir, prefix_name, first_ord_as_string, exts[i]],
                name="key_string")
            paths.append(
                persona_ops.agd_file_system_buffer_writer(
                    record_id=record_id,
                    record_type=types[i],
                    resource_handle=chunk,
                    path=key,
                    compressed=True,
                    first_ordinal=first_ordinal,
                    num_records=tf.to_int32(num_recs)))
        yield paths + [first_ordinal, num_recs]
Example #10
    def make_chain(idc, comp, chain_id):
        for idx in range(self.queue_chain_length):
            comp = comp + increment_constant
            idc, comp = pipeline.join(upstream_tensors=(idc, comp),
                                      capacity=2,
                                      parallel=1,
                                      name="chain_{cid}_join_{idx}".format(
                                          idx=idx, cid=chain_id))[0]
        return idc, comp
Example #11
    def even_simpler(self, upstream_gate):
        local_gate = self.make_local_gate(upstream_gate=upstream_gate)
        idc, comp = local_gate.dequeue()
        name = "chain_{cid}_join".format(cid=0)
        idcs_and_comps = pipeline.join(upstream_tensors=(idc, comp),
                                       capacity=1,
                                       parallel=1,
                                       name=name,
                                       shared_name=name)
        return idcs_and_comps[0]
Example #12
def agd_mark_duplicates_local(in_queue, outdir=None, parallel_parse=1, parallel_write=1, parallel_compress=1):
    """
    key: tensor with chunk key string
    local_directory: the "base path" from which these should be read
    column_grouping_factor: the number of keys to put together
    parallel_parse: the parallelism for processing records (decomp)
    """
  
    parallel_key_dequeue = tuple(in_queue.dequeue() for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(upstream_tensors=parallel_key_dequeue, columns=['results'])

    result_chunk_list = [ list(c) for c in result_chunks ]

    
    parsed_results = pipeline.agd_reader_multi_column_pipeline(upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)

    parsed_result = pipeline.join(parsed_results_list, parallel=1, capacity=8, multi=True)[0]

    # result_buf, num_recs, first_ord, record_id
    #parsed_results = tf.contrib.persona.persona_in_pipe(key=key, dataset_dir=local_directory, columns=["results"], parse_parallel=parallel_parse,
                                                        #process_parallel=1)
  
    print(parsed_result)
    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]
    print(result_buf)

    bpp = persona_ops.buffer_pair_pool(size=0, bound=False, name="output_buffer_pair_pool")
    result_out = persona_ops.agd_mark_duplicates(results_handle=result_buf, num_records=num_results, 
            buffer_pair_pool=bpp, name="markdupsop")

    result_to_write = pipeline.join([result_out, num_results, first_ord, record_id], parallel=parallel_write, 
        capacity=8, multi=False)

    compressed = compress_pipeline(result_to_write, parallel_compress)

    written = _make_writers(compressed_batch=list(compressed), output_dir=outdir, write_parallelism=parallel_write)

    recs = list(written)
    all_written_keys = pipeline.join(recs, parallel=1, capacity=8, multi=False)

    return all_written_keys
Example #13
    def make_graph(self, in_queue, args):
        increment = args.increment
        incr_by = tf.constant(increment, dtype=tf.int64)
        incr_op = tf.to_int64(in_queue.dequeue()) + incr_by
        ready_to_process = pipeline.join(upstream_tensors=(incr_op, ),
                                         parallel=1,
                                         capacity=1,
                                         multi=False,
                                         name="ready_to_process")
        return (ready_to_process, ), []
Example #14
    def make_central_pipeline(self, read_columns, head_gate):
        """
        :param read_columns: a generator of (id_and_count, ([ list, of, file, mmap, handles, ... ], {pass around}))
        :return: a generator of (id_and_count, (chunk_matrix, record_id, {pass around})
        """
        read_columns = sanitize_generator(read_columns)

        if self.order_by == location_value:
            read_columns = sanitize_generator(
                self.make_index_building_stage(read_columns=read_columns))

        # a gen of (id_and_count, components)
        # components = ([ handles, columns ])
        queue_name = "pre_merge_barrier_gate"
        example_idc, example_comp = read_columns[0]
        pre_merge_gate = gate.StreamingGate(
            name=queue_name,
            shared_name=queue_name,
            id_and_count_upstream=example_idc,
            sample_tensors=example_comp,
            capacity=self.pre_merge_gate_capacity,
            limit_upstream=True,
            limit_downstream=False)
        gate.add_credit_supplier_from_gates(upstream_gate=head_gate,
                                            downstream_gate=pre_merge_gate)

        enqueue_ops = tuple(
            pre_merge_gate.enqueue(id_and_count=idc, components=comp)
            for idc, comp in read_columns)
        gate.add_gate_runner(gate_runner=gate.GateRunner(
            gate=pre_merge_gate, enqueue_ops=enqueue_ops))

        to_merge = (pre_merge_gate.dequeue_whole_dataset()
                    for _ in range(self.merge_parallel))

        with tf.name_scope("merge_merge_stage"):
            to_compress = tuple(self.make_merge_stage(merge_batches=to_merge))

        with tf.name_scope("merge_compress_stage"):
            to_write_items = tuple(
                self.make_compress_stage(
                    to_compress=to_compress))  # returns a generator

        control_deps = tuple(a[1] for a in to_write_items)
        to_write_items = tuple(a[0] for a in to_write_items)

        queue_name = "merge_pre_write_queue"
        to_write = pipeline.join(upstream_tensors=to_write_items,
                                 control_dependencies=control_deps,
                                 parallel=self.write_parallel,
                                 capacity=self.pre_write_capacity,
                                 multi=True,
                                 name=queue_name,
                                 shared_name=queue_name)
        return to_write
Example #15
def writer_pipeline(compressors, write_parallelism, record_id, output_dir,
                    compressed):
    prefix_name = tf.constant("{}_".format(record_id), name="prefix_string")

    if compressed:
        write_op = partial(persona_ops.agd_file_system_buffer_writer,
                           compressed=compressed)
        converted_compressors = [
            [a.compressed_buffer
             for a in result_item[:3]] + list(result_item[3:])
            for result_item in compressors
        ]
    else:
        write_op = persona_ops.agd_file_system_buffer_pair_writer
        converted_compressors = compressors

    compressed_batch = pipeline.join(converted_compressors,
                                     parallel=write_parallelism,
                                     capacity=8,
                                     multi=True,
                                     name="write_input")

    for base, qual, meta, first_ordinal, num_recs in compressed_batch:
        first_ord_as_string = string_ops.as_string(first_ordinal,
                                                   name="first_ord_as_string")
        base_key = string_ops.string_join(
            [output_dir, prefix_name, first_ord_as_string, ".base"],
            name="base_key_string")
        qual_key = string_ops.string_join(
            [output_dir, prefix_name, first_ord_as_string, ".qual"],
            name="qual_key_string")
        meta_key = string_ops.string_join(
            [output_dir, prefix_name, first_ord_as_string, ".metadata"],
            name="metadata_key_string")
        base_path = write_op(record_id=record_id,
                             record_type="base_compact",
                             resource_handle=base,
                             path=base_key,
                             first_ordinal=first_ordinal,
                             num_records=tf.to_int32(num_recs))
        qual_path = write_op(record_id=record_id,
                             record_type="text",
                             resource_handle=qual,
                             path=qual_key,
                             first_ordinal=first_ordinal,
                             num_records=tf.to_int32(num_recs))
        meta_path = write_op(record_id=record_id,
                             record_type="text",
                             resource_handle=meta,
                             path=meta_key,
                             first_ordinal=first_ordinal,
                             num_records=tf.to_int32(num_recs))
        yield base_path, qual_path, meta_path, first_ordinal, num_recs
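
The partial(...) call at the top of this example pre-binds the compressed flag so both branches of the if expose the same write_op(...) signature later in the loop; presumably this is functools.partial, imported elsewhere in the module. A self-contained illustration of that pattern (the toy writer below is purely hypothetical):

from functools import partial

def write_buffer(path, compressed=False):
    # toy stand-in for the writer op; appends a suffix when compression is on
    return path + (".gz" if compressed else "")

write_op = partial(write_buffer, compressed=True)   # pre-bind the keyword
assert write_op("chunk_0.base") == "chunk_0.base.gz"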
Example #16
def import_sga_local(in_queue,
                     argsj,
                     outdir=None,
                     parallel_parse=1,
                     feature="NFAT",
                     path="."):
    """
    in_queue: queue whose dequeue yields chunk key strings
    argsj: parsed arguments; argsj.dataset is the dataset manifest
    parallel_parse: the parallelism for processing records (decomp)
    """
    manifest = argsj.dataset
    if 'reference' not in manifest:
        raise Exception(
            "No reference data in manifest {}. Unaligned BAM not yet supported. Please align dataset first."
            .format(argsj.dataset))

    ref_lens = []
    ref_seqs = []
    for contig in manifest['reference_contigs']:
        ref_lens.append(contig['length'])
        ref_seqs.append(contig['name'])

    parallel_key_dequeue = tuple(in_queue.dequeue()
                                 for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue, columns=['results'])

    result_chunk_list = [list(c) for c in result_chunks]

    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)

    parsed_result = pipeline.join(parsed_results_list,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]

    result = persona_ops.import_sga(results_handle=result_buf,
                                    num_records=num_results,
                                    ref_sequences=ref_seqs,
                                    ref_seq_sizes=ref_lens,
                                    feature=feature,
                                    path=path,
                                    name="importsgaop")

    return result
Example #17
def compress_pipeline(converters, compress_parallelism):
    converted_batch = pipeline.join(converters,
                                    parallel=compress_parallelism,
                                    capacity=8,
                                    multi=True,
                                    name="compress_input")

    buf_pool = persona_ops.buffer_pool(size=0, bound=False, name="bufpool")

    for base, meta, first_ord, num_recs in converted_batch:
        base_buf = persona_ops.buffer_pair_compressor(buffer_pool=buf_pool,
                                                      buffer_pair=base)
        meta_buf = persona_ops.buffer_pair_compressor(buffer_pool=buf_pool,
                                                      buffer_pair=meta)

        yield base_buf, meta_buf, first_ord, num_recs
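
For context, Example #12 above shows how a compress_pipeline of this kind is consumed: the yielded tuples are materialized with list(...) and handed to a writer stage. A wiring sketch restating that example (result_to_write, parallel_compress, outdir and parallel_write are assumed to be defined as they are there):

compressed = compress_pipeline(result_to_write, parallel_compress)
written = _make_writers(compressed_batch=list(compressed),
                        output_dir=outdir,
                        write_parallelism=parallel_write)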
Example #18
    def make_index_building_stage(self, read_columns):
        queue_name = "index_building_queue"
        read_columns = tuple((a, ) + tuple(b) for a, b in read_columns)
        to_convert = pipeline.join(upstream_tensors=read_columns,
                                   parallel=self.index_parallel,
                                   capacity=self.index_capacity,
                                   multi=True,
                                   name=queue_name,
                                   shared_name=queue_name)

        pool = persona_ops.results_index_pool(bound=False, size=0)
        for all_components in to_convert:
            idc = all_components[0]
            components = all_components[1:]
            results_column = components[0][0]  # first column of chunk matrix
            results_index = persona_ops.results_index_creator(
                index_pool=pool, column=results_column)
            yield idc, (results_index, ) + tuple(components)
Example #19
def compress_pipeline(converters, compress_parallelism):
    converted_batch = pipeline.join(converters,
                                    parallel=compress_parallelism,
                                    capacity=8,
                                    multi=True,
                                    name="compress_input")

    buf_pool = persona_ops.buffer_pool(size=0, bound=False, name="bufpool")

    for chunk, first_ord, num_recs in converted_batch:
        cols = tf.unstack(chunk)
        out = []
        for col in cols:
            out.append(
                persona_ops.buffer_pair_compressor(buffer_pool=buf_pool,
                                                   buffer_pair=col))

        out_stacked = tf.stack(out)
        yield out_stacked, first_ord, num_recs
Example #20
    def make_graph_impl(self, local_gate):
        # :return: a generator of (id_and_count, record_id, first_ordinal, num_records, file_basename) + (list, of, full, file, paths)
        with tf.name_scope("merge_read"):
            ready_to_merge_items = self.make_read_stage(local_gate=local_gate)

        with tf.name_scope("merge"):
            ready_to_write_items = self.make_central_pipeline(
                read_columns=ready_to_merge_items, head_gate=local_gate)

        with tf.name_scope("merge_write"):
            completed_items = self.make_write_stage(
                ready_to_write_items=ready_to_write_items)

        final_name = "merge_completed_items_queue"
        return pipeline.join(upstream_tensors=completed_items,
                             parallel=self.sink_parallel,
                             capacity=self.final_capacity,
                             multi=True,
                             name=final_name,
                             shared_name=final_name)
Example #21
    def make_graph_impl(self, local_gate):
        with tf.name_scope("read_stage"):
            read_results = self.make_read_stage(gate=local_gate)

        ready_to_write = self.make_central_pipeline(inputs=read_results,
                                                    local_head_gate=local_gate)

        with tf.name_scope("write_stage"):
            write_results = self.make_write_stage(
                write_ready_inputs=ready_to_write)

        queue_name = "completed"
        sink_queue = pipeline.join(upstream_tensors=write_results,
                                   parallel=self.sink_parallel,
                                   multi=True,
                                   capacity=self.final_sink_capacity,
                                   name=queue_name,
                                   shared_name=queue_name)
        return tuple(
            s[:-1] for s in
            sink_queue)  # :-1 to leave off the file records that aren't needed
Example #22
    def make_writers(self, args, compressed_bufs):
        compressed_buf = pipeline.join(compressed_bufs,
                                       capacity=4,
                                       multi=True,
                                       parallel=1,
                                       name="final_write_queue")[0]

        # add parallelism here if necessary to saturate write bandwidth
        # [compressed_matrix, record_name, first_ord, num_recs, file_name]

        #print(compressed_buf)
        # upstream_tensors: a list of tensor tuples of type: buffer_list_handle, record_id, first_ordinal, num_records, file_path
        #types = self.records_type_location if args.order_by == location_value else self.records_type_metadata
        types = get_record_types_for_columns(args.order_by, self.inter_columns)
        #print("final write types {}".format(types))
        writers = pipeline.local_write_pipeline(
            upstream_tensors=[compressed_buf],
            compressed=True,
            record_types=types,
            name="local_write_pipeline")

        return writers
Example #23
def agd_flagstat_local(in_queue,
                       outdir=None,
                       parallel_parse=1,
                       parallel_write=1,
                       parallel_compress=1):
    """
    key: tensor with chunk key string
    local_directory: the "base path" from which these should be read
    column_grouping_factor: the number of keys to put together
    parallel_parse: the parallelism for processing records (decomp)
    """

    parallel_key_dequeue = tuple(in_queue.dequeue()
                                 for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue, columns=['results'])

    result_chunk_list = [list(c) for c in result_chunks]

    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)

    parsed_result = pipeline.join(parsed_results_list,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    # print(parsed_result)
    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]
    # print(result_buf)

    result_out = persona_ops.agd_flagstat(results_handle=result_buf,
                                          num_records=num_results,
                                          name="flagstat")

    return result_out
Example #24
    def _make_graph(self, upstream_gate):
        def gen_delete_ops():
            remove_op = partial(persona_ops.ceph_remove,
                                cluster_name=self.ceph_cluster_name,
                                user_name=self.ceph_user_name,
                                pool_name=self.ceph_pool_name,
                                columns=self.columns,
                                ceph_conf_path=str(self.ceph_conf_path))
            for idx in range(self.delete_parallel):
                id_and_count, components = upstream_gate.dequeue_many(
                    count=self.global_batch)
                keys, namespaces = components
                num_items_deleted = remove_op(keys=keys, namespaces=namespaces)
                yield id_and_count, num_items_deleted

        items = tuple(gen_delete_ops())
        queue_name = "nullceph_final"
        return pipeline.join(upstream_tensors=items,
                             parallel=self.sink_parallel,
                             capacity=self.delete_parallel + 1,
                             multi=True,
                             name=queue_name,
                             shared_name=queue_name)
Example #25
    def make_graph_impl(self, local_gate):
        """
        :param local_gate:
        :param args:
        :return: a gen of [ id_and_count, record_id, first_ordinal, num_records, file_basename, written_records]
        """
        with tf.name_scope("read_stage"):
            # read ops: [ id_and_count, [ filename ], [ a list of handles in the order of the columns, NOT STACKED ] ]
            read_ops = tuple(self.make_read_stage(gate=local_gate))
        # same as read ops, but flattened for ease of queueing
        read_ops_flattened = tuple((a[0],)+tuple(a[2:])+tuple(a[1]) for a in read_ops)
        write_ready_inputs = self.make_central_pipeline(inputs=read_ops_flattened)

        with tf.name_scope("write_stage"):
            write_ops = self.make_write_stage(write_ready_inputs=write_ready_inputs)

        queue_name = "written_records"
        all_done = pipeline.join(upstream_tensors=write_ops,
                                 parallel=1, multi=True,
                                 capacity=self.final_sink_capacity,
                                 name=queue_name, shared_name=queue_name)
        assert len(all_done) == 1
        return all_done[0]
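
The read_ops_flattened expression above just reorders each read-op tuple so pipeline.join can enqueue it as one flat unit: element 0 (id_and_count) stays first, element 1 (the filename list) moves to the end, and everything after it goes in the middle. A plain-Python sketch with placeholder values (the exact element types are assumptions):

# Pure-Python illustration of the reordering expression used above.
a = ("id_and_count", ["chunk_basename"], "handle_0", "handle_1")  # placeholder structure
flat = (a[0],) + tuple(a[2:]) + tuple(a[1])
assert flat == ("id_and_count", "handle_0", "handle_1", "chunk_basename")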
Example #26
def export_bam(in_queue, args):
    manifest = args.dataset

    if 'reference' not in manifest:
        raise Exception(
            "No reference data in manifest {}. Unaligned BAM not yet supported. Please align dataset first."
            .format(args.dataset))

    #bp_handle = persona_ops.buffer_pool(size=10, bound=False, name="buf_pool")
    #mmap_pool = persona_ops.m_map_pool(size=10,  bound=False, name="file_mmap_buffer_pool")

    columns = ["base", "qual", "metadata", "results"]
    num_secondary = 0
    for column in manifest['columns']:
        if 'secondary' in column:
            columns.append(column)
            num_secondary += 1

    print("BAM output using columns: {}".format(columns))
    # TODO  provide option for reading from Ceph

    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=[in_queue.dequeue()], columns=columns)

    result_chunk_list = [list(c) for c in result_chunks]

    to_parse = pipeline.join(upstream_tensors=result_chunk_list,
                             parallel=args.parallel_parse,
                             multi=True,
                             capacity=8)

    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=to_parse)

    parsed_results_list = list(parsed_results)

    parsed_result = pipeline.join(parsed_results_list,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    # base, qual, meta, result, [secondary], num_recs, first_ord, record_id

    handles = parsed_result[0]
    bases = handles[0]
    quals = handles[1]
    meta = handles[2]
    # give a matrix of all the result columns
    results = tf.stack(handles[3:])
    num_recs = parsed_result[1]
    first_ord = parsed_result[2]

    if args.output_path == "":
        output_path = manifest['name'] + ".bam"
    else:
        output_path = args.output_path

    ref_lens = []
    ref_seqs = []

    for contig in manifest['reference_contigs']:
        ref_lens.append(contig['length'])
        ref_seqs.append(contig['name'])

    sort = manifest['sort'] if 'sort' in manifest else 'unsorted'

    pg_id = "personaAGD"  # TODO get from manifest
    read_group = manifest['name']
    agd_to_bam = persona_ops.agd_output_bam(results_handle=results,
                                            bases_handle=bases,
                                            qualities_handle=quals,
                                            metadata_handle=meta,
                                            num_records=num_recs,
                                            path=output_path,
                                            ref_sequences=ref_seqs,
                                            ref_seq_sizes=ref_lens,
                                            pg_id=pg_id,
                                            read_group=read_group,
                                            sort_order=sort,
                                            num_threads=args.threads)

    return [agd_to_bam], []
Example #27
    def make_central_pipeline(self, inputs):
        """
        Make the central pipeline between the custom read and write operations
        :param args:
        :param inputs: a generator of type (id_and_count, column0, column1, ..., [:rest of input]). The number of colums is assumed to be the same and in the same order as self.columns
        :return: a generator of [ compressed_results_column_matrix, num_records, first_ordinal, record_id, id_and_count, {rest of input} ]
        """

        if not isinstance(inputs, (list, tuple)):
            inputs = tuple(inputs)

        # type of each of these: (id_and_count, column0, column1, ..., [:rest of input])
        queue_name = "align_ready_to_decomp"
        ready_to_decomp = pipeline.join(upstream_tensors=inputs,
                                        parallel=self.decompress_parallel,
                                        capacity=self.pre_decomp_capacity, multi=True,
                                        name=queue_name, shared_name=queue_name)
        with tf.name_scope("decompression_stage"):
            ready_to_align_items = self.make_decomp_stage(ready_to_decomp=ready_to_decomp)

        queue_name = "ready_to_align"
        ready_to_align = pipeline.join(upstream_tensors=ready_to_align_items,
                                       parallel=self.align_parallel,
                                       capacity=self.pre_align_capacity, multi=True,
                                       name=queue_name, shared_name=queue_name)

        with tf.name_scope("align_stage"):
            ready_to_compress_items = self.make_align_stage(ready_to_align=ready_to_align)

        queue_name = "align_ready_to_compress"
        ready_to_compress = pipeline.join(upstream_tensors=ready_to_compress_items,
                                          parallel=self.compress_parallel,
                                          capacity=self.pre_compress_capacity, multi=True,
                                          name=queue_name, shared_name=queue_name)

        with tf.name_scope("compress_stage"):
            ready_to_write_items = tuple(self.make_compress_stage(ready_to_compress=ready_to_compress))

        def gen_control_deps():
            for item in ready_to_write_items:
                num_records, ordinal, record_id = item[1:4]
                item_id = slice_id(item[4])
                with tf.control_dependencies((item_id,)):
                    ts = gate.unix_timestamp(name="align_tail_timestamp")
                yield (gate.log_events(
                    item_names=("id", "time", "ordinal", "record_id", "num_records"),
                    directory=self.log_directory,
                    event_name="align_tail",
                    name="align_tail_event_logger",
                    components=(item_id, ts, ordinal, record_id, num_records)
                ),)

        control_deps = []
        if self.log_goodput:
            control_deps.extend(gen_control_deps())

        queue_name = "ready_to_write"
        ready_to_write = pipeline.join(upstream_tensors=ready_to_write_items,
                                       control_dependencies=control_deps,
                                       parallel=self.write_parallel,
                                       capacity=self.pre_write_capacity, multi=True,
                                       name=queue_name, shared_name=queue_name)
        return ready_to_write
Example #28
def execute(args, modules):
  record_stats = args.record
  stats_directory = args.record_directory
 
  module = modules[args.command]

  if hasattr(args, 'service'):
    service_mode = args.service
    service = module.lookup_service(name=service_mode)
  else:
    # there is only one service if args does not have a .service attribute
    service = module.get_services()[0]
    
  run_arguments = tuple(service.extract_run_args(args=args))

  in_queue = tf.train.input_producer(input_tensor=run_arguments, num_epochs=1, shuffle=False, capacity=len(run_arguments))

  # TODO currently we assume all the service_ops are the same
  service_ops, service_init_ops = service.make_graph(in_queue=in_queue,
                                                     args=args)
  if not isinstance(service_ops, list):
      service_ops = list(service_ops)
  assert len(service_ops) + len(service_init_ops) > 0

  has_service_ops = len(service_ops) > 0
  if has_service_ops:
      service_sink = pipeline.join(upstream_tensors=service_ops, capacity=64, parallel=1, multi=True, name="global_sink_queue")

  init_ops = [tf.global_variables_initializer(), tf.local_variables_initializer()]

  # service graph may have summary nodes
  summary = args.summary if hasattr(args, 'summary') else False

  results = []
  stats_results = {}
  with tf.Session() as sess:
      if summary and has_service_ops:
          trace_dir = setup_output_dir(dirname=args.command + "_summary")
          service_sink.append(tf.summary.merge_all())
          summary_writer = tf.summary.FileWriter(trace_dir, graph=sess.graph, max_queue=2**20, flush_secs=10**4)
      else:
          summary = False

      count = 0
      sess.run(init_ops)
      if len(service_init_ops) > 0:
        res = sess.run(service_init_ops)
        if summary:
          results.append(res[:-1])
        else:
          results.append(res)
          #sess.run(service_init_ops)

      # it's possible the service is a simple run-once
      if len(service_ops) > 0:
          with contextlib.ExitStack() as stack:
              if record_stats:
                  stack.enter_context(recorder.UsageRecorder(stats_results))
              coord = tf.train.Coordinator()
              print("Local executor starting {} ...".format(args.command))
              threads = tf.train.start_queue_runners(coord=coord, sess=sess)
              while not coord.should_stop():
                  try:
                      #print("Running round {}".format(count))
                      result = sess.run(service_sink)
                      count += 1
                      if summary:
                          results.append(result[:-1])
                          summary_writer.add_summary(result[-1], global_step=count)
                      else:
                          results.append(result)
                  except tf.errors.OutOfRangeError:
                      #print('Got out of range error!')
                      break
              print("Local executor finishing ...")
              coord.request_stop()
              coord.join(threads, stop_grace_period_secs=10)

          service.on_finish(args, results)
  if summary:
      summary_writer.flush()
      summary_writer.close()
  if record_stats:
      params = vars(args)
      del params["func"]
      stats_results["params"] = vars(args)
      with open(create_unique_file(directory=stats_directory, prefix="runtime_stats", suffix=".json"), 'w+') as fl:
        json.dump(stats_results, fl)
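
The execute() function above relies on only a few service methods: extract_run_args, make_graph and on_finish on the service, plus lookup_service/get_services on the module. A hedged, minimal stub of that implied interface, with a make_graph body modeled on Example #13; the class name and the args.items attribute are illustrative assumptions:

# Illustrative stub of the service interface implied by execute() above.
# Only the method names are taken from the calls in that function.
class ExampleService:
    def extract_run_args(self, args):
        # values fed to tf.train.input_producer as the run inputs
        return args.items  # assumed attribute, one entry per run

    def make_graph(self, in_queue, args):
        # mirrors Example #13: one trivial op behind a join queue
        incremented = tf.to_int64(in_queue.dequeue()) + 1
        ready = pipeline.join(upstream_tensors=(incremented, ),
                              parallel=1,
                              capacity=1,
                              multi=False,
                              name="ready_to_process")
        return (ready, ), []  # (service_ops, service_init_ops)

    def on_finish(self, args, results):
        print("completed {} rounds".format(len(results)))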
Example #29
def execute(args, modules):
  module = modules[args.dist_command]

  if hasattr(args, 'service'):
    service_mode = args.service
    service = module.lookup_service(name=service_mode)
  else:
    # there is only one service if args does not have a .service attribute
    service = module.get_services()[0]

  if not service.distributed_capability():
    raise Exception("Service {} does not support distributed execution".format(args.service))

  task_index = args.task_index
  queue_index = args.queue_index
  cluster_spec = dist_common.make_cluster_spec(cluster_members=args.cluster_members)
  for idx in (task_index, queue_index):
      # this checks if the task index is in cluster_def
      # will throw an exception if not found
      cluster_spec.task_address(job_name=cluster_name, task_index=idx)

  input_dtypes = service.input_dtypes(args=args)
  input_shapes = service.input_shapes(args=args)
  output_dtypes = service.output_dtypes(args=args)
  output_shapes = service.output_shapes(args=args)
  service_name = args.dist_command + "_" + service.get_shortname()

  in_queue, out_queue = dist_common.make_common_queues(service_name=service_name,
                                                       queue_index=queue_index,
                                                       cluster_name=cluster_name,
                                                       input_dtypes=input_dtypes,
                                                       input_shapes=input_shapes,
                                                       output_dtypes=output_dtypes,
                                                       output_shapes=output_shapes)

  with tf.device("/job:{cluster_name}/task:{task_idx}".format(cluster_name=cluster_name, task_idx=task_index)): # me
      service_ops, service_init_ops = service.make_graph(in_queue=in_queue,
                                                         args=args)
      service_ops = tuple(service_ops)
      assert len(service_ops) + len(service_init_ops) > 0

      init_ops = [tf.global_variables_initializer(), tf.local_variables_initializer()]

      # TODO should a final join (if necessary) be moved into the service itself?

      service_sink = pipeline.join(upstream_tensors=service_ops, capacity=32, parallel=1, multi=True, name="sink_join")[0]

  queue_device = dist_common.make_queue_device_name(cluster_name=cluster_name, queue_index=queue_index)
  with tf.device(queue_device):
      final_op = out_queue.enqueue(service_sink, name="final_queue_enqueue_task_{}".format(task_index))
  tf.train.add_queue_runner(qr=tf.train.QueueRunner(queue=out_queue, enqueue_ops=(final_op,)))

  # start our local server
  server = tf.train.Server(cluster_spec, config=None, job_name=cluster_name, task_index=task_index)
  log.debug("Persona distributed runtime starting TF server for index {}".format(task_index))

  with tf.Session(server.target) as sess:
      sess.run(init_ops)
      if len(service_init_ops) > 0:
          sess.run(service_init_ops)

      # it's possible the service is a simple run-once
      if len(service_ops) > 0:
          coord = tf.train.Coordinator()
          uninitialized_vars = tf.report_uninitialized_variables()
          while len(sess.run(uninitialized_vars)) > 0:
              log.debug("Waiting for uninitialized variables")
              time.sleep(startup_wait_time)

          log.debug("All variables initialized. Persona dist executor starting {} ...".format(args.dist_command))
          threads = tf.train.start_queue_runners(coord=coord, sess=sess)
          log.debug("Queue runners started. Waiting on coordinator to signal stop...")
          coord.wait_for_stop()
          timeout_time=60*3
          try:
              coord.join(threads=threads, stop_grace_period_secs=timeout_time)
          except RuntimeError:
              log.error("Unable to wait for coordinator to stop all threads after {} seconds".format(timeout_time))
          else:
              log.debug("All threads joined and dead")
Example #30
    def make_central_pipeline(self, args, input_gen, pass_around_gen):

        self.write_columns.append('results')
        for i in range(args.max_secondary):
            self.write_columns.append('secondary{}'.format(i))

        self.write_columns = [{
            "type": "structured",
            "extension": a
        } for a in self.write_columns]

        joiner = tuple(
            tuple(a) + tuple(b) for a, b in zip(input_gen, pass_around_gen))
        ready_to_process = pipeline.join(
            upstream_tensors=joiner,
            parallel=args.parallel,
            capacity=args.parallel,  # multiplied by some factor?
            multi=True,
            name="ready_to_process")
        # need to unpack better here
        to_agd_reader, pass_around_agd_reader = zip(
            *((a[:2], a[2:]) for a in ready_to_process))
        multi_column_gen = pipeline.agd_reader_multi_column_pipeline(
            upstream_tensorz=to_agd_reader)

        def process_processed_bufs():
            for processed_column, pass_around in zip(multi_column_gen,
                                                     pass_around_agd_reader):
                if isinstance(pass_around, tf.Tensor):
                    pass_around = (pass_around, )
                yield tuple(
                    a for a in itertools.chain(processed_column, pass_around))

        processed_bufs = tuple(a for a in process_processed_bufs())
        ready_to_assemble = pipeline.join(
            upstream_tensors=processed_bufs,
            parallel=args.assemblers,
            capacity=args.assemblers * 2,
            multi=True,
            name="ready_to_assemble"
        )  # TODO these params are kinda arbitrary :/
        # ready_to_assemble: [output_buffers, num_records, first_ordinal, record_id, pass_around {flattened}) x N]
        to_assembler, pass_around_assembler = zip(
            *((a[:2], a[1:]) for a in ready_to_assemble))
        # each item out of this is a handle to AGDReads
        agd_read_assembler_gen = tuple(
            pipeline.agd_read_assembler(upstream_tensors=to_assembler,
                                        include_meta=False))
        # assembled_records, ready_to_align: [(agd_reads_handle, (num_records, first_ordinal, record_id), (pass_around)) x N]
        assembled_records_gen = tuple(
            zip(agd_read_assembler_gen, pass_around_assembler))
        assembled_records = tuple(
            (a, ) + tuple(b) for a, b in assembled_records_gen)
        ready_to_align = pipeline.join(
            upstream_tensors=assembled_records,
            parallel=args.aligners,
            capacity=int(args.aligners * 1.5),
            multi=True,
            name="ready_to_align")  # TODO still have default capacity here :/

        if args.paired:
            aligner_type = persona_ops.snap_align_paired
            aligner_options = persona_ops.paired_aligner_options(
                cmd_line=args.snap_args.split(), name="paired_aligner_options")
            executor_type = persona_ops.snap_paired_executor
        else:
            aligner_type = persona_ops.snap_align_single
            aligner_options = persona_ops.aligner_options(
                cmd_line=args.snap_args.split(), name="aligner_options"
            )  # -o output.sam will not actually do anything
            executor_type = persona_ops.snap_single_executor

        first_assembled_result = ready_to_align[0][1:]
        sink_queue_shapes = [a.get_shape() for a in first_assembled_result]
        sink_queue_dtypes = [a.dtype for a in first_assembled_result]

        aligner_dtype = tf.string
        aligner_shape = (args.max_secondary + 1, 2)
        sink_queue_shapes.append(aligner_shape)
        sink_queue_dtypes.append(aligner_dtype)

        pass_around_aligners = tuple(
            a[1:] for a in ready_to_align
        )  # type: [(num_records, first_ordinal, record_id, pass_around x N) x N]
        pass_to_aligners = tuple(a[0] for a in ready_to_align)

        buffer_list_pool = persona_ops.buffer_list_pool(
            **pipeline.pool_default_args)
        genome = persona_ops.genome_index(genome_location=args.index_path,
                                          name="genome_loader")

        def make_aligners():
            single_executor = executor_type(num_threads=args.aligner_threads,
                                            work_queue_size=args.aligners + 1,
                                            options_handle=aligner_options,
                                            genome_handle=genome)
            for read_handle, pass_around in zip(pass_to_aligners,
                                                pass_around_aligners):
                aligner_results = aligner_type(
                    read=read_handle,
                    buffer_list_pool=buffer_list_pool,
                    subchunk_size=args.subchunking,
                    executor_handle=single_executor,
                    max_secondary=args.max_secondary)
                yield (aligner_results, ) + tuple(pass_around)

        aligners = tuple(make_aligners())
        # aligners: [(buffer_list_handle, num_records, first_ordinal, record_id, pass_around X N) x N], that is COMPLETELY FLAT
        if args.compress_parallel > 0:
            aligner_results_to_compress = pipeline.join(
                upstream_tensors=aligners,
                parallel=args.compress_parallel,
                multi=True,
                capacity=4,
                name="ready_to_compress")
            to_compressors = (a[0] for a in aligner_results_to_compress)
            around_compressors = (a[1:] for a in aligner_results_to_compress)
            compressed_buffers = pipeline.aligner_compress_pipeline(
                upstream_tensors=to_compressors)
            after_compression = (
                (a, ) + tuple(b)
                for a, b in zip(compressed_buffers, around_compressors))
            aligners = tuple(after_compression)

        aligned_results = pipeline.join(upstream_tensors=aligners,
                                        parallel=args.writers,
                                        multi=True,
                                        capacity=4,
                                        name="aligned_results")

        ref_seqs, lens = persona_ops.snap_index_reference_sequences(
            genome_handle=genome)
        # Taking this out because it currently breaks distributed runtime
        return aligned_results, (
            genome, ref_seqs, lens
        )  # returns [(buffer_list_handle, num_records, first_ordinal, record_id, pass_around X N) x N], that is COMPLETELY FLAT