def read_pipeline(fastq_file, args):
    """Map the input FASTQ file(s) and queue the resulting handles.

    :param fastq_file: filename tensor; when ``args.paired`` is set it must
        have the shape of a 2-element vector (one filename per mate)
    :param args: only ``args.paired`` is read
    :return: the first (and, with ``parallel=1``, only) element produced by
        the ``read_out`` join
    """
    pool = persona_ops.m_map_pool(size=0, bound=False, name="mmap_pool")

    def map_file(path, op_name):
        # every reader shares the same pool and is asynchronous
        return persona_ops.file_m_map(filename=path,
                                      pool_handle=pool,
                                      synchronous=False,
                                      name=op_name)

    if not args.paired:
        readers = [map_file(fastq_file, "file_map")]
    else:
        assert (fastq_file.get_shape() == tensor_shape.vector(2))
        mates = tf.unstack(fastq_file)
        readers = [
            map_file(mates[0], "file_map_0"),
            map_file(mates[1], "file_map_1"),
        ]

    return pipeline.join(readers, parallel=1, capacity=2, name="read_out")[0]
def _make_graph(self, upstream_gate):
    """Build this service's graph.

    The only live statement is the first one: the call is delegated to
    ``self.even_simpler`` and its result is returned.

    NOTE(review): everything after that unconditional ``return`` is
    unreachable. It looks like the full "chained joins" variant that was
    short-circuited on purpose -- confirm before deleting or re-enabling.

    :param upstream_gate: forwarded to ``even_simpler``
    :return: whatever ``even_simpler`` returns
    """
    return self.even_simpler(upstream_gate=upstream_gate)

    # ---- unreachable from here on (see NOTE above) ----
    increment_constant = tf.constant(self.increment)

    def make_chain(idc, comp, chain_id):
        # adds the increment and re-queues the pair queue_chain_length times
        for idx in range(self.queue_chain_length):
            comp = comp + increment_constant
            idc, comp = pipeline.join(upstream_tensors=(idc, comp),
                                      capacity=2,
                                      parallel=1,
                                      name="chain_{cid}_join_{idx}".format(
                                          idx=idx, cid=chain_id))[0]
        return idc, comp

    local_gate = self.make_local_gate(upstream_gate=upstream_gate)
    idc, comp = local_gate.dequeue(
    )  # has to be split up or join() complains :/
    idcs_and_comps = pipeline.join(upstream_tensors=(idc, comp),
                                   parallel=self.parallel,
                                   capacity=self.parallel * 2,
                                   name="local_head_gate")
    # lazy: each chain is only built when the tail join iterates this generator
    chains = (make_chain(idc=idc, comp=comp, chain_id=idx)
              for idx, (idc, comp) in enumerate(idcs_and_comps))
    final = pipeline.join(upstream_tensors=chains,
                          parallel=1,
                          multi=True,
                          capacity=self.parallel * 2,
                          name="local_tail_gate")
    return final[0]  # just return idc, comp
def make_central_pipeline(self, inputs, local_head_gate):
    """Wire decompression -> pre-sort gate -> sort -> pre-write queue.

    :param inputs: upstream items; passed through ``sanitize_generator``
    :param local_head_gate: gate used as the upstream end of the credit link
        to the pre-sort gate
    :return: the ``sort_ready_to_write`` join output. The original notes
        describe each item as (id_and_count, record_id, intermediate_name,
        superchunk_num_records, superchunk_matrix) + rest_of_input
    """
    upstream_items = sanitize_generator(inputs)

    decomp_queue = "sort_ready_to_decomp"
    ready_to_decomp = pipeline.join(upstream_tensors=upstream_items,
                                    parallel=self.decompress_parallel,
                                    capacity=self.pre_decomp_capacity,
                                    multi=True,
                                    name=decomp_queue,
                                    shared_name=decomp_queue)

    with tf.name_scope("decompression_stage"):
        decompressed = sanitize_generator(
            self.make_decomp_stage(ready_to_decomp=ready_to_decomp))
    assert len(decompressed) > 0

    # The first item serves as the template for the gate's tensors.
    gate_name = "pre_sort_gate"
    template = decompressed[0]
    pre_sort_gate = gate.StreamingGate(name=gate_name,
                                       shared_name=gate_name,
                                       id_and_count_upstream=template[0],
                                       sample_tensors=template[1:],
                                       capacity=self.pre_sort_gate_capacity,
                                       limit_upstream=True,
                                       limit_downstream=False)
    gate.add_credit_supplier_from_gates(upstream_gate=local_head_gate,
                                        downstream_gate=pre_sort_gate)

    enqueue_ops = tuple(
        pre_sort_gate.enqueue(id_and_count=item[0], components=item[1:])
        for item in decompressed)
    runner = gate.GateRunner(gate=pre_sort_gate, enqueue_ops=enqueue_ops)
    gate.add_gate_runner(gate_runner=runner)

    # one batched dequeue per parallel sorter
    to_sort_ops = tuple(
        pre_sort_gate.dequeue_many(count=self.sort_batch)
        for _ in range(self.sort_parallel))

    with tf.name_scope("sort_stage"):
        sort_outputs = tuple(self.make_sort_stage(ready_to_sort=to_sort_ops))
    sorted_chunks, control_deps = zip(*sort_outputs)

    write_queue = "sort_ready_to_write"
    return pipeline.join(upstream_tensors=sorted_chunks,
                         control_dependencies=control_deps,
                         parallel=self.write_parallel,
                         multi=True,
                         capacity=self.pre_write_capacity,
                         name=write_queue,
                         shared_name=write_queue)
def make_sort_pipeline(self, args, input_gen, buf_pool, bufpair_pool):
    """Read column chunks, batch them, and create the parallel sort ops.

    :param args: reads ``sort_process_parallel``, ``column_grouping``,
        ``sort_parallel`` and ``order_by``
    :param input_gen: upstream tensors for the ``ready_to_process`` join
    :param buf_pool: buffer pool handed to the multi-column reader
    :param bufpair_pool: buffer pair pool handed to each sorter
    :return: list of ``[superchunk_matrix, num_recs, intermediate_name]``,
        one entry per ``args.sort_parallel``
    """
    ready_to_process = pipeline.join(
        upstream_tensors=input_gen,
        parallel=args.sort_process_parallel,
        capacity=4,  # multiplied by some factor?
        multi=True,
        name="ready_to_process")

    # need to unpack better here
    # each item: [ [base qual meta result], num_recs, first_ord, record_id ]
    parsed_columns = list(
        pipeline.agd_reader_multi_column_pipeline(
            upstream_tensorz=ready_to_process, buffer_pool=buf_pool))

    # flatten every item to [col0, col1, ..., num_recs] for batching
    batch_inputs = [
        list(chunks) + [num_recs]
        for chunks, num_recs, _first_ord, _record_id in parsed_columns
    ]
    ready = tf.train.batch_join(batch_inputs,
                                batch_size=args.column_grouping,
                                allow_smaller_final_batch=True,
                                name="chunk_batcher")

    name_queue = pipeline.join([name_generator("intermediate_file")],
                               parallel=args.sort_parallel,
                               capacity=4,
                               multi=False,
                               name="inter_file_gen_q")

    sorter = (persona_ops.agd_sort if args.order_by == location_value
              else persona_ops.agd_sort_metadata)

    sorters = []
    for position in range(args.sort_parallel):
        # the sort predicate column must be first
        predicate_column = ready[0]
        record_count = ready[-1]
        other_columns = tf.stack(ready[1:-1])
        superchunk_matrix, num_recs = sorter(buffer_pair_pool=bufpair_pool,
                                             results_handles=predicate_column,
                                             column_handles=other_columns,
                                             num_records=record_count,
                                             name="local_read_agd_sort")
        # super chunk is r, b, q, m
        sorters.append([superchunk_matrix, num_recs, name_queue[position]])
    return sorters
def make_inter_writers(self, batch, output_dir, write_parallelism):
    """Create buffer-pair writer ops for every column of each queued item.

    :param batch: upstream tensors for the ``writer_queue`` join
    :param output_dir: directory prefix; the key is
        ``output_dir/record_id.column``
    :param write_parallelism: ``parallel`` value for the join
    :return: list with one entry per joined item:
        ``[writer_col0, writer_col1, ..., record_id]``
    """
    queued = pipeline.join(batch,
                           parallel=write_parallelism,
                           capacity=4,
                           multi=True,
                           name="writer_queue")
    column_types = get_types_for_columns(self.inter_columns)

    # no uncompressed buffer pair writer yet
    def write_column(position, handle, num_recs, record_id):
        key = string_ops.string_join(
            [output_dir, "/", record_id, ".", self.inter_columns[position]],
            name="key_string")
        return persona_ops.agd_file_system_buffer_pair_writer(
            record_id=record_id,
            record_type=column_types[position],
            resource_handle=handle,
            path=key,
            first_ordinal=0,
            num_records=tf.to_int32(num_recs))

    all_writers = []
    for stacked, num_recs, record_id in queued:
        column_writes = [
            write_column(position, handle, num_recs, record_id)
            for position, handle in enumerate(tf.unstack(stacked))
        ]
        all_writers.append(column_writes + [record_id])
    return all_writers
def make_graph_impl(self, local_gate):
    """Assemble the read -> central pipeline -> write graph.

    :param local_gate: gate handed to the read stage
    :return: the single element of the ``written_records`` join. The original
        notes describe it as [ id_and_count, record_id, first_ordinal,
        num_records, key, namespace, written_records ]
    """
    with tf.name_scope("read_stage"):
        # materialised here so the ops are created inside the scope
        read_ops = tuple(self.make_read_stage(gate=local_gate))

    column_boundary = len(self.columns)

    def flatten(idc, comp):
        # id_and_count, then the entry at the boundary, then the columns
        return ((idc,) + tuple(comp[column_boundary]) +
                tuple(comp[:column_boundary]))

    # same as read ops, but flattened for ease of queueing
    read_ops_flattened = tuple(flatten(idc, comp) for idc, comp in read_ops)
    write_ready_inputs = self.make_central_pipeline(inputs=read_ops_flattened)

    with tf.name_scope("write_stage"):
        write_ops = self.make_write_stage(write_ready_inputs=write_ready_inputs)

    sink_name = "written_records"
    all_done = pipeline.join(upstream_tensors=write_ops,
                             parallel=1,
                             multi=True,
                             capacity=self.final_sink_capacity,
                             name=sink_name,
                             shared_name=sink_name)
    assert len(all_done) == 1
    return all_done[0]
def _make_writers(compressed_batch, output_dir, write_parallelism):
    """Yield one compressed ``.results`` writer op per joined item.

    :param compressed_batch: upstream tensors, each
        (buffer, num_recs, first_ordinal, record_id)
    :param output_dir: directory prefix; the key is
        ``output_dir/record_id_firstordinal.results``
    :param write_parallelism: ``parallel`` value for the join
    :return: generator of writer op outputs (the full file path key)
    """
    queued = pipeline.join(compressed_batch,
                           parallel=write_parallelism,
                           capacity=8,
                           multi=True)
    for handle, num_recs, first_ordinal, record_id in queued:
        ordinal_text = string_ops.as_string(first_ordinal,
                                            name="first_ord_as_string")
        key_parts = [
            output_dir, "/", record_id, "_", ordinal_text, ".results"
        ]
        path_key = string_ops.string_join(key_parts, name="base_key_string")
        # writes out the file path key (full path)
        yield persona_ops.agd_file_system_buffer_writer(
            record_id=record_id,
            record_type="structured",
            resource_handle=handle,
            path=path_key,
            compressed=True,
            first_ordinal=first_ordinal,
            num_records=tf.to_int32(num_recs))
def writer_pipeline(compressors, write_parallelism, record_id, output_dir,
                    suffix, args):
    """Yield compressed writer ops for the base and metadata columns.

    Keys are built as ``output_dir + "<record_id>_" + first_ordinal + ext``
    with no separator added after ``output_dir``.

    :param compressors: upstream tensors, each (base, meta, first_ordinal, num_recs)
    :param write_parallelism: ``parallel`` value for the join
    :param record_id: used both for the key prefix and the writer ops
    :param output_dir: key prefix
    :param suffix: extension (without dot) for the base column file
    :param args: ``args.protein`` selects record type "text" over "base_compact"
    :return: generator of (base_path, meta_path, first_ordinal, num_recs)
    """
    prefix_name = tf.constant("{}_".format(record_id), name="prefix_string")
    queued = pipeline.join(compressors,
                           parallel=write_parallelism,
                           capacity=8,
                           multi=True,
                           name="write_input")

    def write_column(handle, key, record_type, first_ordinal, num_recs):
        return persona_ops.agd_file_system_buffer_writer(
            record_id=record_id,
            record_type=record_type,
            resource_handle=handle,
            path=key,
            compressed=True,
            first_ordinal=first_ordinal,
            num_records=tf.to_int32(num_recs))

    for base, meta, first_ordinal, num_recs in queued:
        ordinal_text = string_ops.as_string(first_ordinal,
                                            name="first_ord_as_string")
        base_key = string_ops.string_join(
            [output_dir, prefix_name, ordinal_text, ".", suffix],
            name="base_key_string")
        meta_key = string_ops.string_join(
            [output_dir, prefix_name, ordinal_text, ".metadata"],
            name="metadata_key_string")

        base_type = "text" if args.protein else "base_compact"
        base_path = write_column(base, base_key, base_type, first_ordinal,
                                 num_recs)
        meta_path = write_column(meta, meta_key, "text", first_ordinal,
                                 num_recs)
        yield base_path, meta_path, first_ordinal, num_recs
def writer_pipeline(compressors, write_parallelism, record_id, output_dir):
    """Yield compressed writer ops for each unstacked column of a chunk.

    Column ``i`` is written with record type ``types[i]`` and extension
    ``exts[i]`` from the fixed tables below (base, qual, metadata, results).

    :param compressors: upstream tensors, each
        (stacked_chunk, first_ordinal, num_recs)
    :param write_parallelism: ``parallel`` value for the join
    :param record_id: used both for the key prefix and the writer ops
    :param output_dir: key prefix (no separator is added)
    :return: generator of ``[path_0, ..., path_n, first_ordinal, num_recs]``
    """
    prefix_name = tf.constant("{}_".format(record_id), name="prefix_string")
    queued = pipeline.join(compressors,
                           parallel=write_parallelism,
                           capacity=8,
                           multi=True,
                           name="write_input")
    types = ['base_compact', 'text', 'text', 'structured']
    exts = ['.base', '.qual', '.metadata', '.results']

    def write_column(position, handle, ordinal_text, first_ordinal, num_recs):
        key = string_ops.string_join(
            [output_dir, prefix_name, ordinal_text, exts[position]],
            name="key_string")
        return persona_ops.agd_file_system_buffer_writer(
            record_id=record_id,
            record_type=types[position],
            resource_handle=handle,
            path=key,
            compressed=True,
            first_ordinal=first_ordinal,
            num_records=tf.to_int32(num_recs))

    for stacked, first_ordinal, num_recs in queued:
        columns = tf.unstack(stacked)
        ordinal_text = string_ops.as_string(first_ordinal,
                                            name="first_ord_as_string")
        paths = [
            write_column(position, handle, ordinal_text, first_ordinal,
                         num_recs)
            for position, handle in enumerate(columns)
        ]
        yield paths + [first_ordinal, num_recs]
def make_chain(idc, comp, chain_id):
    """Push (idc, comp) through a chain of single-slot joins.

    Each link adds ``increment_constant`` to ``comp`` and re-queues the pair.
    ``self`` and ``increment_constant`` are free variables taken from the
    enclosing scope.

    :param idc: first tensor of the pair
    :param comp: second tensor of the pair; incremented at each link
    :param chain_id: used only to build unique join names
    :return: the (idc, comp) pair produced by the last join
    """
    for link in range(self.queue_chain_length):
        comp = comp + increment_constant
        join_name = "chain_{cid}_join_{idx}".format(idx=link, cid=chain_id)
        joined = pipeline.join(upstream_tensors=(idc, comp),
                               capacity=2,
                               parallel=1,
                               name=join_name)
        idc, comp = joined[0]
    return idc, comp
def even_simpler(self, upstream_gate):
    """Dequeue one item from a local gate and pass it through a single join.

    :param upstream_gate: gate the local gate is built on
    :return: the first element of the ``chain_0_join`` join output
    """
    stage_gate = self.make_local_gate(upstream_gate=upstream_gate)
    idc, comp = stage_gate.dequeue()
    join_name = "chain_{cid}_join".format(cid=0)
    return pipeline.join(upstream_tensors=(idc, comp),
                         capacity=1,
                         parallel=1,
                         name=join_name,
                         shared_name=join_name)[0]
def agd_mark_duplicates_local(in_queue, outdir=None, parallel_parse=1,
                              parallel_write=1, parallel_compress=1):
    """Read the results column, mark duplicates, compress and write it back.

    :param in_queue: queue of chunk keys; dequeued ``parallel_parse`` times
    :param outdir: output directory handed to the writers
    :param parallel_parse: number of parallel key dequeues
    :param parallel_write: parallelism for the pre-compress join and the writers
    :param parallel_compress: parallelism for the compress pipeline
    :return: join output over the written file keys
    """
    keys = tuple(in_queue.dequeue() for _ in range(parallel_parse))
    raw_chunks = [
        list(chunk)
        for chunk in pipeline.local_read_pipeline(upstream_tensors=keys,
                                                  columns=['results'])
    ]
    parsed_items = list(
        pipeline.agd_reader_multi_column_pipeline(upstream_tensorz=raw_chunks))

    # result_buf, num_recs, first_ord, record_id
    parsed_result = pipeline.join(parsed_items,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]
    print(parsed_result)

    stacked_results, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(stacked_results)[0]
    print(result_buf)

    output_pool = persona_ops.buffer_pair_pool(size=0,
                                               bound=False,
                                               name="output_buffer_pair_pool")
    marked = persona_ops.agd_mark_duplicates(results_handle=result_buf,
                                             num_records=num_results,
                                             buffer_pair_pool=output_pool,
                                             name="markdupsop")

    to_write = pipeline.join([marked, num_results, first_ord, record_id],
                             parallel=parallel_write,
                             capacity=8,
                             multi=False)
    compressed = compress_pipeline(to_write, parallel_compress)
    written_keys = list(
        _make_writers(compressed_batch=list(compressed),
                      output_dir=outdir,
                      write_parallelism=parallel_write))
    return pipeline.join(written_keys, parallel=1, capacity=8, multi=False)
def make_graph(self, in_queue, args):
    """Build a graph that adds ``args.increment`` to each dequeued value.

    :param in_queue: queue providing the input values
    :param args: only ``args.increment`` is read
    :return: ``((join_output,), [])`` -- service ops and (empty) init ops
    """
    step = tf.constant(args.increment, dtype=tf.int64)
    dequeued = tf.to_int64(in_queue.dequeue())
    incremented = dequeued + step
    ready_to_process = pipeline.join(upstream_tensors=(incremented, ),
                                     parallel=1,
                                     capacity=1,
                                     multi=False,
                                     name="ready_to_process")
    return (ready_to_process, ), []
def make_central_pipeline(self, read_columns, head_gate):
    """
    Wire (optional) index building -> pre-merge gate -> merge -> compress ->
    pre-write queue.

    :param read_columns: a generator of (id_and_count, ([ list, of, file, mmap, handles, ... ], {pass around}))
    :param head_gate: gate used as the upstream end of the credit link to the pre-merge gate
    :return: a generator of (id_and_count, (chunk_matrix, record_id, {pass around})
    """
    read_columns = sanitize_generator(read_columns)
    # the index-building stage is only inserted when ordering by location
    if self.order_by == location_value:
        read_columns = sanitize_generator(
            self.make_index_building_stage(read_columns=read_columns))

    # a gen of (id_and_count, components)
    # components = ([ handles, columns ])
    queue_name = "pre_merge_barrier_gate"
    # the first item is used as the template for the gate's tensors
    example_idc, example_comp = read_columns[0]
    pre_merge_gate = gate.StreamingGate(
        name=queue_name,
        shared_name=queue_name,
        id_and_count_upstream=example_idc,
        sample_tensors=example_comp,
        capacity=self.pre_merge_gate_capacity,
        limit_upstream=True,
        limit_downstream=False)
    gate.add_credit_supplier_from_gates(upstream_gate=head_gate,
                                        downstream_gate=pre_merge_gate)

    enqueue_ops = tuple(
        pre_merge_gate.enqueue(id_and_count=idc, components=comp)
        for idc, comp in read_columns)
    gate.add_gate_runner(gate_runner=gate.GateRunner(
        gate=pre_merge_gate, enqueue_ops=enqueue_ops))

    # NOTE(review): this is a lazy generator, so the dequeue ops are only
    # created when make_merge_stage iterates it -- presumably inside the
    # "merge_merge_stage" name scope below. Do not materialise it here
    # without checking the effect on op names.
    to_merge = (pre_merge_gate.dequeue_whole_dataset()
                for _ in range(self.merge_parallel))

    with tf.name_scope("merge_merge_stage"):
        to_compress = tuple(self.make_merge_stage(merge_batches=to_merge))

    with tf.name_scope("merge_compress_stage"):
        to_write_items = tuple(
            self.make_compress_stage(
                to_compress=to_compress))  # returns a generator

    # each compress-stage item is indexed as (tensors_to_queue, control_dependency)
    control_deps = tuple(a[1] for a in to_write_items)
    to_write_items = tuple(a[0] for a in to_write_items)

    queue_name = "merge_pre_write_queue"
    to_write = pipeline.join(upstream_tensors=to_write_items,
                             control_dependencies=control_deps,
                             parallel=self.write_parallel,
                             capacity=self.pre_write_capacity,
                             multi=True,
                             name=queue_name,
                             shared_name=queue_name)
    return to_write
def writer_pipeline(compressors, write_parallelism, record_id, output_dir,
                    compressed):
    """Yield writer ops for the base, qual and metadata columns.

    When ``compressed`` is truthy the buffer writer is used (with
    ``compressed`` bound) and the first three entries of every upstream item
    are replaced by their ``.compressed_buffer`` attribute; otherwise the
    buffer-pair writer is used on the items unchanged.

    :param compressors: upstream items, each (base, qual, meta, first_ordinal, num_recs)
    :param write_parallelism: ``parallel`` value for the join
    :param record_id: used both for the key prefix and the writer ops
    :param output_dir: key prefix (no separator is added)
    :param compressed: selects the writer op as described above
    :return: generator of (base_path, qual_path, meta_path, first_ordinal, num_recs)
    """
    prefix_name = tf.constant("{}_".format(record_id), name="prefix_string")

    if not compressed:
        write_op = persona_ops.agd_file_system_buffer_pair_writer
        write_inputs = compressors
    else:
        write_op = partial(persona_ops.agd_file_system_buffer_writer,
                           compressed=compressed)
        write_inputs = [
            [column.compressed_buffer for column in item[:3]] + list(item[3:])
            for item in compressors
        ]

    queued = pipeline.join(write_inputs,
                           parallel=write_parallelism,
                           capacity=8,
                           multi=True,
                           name="write_input")

    # (file extension, key op name, record type) in base/qual/meta order
    column_specs = (
        (".base", "base_key_string", "base_compact"),
        (".qual", "qual_key_string", "text"),
        (".metadata", "metadata_key_string", "text"),
    )

    for base, qual, meta, first_ordinal, num_recs in queued:
        ordinal_text = string_ops.as_string(first_ordinal,
                                            name="first_ord_as_string")
        # all keys first, then all writers, as in the original op order
        keys = [
            string_ops.string_join(
                [output_dir, prefix_name, ordinal_text, extension],
                name=key_name)
            for extension, key_name, _ in column_specs
        ]
        paths = [
            write_op(record_id=record_id,
                     record_type=spec[2],
                     resource_handle=handle,
                     path=key,
                     first_ordinal=first_ordinal,
                     num_records=tf.to_int32(num_recs))
            for handle, key, spec in zip((base, qual, meta), keys,
                                         column_specs)
        ]
        yield tuple(paths) + (first_ordinal, num_recs)
def import_sga_local(in_queue, argsj, outdir=None, parallel_parse=1,
                     feature="NFAT", path="."):
    """Build the graph that feeds the results column to the ``import_sga`` op.

    :param in_queue: queue of chunk keys; dequeued ``parallel_parse`` times
    :param argsj: object whose ``dataset`` attribute is the manifest mapping;
        it must contain ``'reference'`` and ``'reference_contigs'``
    :param outdir: accepted for interface compatibility; not used here
    :param parallel_parse: number of parallel key dequeues
    :param feature: forwarded to the ``import_sga`` op
    :param path: forwarded to the ``import_sga`` op
    :return: the output of the ``import_sga`` op
    :raises Exception: if the manifest has no ``'reference'`` entry
    """
    manifest = argsj.dataset
    if 'reference' not in manifest:
        # The parameter is named `argsj`; formatting `args.dataset` here used
        # to raise NameError instead of this message.
        raise Exception(
            "No reference data in manifest {}. Unaligned BAM not yet supported. Please align dataset first."
            .format(manifest))

    ref_lens = []
    ref_seqs = []
    for contig in manifest['reference_contigs']:
        ref_lens.append(contig['length'])
        ref_seqs.append(contig['name'])

    parallel_key_dequeue = tuple(in_queue.dequeue()
                                 for _ in range(parallel_parse))
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=parallel_key_dequeue, columns=['results'])
    result_chunk_list = [list(c) for c in result_chunks]

    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=result_chunk_list)
    parsed_results_list = list(parsed_results)
    parsed_result = pipeline.join(parsed_results_list,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    # only the buffer and the record count are consumed below
    result_buf, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(result_buf)[0]

    result = persona_ops.import_sga(results_handle=result_buf,
                                    num_records=num_results,
                                    ref_sequences=ref_seqs,
                                    ref_seq_sizes=ref_lens,
                                    feature=feature,
                                    path=path,
                                    name="importsgaop")
    return result
def compress_pipeline(converters, compress_parallelism):
    """Yield compressed base and metadata buffers for each queued item.

    :param converters: upstream tensors, each (base, meta, first_ord, num_recs)
    :param compress_parallelism: ``parallel`` value for the join
    :return: generator of (base_buf, meta_buf, first_ord, num_recs)
    """
    queued = pipeline.join(converters,
                           parallel=compress_parallelism,
                           capacity=8,
                           multi=True,
                           name="compress_input")
    pool = persona_ops.buffer_pool(size=0, bound=False, name="bufpool")

    def compress(buffer_pair):
        # all compressor ops draw from the same pool
        return persona_ops.buffer_pair_compressor(buffer_pool=pool,
                                                  buffer_pair=buffer_pair)

    for base, meta, first_ord, num_recs in queued:
        base_buf = compress(base)
        meta_buf = compress(meta)
        yield base_buf, meta_buf, first_ord, num_recs
def make_index_building_stage(self, read_columns):
    """Queue the read columns and prepend a results index to each item.

    :param read_columns: iterable of (id_and_count, components)
    :return: generator of (id_and_count, (results_index,) + components)
    """
    stage_name = "index_building_queue"
    # flatten (idc, components) so each item can be queued as one tuple
    flattened = tuple((idc, ) + tuple(comp) for idc, comp in read_columns)
    queued = pipeline.join(upstream_tensors=flattened,
                           parallel=self.index_parallel,
                           capacity=self.index_capacity,
                           multi=True,
                           name=stage_name,
                           shared_name=stage_name)
    index_pool = persona_ops.results_index_pool(bound=False, size=0)

    for item in queued:
        idc, components = item[0], item[1:]
        # first column of chunk matrix
        results_index = persona_ops.results_index_creator(
            index_pool=index_pool, column=components[0][0])
        yield idc, (results_index, ) + tuple(components)
def compress_pipeline(converters, compress_parallelism):
    """Yield a re-stacked tensor of compressed columns for each queued chunk.

    :param converters: upstream tensors, each (stacked_chunk, first_ord, num_recs)
    :param compress_parallelism: ``parallel`` value for the join
    :return: generator of (stacked_compressed_columns, first_ord, num_recs)
    """
    queued = pipeline.join(converters,
                           parallel=compress_parallelism,
                           capacity=8,
                           multi=True,
                           name="compress_input")
    pool = persona_ops.buffer_pool(size=0, bound=False, name="bufpool")

    for stacked, first_ord, num_recs in queued:
        compressed_columns = [
            persona_ops.buffer_pair_compressor(buffer_pool=pool,
                                               buffer_pair=column)
            for column in tf.unstack(stacked)
        ]
        yield tf.stack(compressed_columns), first_ord, num_recs
def make_graph_impl(self, local_gate):
    """Assemble the merge graph: read -> central pipeline -> write -> sink.

    :param local_gate: gate handed to the read stage and used as head gate
    :return: the ``merge_completed_items_queue`` join output. The original
        notes describe each item as (id_and_count, record_id, first_ordinal,
        num_records, file_basename) + (list, of, full, file, paths)
    """
    with tf.name_scope("merge_read"):
        read_items = self.make_read_stage(local_gate=local_gate)

    with tf.name_scope("merge"):
        write_ready = self.make_central_pipeline(read_columns=read_items,
                                                 head_gate=local_gate)

    with tf.name_scope("merge_write"):
        completed = self.make_write_stage(ready_to_write_items=write_ready)

    sink_name = "merge_completed_items_queue"
    return pipeline.join(upstream_tensors=completed,
                         parallel=self.sink_parallel,
                         capacity=self.final_capacity,
                         multi=True,
                         name=sink_name,
                         shared_name=sink_name)
def make_graph_impl(self, local_gate):
    """Assemble the sort graph: read -> central pipeline -> write -> sink.

    :param local_gate: gate handed to the read stage and used as head gate
    :return: tuple of sink items, each with its last element dropped
    """
    with tf.name_scope("read_stage"):
        read_items = self.make_read_stage(gate=local_gate)

    write_ready = self.make_central_pipeline(inputs=read_items,
                                             local_head_gate=local_gate)

    with tf.name_scope("write_stage"):
        written = self.make_write_stage(write_ready_inputs=write_ready)

    sink_name = "completed"
    sink_items = pipeline.join(upstream_tensors=written,
                               parallel=self.sink_parallel,
                               multi=True,
                               capacity=self.final_sink_capacity,
                               name=sink_name,
                               shared_name=sink_name)
    # :-1 to leave off the file records that aren't needed
    return tuple(item[:-1] for item in sink_items)
def make_writers(self, args, compressed_bufs):
    """Queue the compressed buffers and hand them to the local write pipeline.

    :param args: only ``args.order_by`` is read (to pick the record types)
    :param compressed_bufs: upstream tensors; per the original notes each is
        [compressed_matrix, record_name, first_ord, num_recs, file_name]
    :return: the output of ``pipeline.local_write_pipeline``
    """
    # add parallelism here if necessary to saturate write bandwidth
    queued = pipeline.join(compressed_bufs,
                           capacity=4,
                           multi=True,
                           parallel=1,
                           name="final_write_queue")
    to_write = queued[0]

    # upstream_tensors: a list of tensor tuples of type:
    # buffer_list_handle, record_id, first_ordinal, num_records, file_path
    record_types = get_record_types_for_columns(args.order_by,
                                                self.inter_columns)
    return pipeline.local_write_pipeline(upstream_tensors=[to_write],
                                         compressed=True,
                                         record_types=record_types,
                                         name="local_write_pipeline")
def agd_flagstat_local(in_queue, outdir=None, parallel_parse=1,
                       parallel_write=1, parallel_compress=1):
    """Build the graph that feeds the results column to the flagstat op.

    :param in_queue: queue of chunk keys; dequeued ``parallel_parse`` times
    :param outdir: accepted for interface compatibility; not used here
    :param parallel_parse: number of parallel key dequeues
    :param parallel_write: accepted for interface compatibility; not used here
    :param parallel_compress: accepted for interface compatibility; not used here
    :return: the output of the ``agd_flagstat`` op
    """
    keys = tuple(in_queue.dequeue() for _ in range(parallel_parse))
    raw_chunks = [
        list(chunk)
        for chunk in pipeline.local_read_pipeline(upstream_tensors=keys,
                                                  columns=['results'])
    ]
    parsed_items = list(
        pipeline.agd_reader_multi_column_pipeline(upstream_tensorz=raw_chunks))
    parsed_result = pipeline.join(parsed_items,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    # only the buffer and the record count are consumed below
    stacked_results, num_results, first_ord, record_id = parsed_result
    result_buf = tf.unstack(stacked_results)[0]

    return persona_ops.agd_flagstat(results_handle=result_buf,
                                    num_records=num_results,
                                    name="flagstat")
def _make_graph(self, upstream_gate):
    """Create the parallel Ceph remove ops and join their results.

    :param upstream_gate: gate from which batches of (keys, namespaces) are
        dequeued, ``self.global_batch`` items at a time
    :return: the ``nullceph_final`` join output over
        (id_and_count, num_items_deleted) pairs
    """
    remove = persona_ops.ceph_remove
    # connection settings shared by every remove op
    ceph_settings = dict(cluster_name=self.ceph_cluster_name,
                         user_name=self.ceph_user_name,
                         pool_name=self.ceph_pool_name,
                         columns=self.columns,
                         ceph_conf_path=str(self.ceph_conf_path))

    delete_items = []
    for _ in range(self.delete_parallel):
        id_and_count, components = upstream_gate.dequeue_many(
            count=self.global_batch)
        keys, namespaces = components
        deleted = remove(keys=keys, namespaces=namespaces, **ceph_settings)
        delete_items.append((id_and_count, deleted))
    items = tuple(delete_items)

    sink_name = "nullceph_final"
    return pipeline.join(upstream_tensors=items,
                         parallel=self.sink_parallel,
                         capacity=self.delete_parallel + 1,
                         multi=True,
                         name=sink_name,
                         shared_name=sink_name)
def make_graph_impl(self, local_gate):
    """Assemble the read -> central pipeline -> write graph.

    :param local_gate: gate handed to the read stage
    :return: the single element of the ``written_records`` join. The original
        notes describe it as [ id_and_count, record_id, first_ordinal,
        num_records, file_basename, written_records ]
    """
    with tf.name_scope("read_stage"):
        # read ops: [ id_and_count, [ filename ], [ a list of handles in the
        # order of the columns, NOT STACKED ] ]
        read_ops = tuple(self.make_read_stage(gate=local_gate))

    def flatten(item):
        # id_and_count first, then everything from index 2 on, then item[1]
        return (item[0],) + tuple(item[2:]) + tuple(item[1])

    # same as read ops, but flattened for ease of queueing
    read_ops_flattened = tuple(flatten(item) for item in read_ops)
    write_ready_inputs = self.make_central_pipeline(inputs=read_ops_flattened)

    with tf.name_scope("write_stage"):
        write_ops = self.make_write_stage(write_ready_inputs=write_ready_inputs)

    sink_name = "written_records"
    all_done = pipeline.join(upstream_tensors=write_ops,
                             parallel=1,
                             multi=True,
                             capacity=self.final_sink_capacity,
                             name=sink_name,
                             shared_name=sink_name)
    assert len(all_done) == 1
    return all_done[0]
def export_bam(in_queue, args):
    """Build the graph that exports an aligned AGD dataset to a BAM file.

    :param in_queue: queue of chunk keys; one key is dequeued per run
    :param args: reads ``dataset`` (the manifest mapping), ``parallel_parse``,
        ``output_path`` and ``threads``
    :return: ``([agd_to_bam], [])`` -- service ops and (empty) init ops
    :raises Exception: if the manifest has no ``'reference'`` entry
    """
    manifest = args.dataset

    if 'reference' not in manifest:
        raise Exception(
            "No reference data in manifest {}. Unaligned BAM not yet supported. Please align dataset first."
            .format(args.dataset))

    # Any manifest column containing 'secondary' is appended after the four
    # fixed columns.
    columns = ["base", "qual", "metadata", "results"]
    num_secondary = 0
    for column in manifest['columns']:
        if 'secondary' in column:
            columns.append(column)
            # was `secondary += 1`: an unbound local that crashed on the
            # first secondary column
            num_secondary += 1

    print("BAM output using columns: {}".format(columns))

    # TODO provide option for reading from Ceph
    result_chunks = pipeline.local_read_pipeline(
        upstream_tensors=[in_queue.dequeue()], columns=columns)
    result_chunk_list = [list(c) for c in result_chunks]

    to_parse = pipeline.join(upstream_tensors=result_chunk_list,
                             parallel=args.parallel_parse,
                             multi=True,
                             capacity=8)
    parsed_results = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=to_parse)
    parsed_results_list = list(parsed_results)
    parsed_result = pipeline.join(parsed_results_list,
                                  parallel=1,
                                  capacity=8,
                                  multi=True)[0]

    # base, qual, meta, result, [secondary], num_recs, first_ord, record_id
    handles = parsed_result[0]
    bases = handles[0]
    quals = handles[1]
    meta = handles[2]
    # give a matrix of all the result columns
    results = tf.stack(handles[3:])
    num_recs = parsed_result[1]
    first_ord = parsed_result[2]

    # an empty output path falls back to "<dataset name>.bam"
    if args.output_path == "":
        output_path = manifest['name'] + ".bam"
    else:
        output_path = args.output_path

    ref_lens = []
    ref_seqs = []
    for contig in manifest['reference_contigs']:
        ref_lens.append(contig['length'])
        ref_seqs.append(contig['name'])

    sort = manifest['sort'] if 'sort' in manifest else 'unsorted'
    pg_id = "personaAGD"  # TODO get from manifest
    read_group = manifest['name']

    agd_to_bam = persona_ops.agd_output_bam(results_handle=results,
                                            bases_handle=bases,
                                            qualities_handle=quals,
                                            metadata_handle=meta,
                                            num_records=num_recs,
                                            path=output_path,
                                            ref_sequences=ref_seqs,
                                            ref_seq_sizes=ref_lens,
                                            pg_id=pg_id,
                                            read_group=read_group,
                                            sort_order=sort,
                                            num_threads=args.threads)
    return [agd_to_bam], []
def make_central_pipeline(self, inputs):
    """
    Make the central pipeline between the custom read and write operations

    Stages, in order: decompress -> align -> compress, each fed by its own
    join queue, followed by a final "ready_to_write" join. When
    ``self.log_goodput`` is set, one event-logging op per item is attached to
    that final join as a control dependency.

    NOTE(review): the source formatting was lost; which statements sit inside
    each ``with`` block below was reconstructed -- confirm against history.

    :param inputs: a generator of type (id_and_count, column0, column1, ..., [:rest of input]).
    The number of columns is assumed to be the same and in the same order as self.columns
    :return: a generator of [ compressed_results_column_matrix, num_records, first_ordinal, record_id, id_and_count, {rest of input} ]
    """
    # materialise one-shot iterables before handing them to the join
    if not isinstance(inputs, (list, tuple)):
        inputs = tuple(inputs)

    # type of each of these: (id_and_count, column0, column1, ..., [:rest of input])
    queue_name = "align_ready_to_decomp"
    ready_to_decomp = pipeline.join(upstream_tensors=inputs,
                                    parallel=self.decompress_parallel,
                                    capacity=self.pre_decomp_capacity, multi=True,
                                    name=queue_name, shared_name=queue_name)
    with tf.name_scope("decompression_stage"):
        ready_to_align_items = self.make_decomp_stage(ready_to_decomp=ready_to_decomp)

    queue_name = "ready_to_align"
    ready_to_align = pipeline.join(upstream_tensors=ready_to_align_items,
                                   parallel=self.align_parallel,
                                   capacity=self.pre_align_capacity, multi=True,
                                   name=queue_name, shared_name=queue_name)
    with tf.name_scope("align_stage"):
        ready_to_compress_items = self.make_align_stage(ready_to_align=ready_to_align)

    queue_name = "align_ready_to_compress"
    ready_to_compress = pipeline.join(upstream_tensors=ready_to_compress_items,
                                      parallel=self.compress_parallel,
                                      capacity=self.pre_compress_capacity, multi=True,
                                      name=queue_name, shared_name=queue_name)
    with tf.name_scope("compress_stage"):
        # tuple: the items are iterated twice (logging ops and the final join)
        ready_to_write_items = tuple(self.make_compress_stage(ready_to_compress=ready_to_compress))

    def gen_control_deps():
        # yields one 1-tuple holding a log op per item about to be written
        for item in ready_to_write_items:
            num_records, ordinal, record_id = item[1:4]
            item_id = slice_id(item[4])
            # the timestamp op is created under a control dependency on item_id
            with tf.control_dependencies((item_id,)):
                ts = gate.unix_timestamp(name="align_tail_timestamp")
            # NOTE(review): assumed to sit outside the control_dependencies
            # block above -- the original nesting is not recoverable here
            yield (gate.log_events(
                item_names=("id", "time", "ordinal", "record_id", "num_records"),
                directory=self.log_directory,
                event_name="align_tail",
                name="align_tail_event_logger",
                components=(item_id, ts, ordinal, record_id, num_records)
            ),)

    control_deps = []
    if self.log_goodput:
        control_deps.extend(gen_control_deps())

    queue_name = "ready_to_write"
    ready_to_write = pipeline.join(upstream_tensors=ready_to_write_items,
                                   control_dependencies=control_deps,
                                   parallel=self.write_parallel,
                                   capacity=self.pre_write_capacity, multi=True,
                                   name=queue_name, shared_name=queue_name)
    return ready_to_write
def execute(args, modules):
    """Run one service of the selected module in a local TensorFlow session.

    Builds the service graph, joins its outputs into a single sink queue,
    runs that sink until the coordinator stops or the input runs out, then
    hands the collected results to ``service.on_finish``. Optionally records
    usage statistics to a JSON file.

    NOTE(review): the source formatting was lost; block nesting below (what
    runs inside the session / ExitStack) was reconstructed -- confirm.

    :param args: parsed arguments; reads ``record``, ``record_directory``,
        ``command`` and, when present, ``service`` and ``summary``
    :param modules: mapping from command name to module object
    """
    record_stats = args.record
    stats_directory = args.record_directory

    module = modules[args.command]

    if hasattr(args, 'service'):
        service_mode = args.service
        service = module.lookup_service(name=service_mode)
    else:
        # there is only one service if the args does not have .service
        service = module.get_services()[0]

    # the run arguments become a single-epoch, unshuffled input queue
    run_arguments = tuple(service.extract_run_args(args=args))
    in_queue = tf.train.input_producer(input_tensor=run_arguments, num_epochs=1, shuffle=False, capacity=len(run_arguments))

    # TODO currently we assume all the service_ops are the same
    service_ops, service_init_ops = service.make_graph(in_queue=in_queue,
                                                       args=args)
    if not isinstance(service_ops, list):
        service_ops = list(service_ops)
    assert len(service_ops) + len(service_init_ops) > 0

    has_service_ops = len(service_ops) > 0
    if has_service_ops:
        service_sink = pipeline.join(upstream_tensors=service_ops, capacity=64, parallel=1, multi=True, name="global_sink_queue")

    init_ops = [tf.global_variables_initializer(), tf.local_variables_initializer()]

    # service graph may have summary nodes
    summary = args.summary if hasattr(args, 'summary') else False

    results = []
    stats_results = {}
    with tf.Session() as sess:
        if summary and has_service_ops:
            trace_dir = setup_output_dir(dirname=args.command + "_summary")
            # the merged summary rides along as the last fetched element
            service_sink.append(tf.summary.merge_all())
            summary_writer = tf.summary.FileWriter(trace_dir, graph=sess.graph, max_queue=2**20, flush_secs=10**4)
        else:
            # summaries are only produced when there is a sink to attach to
            summary = False

        count = 0
        sess.run(init_ops)
        if len(service_init_ops) > 0:
            res = sess.run(service_init_ops)
            # NOTE(review): the merged summary is appended to service_sink,
            # not to service_init_ops, so this [:-1] looks like it drops a
            # real init result -- confirm intent
            if summary:
                results.append(res[:-1])
            else:
                results.append(res)
            #sess.run(service_init_ops)

        # its possible the service is a simple run once
        if len(service_ops) > 0:
            with contextlib.ExitStack() as stack:
                # the usage recorder is only active while the service runs
                if record_stats:
                    stack.enter_context(recorder.UsageRecorder(stats_results))
                coord = tf.train.Coordinator()
                print("Local executor starting {} ...".format(args.command))
                threads = tf.train.start_queue_runners(coord=coord, sess=sess)
                while not coord.should_stop():
                    try:
                        #print("Running round {}".format(count))
                        result = sess.run(service_sink)
                        count += 1
                        if summary:
                            # last element is the merged summary, not a result
                            results.append(result[:-1])
                            summary_writer.add_summary(result[-1], global_step=count)
                        else:
                            results.append(result)
                    except tf.errors.OutOfRangeError:
                        # raised once the single-epoch input queue is exhausted
                        #print('Got out of range error!')
                        break
                print("Local executor finishing ...")
                coord.request_stop()
                coord.join(threads, stop_grace_period_secs=10)

        service.on_finish(args, results)
        if summary:
            summary_writer.flush(); summary_writer.close()

    if record_stats:
        # vars() returns the namespace's own dict, so this removes "func"
        # from args itself; it raises KeyError when args has no "func"
        params = vars(args)
        del params["func"]
        stats_results["params"] = vars(args)
        with open(create_unique_file(directory=stats_directory, prefix="runtime_stats", suffix=".json"), 'w+') as fl:
            json.dump(stats_results, fl)
def execute(args, modules):
    """Run one task of a service under the distributed runtime.

    The service comes from ``modules[args.dist_command]``: looked up by
    ``args.service`` when that attribute exists, otherwise the first entry
    of ``module.get_services()``. The service graph is built on this task's
    device, its outputs are joined into a sink, and the sink is enqueued
    into the shared output queue by a queue runner. A local TensorFlow
    server is then started and the function waits on the coordinator until
    a stop is signalled.

    :param args: parsed argument namespace. Reads ``dist_command``,
        ``task_index``, ``queue_index`` and ``cluster_members``, plus
        ``service`` when it is present.
    :param modules: mapping from command name to the module that provides
        the services for that command.
    :raises Exception: if the selected service reports that it has no
        distributed capability.
    """
    module = modules[args.dist_command]
    if hasattr(args, 'service'):
        service = module.lookup_service(name=args.service)
    else:
        # there is only one service if the args does not have .service
        service = module.get_services()[0]

    if not service.distributed_capability():
        # args.service is optional (see above); fall back to the service's
        # short name so the intended error is raised, not an AttributeError.
        if hasattr(args, 'service'):
            service_label = args.service
        else:
            service_label = service.get_shortname()
        raise Exception(
            "Service {} does not support distributed execution".format(
                service_label))

    task_index = args.task_index
    queue_index = args.queue_index
    cluster_spec = dist_common.make_cluster_spec(
        cluster_members=args.cluster_members)
    for idx in (task_index, queue_index):
        # this checks if the task index is in cluster_def
        # will throw an exception if not found
        cluster_spec.task_address(job_name=cluster_name, task_index=idx)

    input_dtypes = service.input_dtypes(args=args)
    input_shapes = service.input_shapes(args=args)
    output_dtypes = service.output_dtypes(args=args)
    output_shapes = service.output_shapes(args=args)
    service_name = args.dist_command + "_" + service.get_shortname()

    in_queue, out_queue = dist_common.make_common_queues(
        service_name=service_name,
        queue_index=queue_index,
        cluster_name=cluster_name,
        input_dtypes=input_dtypes,
        input_shapes=input_shapes,
        output_dtypes=output_dtypes,
        output_shapes=output_shapes)

    # Everything in this scope is placed on this task's own device.
    with tf.device("/job:{cluster_name}/task:{task_idx}".format(
            cluster_name=cluster_name, task_idx=task_index)):
        service_ops, service_init_ops = service.make_graph(in_queue=in_queue,
                                                           args=args)
        service_ops = tuple(service_ops)
        assert len(service_ops) + len(service_init_ops) > 0

        init_ops = [tf.global_variables_initializer(),
                    tf.local_variables_initializer()]

        # TODO should a final join (if necessary) be moved into the service itself?
        service_sink = pipeline.join(upstream_tensors=service_ops,
                                     capacity=32,
                                     parallel=1,
                                     multi=True,
                                     name="sink_join")[0]

    # The enqueue into the shared output queue is placed on the queue's device.
    queue_device = dist_common.make_queue_device_name(
        cluster_name=cluster_name, queue_index=queue_index)
    with tf.device(queue_device):
        final_op = out_queue.enqueue(
            service_sink,
            name="final_queue_enqueue_task_{}".format(task_index))
    tf.train.add_queue_runner(
        qr=tf.train.QueueRunner(queue=out_queue, enqueue_ops=(final_op,)))

    # start our local server
    server = tf.train.Server(cluster_spec,
                             config=None,
                             job_name=cluster_name,
                             task_index=task_index)
    log.debug("Persona distributed runtime starting TF server for index {}".format(task_index))

    with tf.Session(server.target) as sess:
        sess.run(init_ops)
        if len(service_init_ops) > 0:
            sess.run(service_init_ops)

        # its possible the service is a simple run once
        if len(service_ops) > 0:
            coord = tf.train.Coordinator()
            # Poll until no variable is reported as uninitialized before
            # starting the queue runners.
            uninitialized_vars = tf.report_uninitialized_variables()
            while len(sess.run(uninitialized_vars)) > 0:
                log.debug("Waiting for uninitialized variables")
                time.sleep(startup_wait_time)

            log.debug("All variables initialized. Persona dist executor starting {} ...".format(args.dist_command))
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            log.debug("Queue runners started. Waiting on coordinator to signal stop...")
            coord.wait_for_stop()

            timeout_time = 60 * 3
            try:
                coord.join(threads=threads,
                           stop_grace_period_secs=timeout_time)
            except RuntimeError:
                log.error("Unable to wait for coordinator to stop all threads after {} seconds".format(timeout_time))
            else:
                log.debug("All threads joined and dead")
def make_central_pipeline(self, args, input_gen, pass_around_gen):
    """Wire up the read -> assemble -> align (-> compress) pipeline.

    Each stage is separated from the next by a ``pipeline.join`` queue.
    Values that a stage does not consume are carried around it and
    re-attached to that stage's output.

    Side effect: ``self.write_columns`` gains ``'results'`` and one
    ``'secondaryN'`` entry per ``args.max_secondary``, and is then replaced
    by a list of ``{"type": "structured", "extension": name}`` dicts.

    :param args: parsed argument namespace. Reads ``max_secondary``,
        ``parallel``, ``assemblers``, ``aligners``, ``paired``,
        ``snap_args``, ``index_path``, ``aligner_threads``, ``subchunking``,
        ``compress_parallel`` and ``writers``.
    :param input_gen: iterable of per-item tensors; the first two elements
        of each joined item go to the AGD reader.
    :param pass_around_gen: iterable, zipped with ``input_gen``, of tensors
        that are carried alongside each item.
    :return: ``(aligned_results, (genome, ref_seqs, lens))``, where each
        aligned result is a flat tuple of the buffer handle followed by the
        carried values.
    """
    # Output columns: the primary results plus one column per secondary.
    self.write_columns.append('results')
    self.write_columns.extend(
        'secondary{}'.format(n) for n in range(args.max_secondary))
    self.write_columns = [
        {"type": "structured", "extension": column}
        for column in self.write_columns
    ]

    # Flatten every (input, pass-around) pair into a single tuple.
    joiner = tuple(
        tuple(itertools.chain(item, extras))
        for item, extras in zip(input_gen, pass_around_gen))

    ready_to_process = pipeline.join(
        upstream_tensors=joiner,
        parallel=args.parallel,
        capacity=args.parallel,  # multiplied by some factor?
        multi=True,
        name="ready_to_process")

    # need to unpack better here
    # First two elements feed the reader; the rest is carried around it.
    to_agd_reader, pass_around_agd_reader = zip(
        *[(item[:2], item[2:]) for item in ready_to_process])

    multi_column_gen = pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=to_agd_reader)

    # Re-attach the carried values to each reader output, flattened.
    collected_bufs = []
    for column_bufs, extras in zip(multi_column_gen, pass_around_agd_reader):
        if isinstance(extras, tf.Tensor):
            extras = (extras, )
        collected_bufs.append(tuple(itertools.chain(column_bufs, extras)))
    processed_bufs = tuple(collected_bufs)

    ready_to_assemble = pipeline.join(
        upstream_tensors=processed_bufs,
        parallel=args.assemblers,
        capacity=args.assemblers * 2,
        multi=True,
        name="ready_to_assemble")  # TODO these params are kinda arbitrary :/

    # ready_to_assemble: [output_buffers, num_records, first_ordinal, record_id, pass_around {flattened}) x N]
    # Element 1 is deliberately in both halves: the assembler takes the first
    # two elements and everything from element 1 onward is carried forward.
    to_assembler, pass_around_assembler = zip(
        *[(item[:2], item[1:]) for item in ready_to_assemble])

    # each item out of this is a handle to AGDReads
    agd_read_assembler_gen = tuple(
        pipeline.agd_read_assembler(upstream_tensors=to_assembler,
                                    include_meta=False))

    # assembled_records, ready_to_align: [(agd_reads_handle, (num_records, first_ordinal, record_id), (pass_around)) x N]
    assembled_records = tuple(
        (reads_handle, ) + tuple(extras)
        for reads_handle, extras in zip(agd_read_assembler_gen,
                                        pass_around_assembler))

    ready_to_align = pipeline.join(
        upstream_tensors=assembled_records,
        parallel=args.aligners,
        capacity=int(args.aligners * 1.5),
        multi=True,
        name="ready_to_align")  # TODO still have default capacity here :/

    # Pick the paired or single-ended flavour of the SNAP ops.
    snap_cmd_line = args.snap_args.split()
    if args.paired:
        aligner_type = persona_ops.snap_align_paired
        aligner_options = persona_ops.paired_aligner_options(
            cmd_line=snap_cmd_line, name="paired_aligner_options")
        executor_type = persona_ops.snap_paired_executor
    else:
        aligner_type = persona_ops.snap_align_single
        # -o output.sam will not actually do anything
        aligner_options = persona_ops.aligner_options(
            cmd_line=snap_cmd_line, name="aligner_options")
        executor_type = persona_ops.snap_single_executor

    # NOTE(review): the shape/dtype lists below are built but never read
    # again in this method; kept so behaviour is unchanged.
    first_assembled_result = ready_to_align[0][1:]
    sink_queue_shapes = [t.get_shape() for t in first_assembled_result]
    sink_queue_dtypes = [t.dtype for t in first_assembled_result]
    sink_queue_shapes.append((args.max_secondary + 1, 2))
    sink_queue_dtypes.append(tf.string)

    # carried around the aligners: [(num_records, first_ordinal, record_id, pass_around x N) x N]
    pass_around_aligners = tuple(item[1:] for item in ready_to_align)
    pass_to_aligners = tuple(item[0] for item in ready_to_align)

    buffer_list_pool = persona_ops.buffer_list_pool(
        **pipeline.pool_default_args)
    genome = persona_ops.genome_index(genome_location=args.index_path,
                                      name="genome_loader")

    # A single executor is shared by every aligner op built below.
    shared_executor = executor_type(num_threads=args.aligner_threads,
                                    work_queue_size=args.aligners + 1,
                                    options_handle=aligner_options,
                                    genome_handle=genome)
    aligner_outputs = []
    for read_handle, extras in zip(pass_to_aligners, pass_around_aligners):
        aligner_results = aligner_type(read=read_handle,
                                       buffer_list_pool=buffer_list_pool,
                                       subchunk_size=args.subchunking,
                                       executor_handle=shared_executor,
                                       max_secondary=args.max_secondary)
        aligner_outputs.append((aligner_results, ) + tuple(extras))
    # aligners: [(buffer_list_handle, num_records, first_ordinal, record_id, pass_around X N) x N], that is COMPLETELY FLAT
    aligners = tuple(aligner_outputs)

    if args.compress_parallel > 0:
        aligner_results_to_compress = pipeline.join(
            upstream_tensors=aligners,
            parallel=args.compress_parallel,
            multi=True,
            capacity=4,
            name="ready_to_compress")
        # Only the buffer handle is compressed; the rest is carried around.
        # These stay lazy generators, as in the original.
        to_compressors = (item[0] for item in aligner_results_to_compress)
        around_compressors = (item[1:] for item in aligner_results_to_compress)
        compressed_buffers = pipeline.aligner_compress_pipeline(
            upstream_tensors=to_compressors)
        aligners = tuple(
            (compressed, ) + tuple(extras)
            for compressed, extras in zip(compressed_buffers,
                                          around_compressors))

    aligned_results = pipeline.join(upstream_tensors=aligners,
                                    parallel=args.writers,
                                    multi=True,
                                    capacity=4,
                                    name="aligned_results")

    # NOTE(review): an older comment at this point read "Taking this out
    # because it currently breaks distributed runtime", but the reference
    # sequence lookup is still executed and returned -- confirm intent.
    ref_seqs, lens = persona_ops.snap_index_reference_sequences(
        genome_handle=genome)

    # returns [(buffer_list_handle, num_records, first_ordinal, record_id, pass_around X N) x N], that is COMPLETELY FLAT
    return aligned_results, (genome, ref_seqs, lens)