def make_merge_ops():
    for id_and_count, components in merge_batches:
        merge_kwargs = {}
        if self.order_by == location_value:
            merge_kwargs["results_indexes"] = components[0]
            components = components[1:]
        column_handless, record_ids = components[:2]
        # slice so we only get the first component
        pass_around = tuple(a[0] for a in components[2:])
        record_id = record_ids[0]

        control_deps = []
        if self.log_goodput:
            with tf.control_dependencies((id_and_count,)):  # put it after this in case the merge takes a while
                ts = gate.unix_timestamp()
                control_deps.append(
                    gate.log_events(item_names=("id", "time"),
                                    directory=self.log_directory,
                                    event_name="merge_head",
                                    components=(slice_id(id_and_count), ts)))

        with tf.control_dependencies(control_deps):
            yield merge(chunk_group_handles=column_handless,
                        other_components=(id_and_count, record_id) + tuple(pass_around),
                        **merge_kwargs)
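# Note: slice_id is not defined in this file. A minimal sketch of what it is
# assumed to do (hypothetical helper, not the project's actual implementation):
# strip the count column off an id_and_count pair so only the id gets logged.
def _slice_id_sketch(id_and_count):
    # id_and_count is assumed to carry (id, count) along its last axis
    return tf.unstack(id_and_count, axis=-1, name="slice_id_sketch")[0]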
def gen_written_paths():
    for chunk_file, extension, write_type in zip(chunk_files, extensions, write_types):
        full_key = tf.string_join([intermediate_name, extension], separator=".",
                                  name="full_key_join_{ext}_{t}".format(ext=extension, t=write_type))
        result = writer(path=full_key,
                        record_type=write_type,
                        resource_handle=chunk_file,
                        name="intermediate_ceph_writer_{ext}_{t}".format(ext=extension, t=write_type))
        out_path = result.output_path
        if self.log_goodput:
            timestamp = result.time
            duration = result.duration
            bytes = result.bytes
            log_op = gate.log_events(
                item_names=("timestamp", "duration", "bytes", "key", "id"),
                directory=self.log_directory,
                event_name="sort_ceph_write",
                name="sort_ceph_write_logger",
                components=(timestamp, duration, bytes, out_path, slice_id(idc)))
            with tf.control_dependencies((log_op,)):
                out_path = tf.identity(out_path)
        yield out_path
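# Hedged usage sketch (assumed wiring, not taken from this file): the path
# generator above is typically drained into a single stacked tensor so that all
# written chunk keys travel downstream as one component. gen_written_paths relies
# on chunk_files, extensions, write_types, intermediate_name, writer, and idc
# being bound in its enclosing scope, as in the original.
written_paths = tf.stack(tuple(gen_written_paths()), name="written_paths")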
def make_decomp_stage(self, ready_to_decomp):
    """
    :param ready_to_decomp: generator of (id_and_count, column0, column1, ..., [:rest of input])
    :return: a generator of [ agd_read_handle, num_records, first_ordinal, record_id, id_and_count, {rest of input} ]
    """
    ready_to_decomp = sanitize_generator(ready_to_decomp)
    num_columns = len(self.columns)

    # to_agd_reader = just the columns
    # pass_around_agd_reader = (id_and_count, rest, of, input, ...)
    to_agd_reader, pass_around_agd_reader = zip(*(
        (rtd[1:1 + num_columns], (rtd[0],) + tuple(rtd[1 + num_columns:]))
        for rtd in ready_to_decomp
    ))

    def gen_timestamps():
        for group in pass_around_agd_reader:
            with tf.control_dependencies((group[0],)):
                yield gate.unix_timestamp(name="align_head_timestamp")

    reader_kwargs = {}
    timestamps = []
    if self.log_goodput:
        timestamps.extend(gen_timestamps())
        assert len(timestamps) == len(ready_to_decomp)
        # control dependencies have to be an iterable
        reader_kwargs["control_ops"] = tuple((a,) for a in timestamps)

    # [output_buffer_handles], num_records, first_ordinal, record_id; in order, for each column group in upstream_tensorz
    multi_column_gen = tuple(pipeline.agd_reader_multi_column_pipeline(
        upstream_tensorz=to_agd_reader,
        verify=self.deep_verify,
        name="align_reader",
        **reader_kwargs))

    # around = num_records, first_ordinal, record_id for each group
    to_assembler, around_assembler = zip(*(
        (a[:2], a[1:]) for a in multi_column_gen
    ))

    assembler_kwargs = {}
    if self.log_goodput:
        log_event_ops = [
            # single-element tuple because that's how tf.control_dependencies works
            (gate.log_events(
                item_names=("id", "time", "ordinal", "record_id"),
                components=(in_id, timestamp, ordinal, record_id),
                event_name="align_head",
                directory=self.log_directory,
                name="align_head_event_logger"
            ),)
            for in_id, timestamp, ordinal, record_id in zip(
                (slice_id(a[0]) for a in pass_around_agd_reader),
                timestamps,
                (b[2] for b in multi_column_gen),
                (b[3] for b in multi_column_gen)
            )
        ]
        assembler_kwargs["control_deps"] = log_event_ops

    # each element is an agd_reads handle
    agd_assembled_reads = pipeline.agd_read_assembler(upstream_tensors=to_assembler,
                                                      include_meta=False,
                                                      **assembler_kwargs)
    for agd_read, around_assembler_group, around_reader_group in zip(
            agd_assembled_reads, around_assembler, pass_around_agd_reader):
        yield (agd_read,) + tuple(around_assembler_group) + tuple(around_reader_group)
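# sanitize_generator is not shown in this file. A minimal sketch of the assumed
# behavior (hypothetical, for illustration only): materialize a one-shot generator
# into a tuple so it can be indexed and zipped over more than once, as the stage
# code above requires.
def _sanitize_generator_sketch(maybe_gen):
    if isinstance(maybe_gen, (list, tuple)):
        return maybe_gen
    return tuple(maybe_gen)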
def gen_timestamps():
    for group in pass_around_agd_reader:
        idc = group[0]
        with tf.control_dependencies((idc,)):
            ts = gate.unix_timestamp(name="sort_head_timestamp")
            event_log_op = gate.log_events(item_names=("id", "time"),
                                           components=(slice_id(idc), ts),
                                           event_name="sort_head",
                                           directory=self.log_directory,
                                           name="sort_head_event_logger")
        yield event_log_op
def gen_control_deps():
    for item in ready_to_write_items:
        num_records, ordinal, record_id = item[1:4]
        item_id = slice_id(item[4])
        with tf.control_dependencies((item_id,)):
            ts = gate.unix_timestamp(name="align_tail_timestamp")
            yield (gate.log_events(
                item_names=("id", "time", "ordinal", "record_id", "num_records"),
                directory=self.log_directory,
                event_name="align_tail",
                name="align_tail_event_logger",
                components=(item_id, ts, ordinal, record_id, num_records)
            ),)
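# Hedged usage sketch (assumed wiring): each yielded value is a one-element tuple
# because tf.control_dependencies expects an iterable of ops. A downstream consumer
# would typically be built under that dependency so the align_tail event is recorded
# alongside the write; gating the first component through tf.identity stands in for
# the real write op here.
for item, deps in zip(ready_to_write_items, gen_control_deps()):
    with tf.control_dependencies(deps):
        gated_item = tf.identity(item[0], name="align_tail_gated")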
def make_compress_stage(self, to_compress):
    """
    :param to_compress: a generator of (chunk_handles, num_records, first_ordinal, total_chunks, id_and_count, record_id, {pass around})
    :return: a generator of (id_and_count, chunk_file_matrix, first_ordinal, num_records, record_id, {pass around})
    """
    def compress_pipeline(handles):
        with tf.name_scope("merge_compress_results"):
            buffer_pool = persona_ops.buffer_pool(bound=False, size=10)
            compressors = tuple(
                partial(persona_ops.buffer_pair_compressor,
                        buffer_pool=buffer_pool,
                        pack=False,
                        name="buffer_pair_compressor_{}".format(cname))
                for cname in self.columns)

            for buffer_pairs in handles:
                bps_unstacked = tf.unstack(buffer_pairs)
                compressed_buffers = tuple(
                    compressor(buffer_pair=a)
                    for compressor, a in zip(compressors, bps_unstacked))

                def gen_buffers(bufs):
                    for cb in bufs:
                        compressed_buffer = cb.compressed_buffer
                        if self.log_goodput:
                            timestamp = cb.time
                            duration = cb.duration
                            original_size = cb.original_size
                            compressed_size = cb.compressed_size
                            log_op = gate.log_events(
                                item_names=("timestamp", "duration", "original_bytes", "compressed_bytes"),
                                directory=self.log_directory,
                                event_name="merge_compression",
                                name="merge_compression_logger",
                                components=(timestamp, duration, original_size, compressed_size))
                            with tf.control_dependencies((log_op,)):
                                compressed_buffer = tf.identity(compressed_buffer)
                        yield compressed_buffer

                yield tf.stack(tuple(gen_buffers(bufs=compressed_buffers)))

    to_compress = sanitize_generator(to_compress)
    for chunk_file_matrix, (num_records, first_ordinal, total_num_chunks, id_and_count, record_id), pass_around in \
            zip(
                compress_pipeline(handles=(a[0] for a in to_compress)),
                (a[1:6] for a in to_compress),
                (a[6:] for a in to_compress)
            ):
        ids_only = tf.unstack(id_and_count, axis=1, name="id_only_extractor")[0]
        new_count = tf.fill(ids_only.shape, total_num_chunks, name="new_counts_fill")
        new_id_and_count = tf.stack((ids_only, new_count), axis=1, name="new_id_and_count_constructor")

        control_deps = []
        if self.log_goodput:
            with tf.control_dependencies((new_id_and_count,)):
                ts = gate.unix_timestamp()
                control_deps.append(
                    gate.log_events(item_names=("id", "time", "record_id", "num_records"),
                                    event_name="merge_tail",
                                    directory=self.log_directory,
                                    components=(ids_only, ts, record_id, num_records)))

        yield (new_id_and_count, chunk_file_matrix, first_ordinal, num_records, record_id) + tuple(pass_around), control_deps
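# Hedged usage sketch (assumed wiring, variable names hypothetical): make_compress_stage
# yields (outputs, deps) pairs; the deps carry the merge_tail logging op and are usually
# attached to whatever op consumes the outputs next, so the event fires with the consumer.
for outputs, deps in self.make_compress_stage(to_compress=to_compress):
    with tf.control_dependencies(deps):
        gated_id_and_count = tf.identity(outputs[0], name="merge_tail_gated_id")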
def make_sort_stage(self, ready_to_sort):
    """
    :param ready_to_sort:
    :return: (id_and_count, record_id, intermediate_name, superchunk_num_records, superchunk_matrix) + rest_of_input, log_event
    """
    bpp = persona_ops.buffer_pair_pool(size=0, bound=False, name="local_read_sort_buffer_list_pool")
    self.log.info("order by is '{ob}'".format(ob=self.order_by))
    if self.order_by == location_value:
        self.log.info("sorting by location")
        sort_op = partial(persona_ops.agd_sort, buffer_pair_pool=bpp, name="agd_sort_results")
    else:
        self.log.info("sorting by metadata")
        sort_op = partial(persona_ops.agd_sort_metadata, buffer_pair_pool=bpp, name="agd_sort_metadata")

    for id_and_count, components in ready_to_sort:
        output_buffer_handless, num_recordss, first_ordinals, record_ids = components[:4]
        rest_of_inputs = components[4:]
        # need to just pick the top things
        rest_of_input = tuple(a[0] for a in rest_of_inputs)
        record_id = record_ids[0]
        first_ordinal = first_ordinals[0]
        first_ordinal_str = tf.as_string(first_ordinal, name="first_ordinal_conversion")

        # this filename is guaranteed to be unique because of the ordinal (unique among this dataset)
        # and the extension (so it doesn't conflict with existing chunk files)
        # otherwise when a request is resubmitted, the cleanup from the merge stage may overlap with the new files created!
        random_gen = tf.as_string(
            tf.random_uniform(dtype=tf.int32, maxval=2**20, shape=(), name="random_intermediate_name_gen"),
            name="random_intermediate_value_to_string")
        intermediate_name = tf.string_join(
            (record_id, first_ordinal_str, random_gen, intermediate_extension),
            separator="_", name="intermediate_filename")

        # TODO not sure if this axis=1 is correct
        unstack_handles = tf.unstack(output_buffer_handless, axis=1, name="buffers_unstack")
        key_handles = unstack_handles[0]  # output_buffer_handless[:,0,:]
        other_handles = tf.stack(unstack_handles[1:], axis=1)  # output_buffer_handless[:,1:,:]

        # first column is always the correct one, due to self.extended_columns order
        superchunk_matrix, superchunk_num_records = sort_op(
            num_records=num_recordss,
            sort_key_handles=key_handles,
            column_handles=other_handles)

        if self.log_goodput:
            with tf.control_dependencies((superchunk_num_records,)):
                ts = gate.unix_timestamp(name="sort_tail_timestamp")
                log_event = (gate.log_events(
                    item_names=("id", "time", "record_id", "num_records"),
                    directory=self.log_directory,
                    event_name="sort_tail",
                    name="sort_tail_event_logger",
                    components=(slice_id(id_and_count), ts, record_id, superchunk_num_records)),)
        else:
            log_event = ()

        yield (id_and_count, record_id, intermediate_name, superchunk_num_records, superchunk_matrix) + rest_of_input, log_event
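# For reference, the intermediate key built above joins four parts with "_"
# (illustrative layout; the concrete intermediate_extension value is defined elsewhere):
#   "<record_id>_<first_ordinal>_<random 0..2**20>_<intermediate_extension>"
# The random component keeps a resubmitted request from colliding with files that a
# still-running merge stage has yet to clean up.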