Example #1
 def make_merge_ops():
     for id_and_count, components in merge_batches:
         merge_kwargs = {}
         if self.order_by == location_value:
             merge_kwargs["results_indexes"] = components[0]
             components = components[1:]
         column_handless, record_ids = components[:2]
         pass_around = tuple(
             a[0] for a in
             components[2:])  # slice so we only get the first component
         record_id = record_ids[0]
         control_deps = []
         if self.log_goodput:
             with tf.control_dependencies(
                 (id_and_count, )
             ):  # take the timestamp only after id_and_count, in case the merge takes a while
                 ts = gate.unix_timestamp()
             control_deps.append(
                 gate.log_events(item_names=("id", "time"),
                                 directory=self.log_directory,
                                 event_name="merge_head",
                                 components=(slice_id(id_and_count),
                                             ts)))
         with tf.control_dependencies(control_deps):
             yield merge(chunk_group_handles=column_handless,
                         other_components=(id_and_count, record_id) +
                         tuple(pass_around),
                         **merge_kwargs)
Example #2
 def gen_timestamps():
     for group in pass_around_agd_reader:
         idc = group[0]
         with tf.control_dependencies((idc, )):
             ts = gate.unix_timestamp(name="sort_head_timestamp")
         event_log_op = gate.log_events(item_names=("id", "time"),
                                        components=(slice_id(idc), ts),
                                        event_name="sort_head",
                                        directory=self.log_directory,
                                        name="sort_head_event_logger")
         yield event_log_op
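
Examples #1 and #2 both use the same goodput-logging idiom: take a unix timestamp under a control dependency on the item being tracked, then record it together with the item's id via gate.log_events. A minimal sketch of that idiom, assuming the gate module and slice_id helper used throughout this page (their import path is not shown above), looks like this:

    import tensorflow as tf

    def log_head_event(gate, slice_id, id_and_count, log_directory, event_name):
        """Sketch of the goodput-logging idiom from the examples above.

        `gate` and `slice_id` are the project helpers used on this page; they
        are taken as parameters here because their import path is not shown.
        """
        # Take the timestamp only once id_and_count is available, so the logged
        # time reflects when this item actually reached this point in the graph.
        with tf.control_dependencies((id_and_count,)):
            ts = gate.unix_timestamp(name="{}_timestamp".format(event_name))

        # Record (id, time) for the event, as the head/tail loggers above do.
        return gate.log_events(item_names=("id", "time"),
                               directory=log_directory,
                               event_name=event_name,
                               name="{}_event_logger".format(event_name),
                               components=(slice_id(id_and_count), ts))

Example #1 additionally collects the returned log op in control_deps and gates the merge op on it, so the event is recorded before the longer-running merge starts.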
Example #3
 def gen_control_deps():
     for item in ready_to_write_items:
         num_records, ordinal, record_id = item[1:4]
         item_id = slice_id(item[4])
         with tf.control_dependencies((item_id,)):
             ts = gate.unix_timestamp(name="align_tail_timestamp")
         yield (gate.log_events(
             item_names=("id", "time", "ordinal", "record_id", "num_records"),
             directory=self.log_directory,
             event_name="align_tail",
             name="align_tail_event_logger",
             components=(item_id, ts, ordinal, record_id, num_records)
         ),)
Example #4
 def gen_timestamps():
     for group in pass_around_agd_reader:
         with tf.control_dependencies((group[0],)):
             yield gate.unix_timestamp(name="align_head_timestamp")
Example #5
    def make_compress_stage(self, to_compress):
        """
        :param to_compress: a generator of (chunk_handles, num_records, first_ordinal, total_chunks, id_and_count, record_id, {pass around})
        :return: a generator of (id_and_count, chunk_file_matrix, first_ordinal, num_records, record_id, {pass around})
        """
        def compress_pipeline(handles):
            with tf.name_scope("merge_compress_results"):
                buffer_pool = persona_ops.buffer_pool(bound=False, size=10)

                compressors = tuple(
                    partial(persona_ops.buffer_pair_compressor,
                            buffer_pool=buffer_pool,
                            pack=False,
                            name="buffer_pair_compressor_{}".format(cname))
                    for cname in self.columns)
                for buffer_pairs in handles:
                    bps_unstacked = tf.unstack(buffer_pairs)
                    compressed_buffers = tuple(
                        compressor(buffer_pair=a)
                        for compressor, a in zip(compressors, bps_unstacked))

                    def gen_buffers(bufs):
                        for cb in bufs:
                            compressed_buffer = cb.compressed_buffer
                            if self.log_goodput:
                                timestamp = cb.time
                                duration = cb.duration
                                original_size = cb.original_size
                                compressed_size = cb.compressed_size
                                log_op = gate.log_events(
                                    item_names=("timestamp", "duration",
                                                "original_bytes",
                                                "compressed_bytes"),
                                    directory=self.log_directory,
                                    event_name="merge_compression",
                                    name="merge_compression_logger",
                                    components=(timestamp, duration,
                                                original_size,
                                                compressed_size))
                                with tf.control_dependencies((log_op, )):
                                    compressed_buffer = tf.identity(
                                        compressed_buffer)
                            yield compressed_buffer

                    yield tf.stack(tuple(gen_buffers(bufs=compressed_buffers)))

        to_compress = sanitize_generator(to_compress)
        for chunk_file_matrix, (num_records, first_ordinal, total_num_chunks, id_and_count, record_id), pass_around in \
            zip(
                compress_pipeline(handles=(a[0] for a in to_compress)),
                (a[1:6] for a in to_compress),
                (a[6:] for a in to_compress)
            ):
            ids_only = tf.unstack(id_and_count,
                                  axis=1,
                                  name="id_only_extractor")[0]
            new_count = tf.fill(ids_only.shape,
                                total_num_chunks,
                                name="new_counts_fill")
            new_id_and_count = tf.stack((ids_only, new_count),
                                        axis=1,
                                        name="new_id_and_count_constructor")
            control_deps = []
            if self.log_goodput:
                with tf.control_dependencies((new_id_and_count, )):
                    ts = gate.unix_timestamp()
                control_deps.append(
                    gate.log_events(item_names=("id", "time", "record_id",
                                                "num_records"),
                                    event_name="merge_tail",
                                    directory=self.log_directory,
                                    components=(ids_only, ts, record_id,
                                                num_records)))

            yield (new_id_and_count, chunk_file_matrix, first_ordinal,
                   num_records, record_id) + tuple(pass_around), control_deps
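
The new_id_and_count construction in the loop above uses only stock TensorFlow ops, so it can be tried in isolation: split off the id column of the id_and_count matrix, pair each id with the new total chunk count, and stack the two columns back together. The concrete shapes and values below are assumptions made for the sketch; they only need to match the axis=1 unstack used above.

    import tensorflow as tf

    # Assumed layout: one (id, count) row per chunk, matching the axis=1 unstack above.
    id_and_count = tf.constant([[3, 5], [4, 5], [9, 5]], name="id_and_count")
    total_num_chunks = tf.constant(3, name="total_num_chunks")  # assumed new count

    ids_only = tf.unstack(id_and_count, axis=1, name="id_only_extractor")[0]
    new_count = tf.fill(ids_only.shape, total_num_chunks, name="new_counts_fill")
    new_id_and_count = tf.stack((ids_only, new_count), axis=1,
                                name="new_id_and_count_constructor")

    with tf.Session() as sess:
        print(sess.run(new_id_and_count))  # [[3 3] [4 3] [9 3]]

The old count column is simply discarded and replaced with total_num_chunks for every id.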
Example #6
    def make_sort_stage(self, ready_to_sort):
        """
        :param ready_to_sort: a generator of (id_and_count, components), where components = (output_buffer_handles, num_records, first_ordinals, record_ids) + {rest of input}
        :return: (id_and_count, record_id, intermediate_name, superchunk_num_records, superchunk_matrix) + rest_of_input, log_event
        """
        bpp = persona_ops.buffer_pair_pool(
            size=0, bound=False, name="local_read_sort_buffer_list_pool")

        self.log.info("order by is '{ob}'".format(ob=self.order_by))
        if self.order_by == location_value:
            self.log.info("sorting by location")
            sort_op = partial(persona_ops.agd_sort,
                              buffer_pair_pool=bpp,
                              name="agd_sort_results")
        else:
            # the metadata sort path below is unreachable; it is kept for
            # reference until ordering by metadata is supported
            raise Exception("not supported")
            sort_op = partial(persona_ops.agd_sort_metadata,
                              buffer_pair_pool=bpp,
                              name="agd_sort_metadata")

        for id_and_count, components in ready_to_sort:
            (output_buffer_handless, num_recordss, first_ordinals,
             record_ids) = components[:4]
            rest_of_inputs = components[4:]

            # need to just pick the top things
            rest_of_input = tuple(a[0] for a in rest_of_inputs)
            record_id = record_ids[0]
            first_ordinal = first_ordinals[0]

            first_ordinal_str = tf.as_string(first_ordinal,
                                             name="first_ordinal_conversion")

            # this filename is guaranteed to be unique because of the ordinal (unique among this dataset) and the extension (so it doesn't conflict with existing chunk files)
            # otherwise when a request is resubmitted, the cleanup from the merge stage may overlap with the new files created!
            random_gen = tf.as_string(
                tf.random_uniform(dtype=tf.int32,
                                  maxval=2**20,
                                  shape=(),
                                  name="random_intermediate_name_gen"),
                name="random_intermediate_value_to_string")
            intermediate_name = tf.string_join(
                (record_id, first_ordinal_str, random_gen,
                 intermediate_extension),
                separator="_",
                name="intermediate_filename")

            # TODO not sure if this axis=1 is correct
            unstack_handles = tf.unstack(output_buffer_handless,
                                         axis=1,
                                         name="buffers_unstack")
            key_handles = unstack_handles[0]  # output_buffer_handless[:,0,:]
            other_handles = tf.stack(unstack_handles[1:],
                                     axis=1)  # output_buffer_handless[:,1:,:]

            # first column is always the correct one, due to self.extended_columns order
            superchunk_matrix, superchunk_num_records = sort_op(
                num_records=num_recordss,
                sort_key_handles=key_handles,
                column_handles=other_handles)

            if self.log_goodput:
                with tf.control_dependencies((superchunk_num_records, )):
                    ts = gate.unix_timestamp(name="sort_tail_timestamp")
                log_event = (gate.log_events(
                    item_names=("id", "time", "record_id", "num_records"),
                    directory=self.log_directory,
                    event_name="sort_tail",
                    name="sort_tail_event_logger",
                    components=(slice_id(id_and_count), ts, record_id,
                                superchunk_num_records)), )
            else:
                log_event = ()

            yield (id_and_count, record_id, intermediate_name,
                   superchunk_num_records,
                   superchunk_matrix) + rest_of_input, log_event
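
The comment in Example #6 explains why the intermediate filename must be unique: when a request is resubmitted, cleanup from the merge stage of the old attempt could otherwise overlap with the files the new attempt writes. The name construction itself uses only standard TensorFlow string ops and can be sketched on its own; record_id, first_ordinal, and intermediate_extension are stand-in values here (the last is a module-level constant in the original code whose value is not shown).

    import tensorflow as tf

    record_id = tf.constant("my_dataset", name="record_id")                    # stand-in
    first_ordinal = tf.constant(123456, dtype=tf.int64, name="first_ordinal")  # stand-in
    intermediate_extension = "intermediate"  # assumed value of the module-level constant

    first_ordinal_str = tf.as_string(first_ordinal, name="first_ordinal_conversion")

    # Random suffix so two attempts at the same (record_id, ordinal) cannot collide.
    random_gen = tf.as_string(
        tf.random_uniform(dtype=tf.int32, maxval=2**20, shape=(),
                          name="random_intermediate_name_gen"),
        name="random_intermediate_value_to_string")

    # e.g. "my_dataset_123456_524287_intermediate"
    intermediate_name = tf.string_join(
        (record_id, first_ordinal_str, random_gen, intermediate_extension),
        separator="_",
        name="intermediate_filename")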