Example no. 1
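 # A SessionRunHook-style after_run callback: after each session.run, push
 # the step's 'loss' and 'auc' results to the metrics store.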
 def after_run(self, run_context, run_value):
     metrics.emit_store(name="loss",
                        value=run_value.results['loss'],
                        tags={})
     metrics.emit_store(name="auc",
                        value=run_value.results['auc'],
                        tags={})
Example no. 2
    def update_stats(self, item, kind='joined'):
        """
        Args:
            item: RawDataIter.Item. Item from iterating RawDataVisitor
            kind: str. 'joined', 'unjoined', 'negative'. Indicate where the item
                should be counted towards.

        Returns: None
        Update stats dict. Emit join status and other fields of each item to ES.
        """
        assert kind in ('joined', 'unjoined', 'negative')
        if kind == 'unjoined':
            self.sample_unjoined(item.example_id)
        item_stat = {
            'joined': int(kind == 'joined'),
            'original': int(kind != 'negative'),
            'negative': int(kind == 'negative')
        }
        tags = copy.deepcopy(self._tags)
        for field in self._stat_fields:
            value = self._convert_to_str(getattr(item, field, '#None#'))
            item_stat[field] = value
            self._stats[kind]['{}={}'.format(field, value)] += 1
        tags.update(item_stat)
        tags['example_id'] = self._convert_to_str(item.example_id)
        tags['event_time'] = self._convert_to_str(item.event_time)
        tags['event_time_iso'] = convert_to_iso_format(item.event_time)
        metrics.emit_store(name='datajoin', value=0, tags=tags)
Example no. 3
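 # Sample a fraction (self._sample_ratio) of items and emit their stat
 # fields, example_id and ISO event time as tags on an 'input_data' point.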
 def emit_metric(self, item):
     if random.random() < self._sample_ratio:
         tags = copy.deepcopy(self._tags)
         for field in self._stat_fields:
             value = self.convert_to_str(getattr(item, field, '#None#'))
             tags[field] = value
         tags['example_id'] = self.convert_to_str(item.example_id)
         tags['event_time'] = convert_to_iso_format(item.event_time)
         metrics.emit_store(name='input_data', value=0, tags=tags)
Example no. 4
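 # Report how long the latest example-id dump took, plus the current file
 # index and the index dumped so far.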
 def _emit_dumper_metrics(self, file_index, dumped_index):
     dump_duration = time.time() - self._latest_dump_timestamp
     metrics.emit_timer(name='example_id_dump_duration',
                        value=int(dump_duration),
                        tags=self._metrics_tags)
     metrics.emit_store(name='example_dump_file_index',
                        value=file_index,
                        tags=self._metrics_tags)
     metrics.emit_store(name='example_id_dumped_index',
                        value=dumped_index,
                        tags=self._metrics_tags)
Example no. 5
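    # Push per-metric gauges through the stats-client pipeline and, for
    # compatibility, mirror each metric to the (ES-backed) metrics store.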
    def _stats_metric(self, global_step, results):
        with self._stats_client.pipeline() as pipe:
            pipe.gauge("trainer.metric_global_step", global_step)
            for key in self._metric_names:
                value = results[key]
                pipe.gauge("trainer.metric_value",
                           value.sum(),
                           tags={"metric": key})

                # for compatibility, also write to metrics(es)
                metrics.emit_store(name=key, value=value, tags={})
Example no. 6
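 # Every _every_n_iter steps, emit all run results, tagging them with an
 # ISO-formatted event_time when one is present.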
 def after_run(self, run_context, run_value):
     self._iter += 1
     if self._iter % self._every_n_iter == 0:
         result = run_value.results
         tags = {}
         if 'event_time' in result:
             # results values arrive as bytes; decode once before parsing
             event_time = result.pop('event_time').decode()
             tags['event_time'] = fcc.convert_to_datetime(
                 event_time, True).isoformat(timespec='microseconds')
         for name, value in result.items():
             metrics.emit_store(name=name, value=value, tags=tags)
Example no. 7
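 # Every _every_n_iter steps, emit the configured tensors, tagged with
 # whichever of the configured tag names show up in the results.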
 def after_run(self, run_context, run_value):
     self._iter += 1
     if self._iter % self._every_n_iter == 0:
         result = run_value.results
         tags = {}
         for tag in self._tag_names:
             if tag in result:
                 tags[tag] = result[tag]
         for name in self._tensor_names:
             if name in result:
                 metrics.emit_store(name=name,
                                    value=result[name],
                                    tags=tags)
Example no. 8
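 # Sampled emission variant that also records the UTC processing time and
 # routes the data point to the 'raw_data' index.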
 def emit_metric(self, item):
     if random.random() < self._sample_ratio:
         tags = copy.deepcopy(self._tags)
         for field in self._stat_fields:
             value = convert_to_str(getattr(item, field, '#None#'))
             tags[field] = value
         tags['example_id'] = convert_to_str(item.example_id)
         tags['event_time'] = convert_to_datetime(item.event_time, True) \
             .isoformat(timespec='microseconds')
         tags['process_time'] = datetime.now(tz=pytz.utc) \
             .isoformat(timespec='microseconds')
         metrics.emit_store(name='input_data',
                            value=0,
                            tags=tags,
                            index_type='raw_data')
Example no. 9
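 # Generator that yields item batches while timing batch production and
 # recording the index of the last item produced.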
 def make_processor(self, next_index):
     input_finished = False
     with self._lock:
         if next_index is None:
             return
         if self._check_index_rollback(next_index):
             self._batch_queue = []
             self._flying_item_count = 0
         if len(self._batch_queue) > 0:
             end_batch = self._batch_queue[-1]
             next_index = end_batch.begin_index + len(end_batch)
         input_finished = self._input_finished
     assert next_index >= 0, "the next index should be >= 0"
     end_batch = None
     batch_finished = False
     iter_round = 0
     processed_index = None
     start_tm = time.time()
     for batch, batch_finished in self._make_inner_generator(next_index):
         if batch is not None:
             if len(batch) > 0:
                 latency_mn = '{}.produce.latency'.format(self.name())
                 metrics.emit_timer(name=latency_mn,
                                    value=time.time() - start_tm,
                                    tags=self._get_metrics_tags())
                 store_mn = '{}.produce.index'.format(self.name())
                 metrics.emit_store(name=store_mn,
                                     value=batch.begin_index + len(batch) - 1,
                                    tags=self._get_metrics_tags())
                 self._append_next_item_batch(batch)
                 yield batch
                 start_tm = time.time()
             self._update_last_index(batch.begin_index + len(batch) - 1)
             iter_round += 1
             processed_index = batch.begin_index + len(batch) - 1
             if iter_round % 16 == 0:
                 logging.info("%s process to index %d", self.name(),
                              processed_index)
     if processed_index is not None:
         logging.info("%s process to index %d when round finished",
                      self.name(), processed_index)
     if input_finished and batch_finished:
         self._set_process_finished()
Example no. 10
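 # When the peer's dumped index has advanced, forward it to the master and
 # record it in the metrics store.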
 def _update_peer_index(self, impl_ctx, peer_next_index, peer_dumped_index):
     assert isinstance(impl_ctx, TransmitLeader.ImplContext)
     _, dumped_index = impl_ctx.get_peer_index()
     impl_ctx.set_peer_index(peer_next_index, peer_dumped_index)
     if dumped_index < peer_dumped_index:
         req = dj_pb.RawDataRequest(
             data_source_meta=self._data_source.data_source_meta,
             rank_id=self._rank_id,
             partition_id=impl_ctx.partition_id,
             peer_dumped_index=dj_pb.PeerDumpedIndex(
                 peer_dumped_index=peer_dumped_index))
         rsp = self._master_client.ForwardPeerDumpedIndex(req)
         if rsp.code != 0:
             raise RuntimeError("{} failed to forward peer dumped index "\
                                "{}, reason: {}".format(self._repr_str,
                                                        peer_dumped_index,
                                                        rsp.error_message))
         metrics.emit_store(name='peer_dumped_index',
                            value=peer_dumped_index,
                            tags=self._get_metrics_tag(impl_ctx))
Example no. 11
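 # Stream generator: first replay everything in resend_list, then drain
 # the transmit queue, emitting each message's sequence number on the way.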
 def iterator():
     with lock:
         resend_msgs = list(resend_list)
     for item in resend_msgs:
         logging.warning("Streaming resend message seq_num=%d",
                         item.seq_num)
         metrics.emit_store(name="resend_msg_seq_num",
                            value=int(item.seq_num),
                            tags={})
         yield item
     while True:
         item = self._transmit_queue.get()
         with lock:
             resend_list.append(item)
         logging.debug("Streaming send message seq_num=%d",
                       item.seq_num)
         metrics.emit_store(name="send_msg_seq_num",
                            value=int(item.seq_num),
                            tags={})
         yield item
Example no. 12
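 # Minimal periodic hook: every _every_n_iter steps, emit every result
 # with no tags.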
 def after_run(self, run_context, run_value):
     self._iter += 1
     if self._iter % self._every_n_iter == 0:
         for name, value in run_value.results.items():
             metrics.emit_store(name=name, value=value, tags={})
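
For context, a minimal sketch of how a hook like this gets its data: before_run must request the tensors so that run_value.results is populated by the time after_run fires. The sketch assumes a TF1-style tf.train.SessionRunHook and a fedlearner-style metrics module exposing emit_store(name, value, tags); the class name, import path and tensor dict are illustrative:

import tensorflow.compat.v1 as tf
from fedlearner.common import metrics  # assumed import path

class EmitEveryNIterHook(tf.train.SessionRunHook):
    def __init__(self, tensors, every_n_iter=10):
        self._tensors = tensors          # dict: metric name -> tf.Tensor
        self._every_n_iter = every_n_iter
        self._iter = 0

    def before_run(self, run_context):
        # Ask the session to also evaluate our tensors this step; they
        # come back as run_value.results in after_run.
        return tf.train.SessionRunArgs(self._tensors)

    def after_run(self, run_context, run_value):
        self._iter += 1
        if self._iter % self._every_n_iter == 0:
            for name, value in run_value.results.items():
                metrics.emit_store(name=name, value=value, tags={})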
Example no. 13
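 # Emit data-block metadata and cumulative join statistics, including
 # leader/follower join rates derived from the joiner stats indices.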
 def _emit_logger(self, metrics_tags):
     meta = self._data_block_meta
     nmetric_tags = self._metrics_tags
     if metrics_tags is not None and len(metrics_tags) > 0:
         nmetric_tags = copy.deepcopy(self._metrics_tags)
         nmetric_tags.update(metrics_tags)
     metrics.emit_store(name='data_block_index',
                        value=meta.data_block_index,
                        tags=nmetric_tags)
     metrics.emit_store(name='stats_cum_join_num',
                        value=meta.joiner_stats_info.stats_cum_join_num,
                        tags=nmetric_tags)
     metrics.emit_store(name='actual_cum_join_num',
                        value=meta.joiner_stats_info.actual_cum_join_num,
                        tags=nmetric_tags)
     metrics.emit_store(name='leader_stats_index',
                        value=meta.joiner_stats_info.leader_stats_index,
                        tags=nmetric_tags)
     metrics.emit_store(name='follower_stats_index',
                        value=meta.joiner_stats_info.follower_stats_index,
                        tags=nmetric_tags)
     leader_join_rate = 0.0
     if meta.joiner_stats_info.leader_stats_index > 0:
         leader_join_rate = meta.joiner_stats_info.actual_cum_join_num / \
                 meta.joiner_stats_info.leader_stats_index
     follower_join_rate = 0.0
     if meta.joiner_stats_info.follower_stats_index > 0:
         follower_join_rate = meta.joiner_stats_info.actual_cum_join_num / \
             meta.joiner_stats_info.follower_stats_index
     metrics.emit_store(name='leader_join_rate_percent',
                        value=int(leader_join_rate * 100),
                        tags=nmetric_tags)
     metrics.emit_store(name='follower_join_rate_percent',
                        value=int(follower_join_rate * 100),
                        tags=nmetric_tags)
     logging.info("create new data block id: %s, data block index: %d," \
                  "stats:\n stats_cum_join_num: %d, actual_cum_join_num: "\
                  "%d, leader_stats_index: %d, follower_stats_index: %d, "\
                  "leader_join_rate: %f, follower_join_rate: %f",
                  meta.block_id, meta.data_block_index,
                  meta.joiner_stats_info.stats_cum_join_num,
                  meta.joiner_stats_info.actual_cum_join_num,
                  meta.joiner_stats_info.leader_stats_index,
                  meta.joiner_stats_info.follower_stats_index,
                  leader_join_rate, follower_join_rate)
Example no. 14
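    # Client daemon loop: streams queued messages over gRPC, replays
    # unacknowledged ones after failures, and rebuilds the channel on error.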
    def _client_daemon_fn(self):
        stop_event = threading.Event()
        generator = None
        channel = make_insecure_channel(self._remote_address,
                                        ChannelType.REMOTE,
                                        options=self._grpc_options,
                                        compression=self._compression)
        client = make_ready_client(channel, stop_event)

        lock = threading.Lock()
        resend_list = collections.deque()

        @metrics.timer(func_name="shutdown_fn", tags={})
        def shutdown_fn():
            with lock:
                while len(resend_list) > 0 or not self._transmit_queue.empty():
                    logging.debug(
                        "Waiting for resend queue's being cleaned. "
                        "Resend queue size: %d", len(resend_list))
                    lock.release()
                    time.sleep(1)
                    lock.acquire()

            stop_event.set()
            if generator is not None:
                generator.cancel()

        self._client_daemon_shutdown_fn = shutdown_fn

        while not stop_event.is_set():
            try:

                def iterator():
                    with lock:
                        resend_msgs = list(resend_list)
                    for item in resend_msgs:
                        logging.warning("Streaming resend message seq_num=%d",
                                        item.seq_num)
                        metrics.emit_store(name="resend_msg_seq_num",
                                           value=int(item.seq_num),
                                           tags={})
                        yield item
                    while True:
                        item = self._transmit_queue.get()
                        with lock:
                            resend_list.append(item)
                        logging.debug("Streaming send message seq_num=%d",
                                      item.seq_num)
                        metrics.emit_store(name="send_msg_seq_num",
                                           value=int(item.seq_num),
                                           tags={})
                        yield item

                time_start = time.time()
                generator = client.StreamTransmit(iterator())
                time_end = time.time()
                metrics.emit_timer(name="one_StreamTransmit_spend",
                                   value=int(time_end - time_start),
                                   tags={})
                for response in generator:
                    if response.status.code == common_pb.STATUS_SUCCESS:
                        logging.debug(
                            "Message with seq_num=%d is "
                            "confirmed", response.next_seq_num - 1)
                    elif response.status.code == \
                        common_pb.STATUS_MESSAGE_DUPLICATED:
                        logging.debug(
                            "Resent Message with seq_num=%d is "
                            "confirmed", response.next_seq_num - 1)
                    elif response.status.code == \
                        common_pb.STATUS_MESSAGE_MISSING:
                        raise RuntimeError("Message with seq_num=%d is "
                                           "missing!" %
                                           (response.next_seq_num - 1))
                    else:
                        raise RuntimeError("Trainsmit failed with %d" %
                                           response.status.code)
                    with lock:
                        while resend_list and \
                                resend_list[0].seq_num < response.next_seq_num:
                            resend_list.popleft()
                        min_seq_num_to_resend = resend_list[0].seq_num \
                            if resend_list else "NaN"
                        logging.debug(
                            "Resend queue size: %d, starting from seq_num=%s",
                            len(resend_list), min_seq_num_to_resend)
                metrics.emit_store(name="sum_of_resend",
                                   value=int(len(resend_list)),
                                   tags={})
            except Exception as e:  # pylint: disable=broad-except
                if not stop_event.is_set():
                    logging.warning("Bridge streaming broken: %s.", repr(e))
            finally:
                if generator is not None:
                    generator.cancel()
                channel.close()
                logging.warning(
                    "Restarting streaming: resend queue size: %d, "
                    "starting from seq_num=%s", len(resend_list),
                    resend_list[0].seq_num if resend_list else "NaN")
                channel = make_insecure_channel(self._remote_address,
                                                ChannelType.REMOTE,
                                                options=self._grpc_options,
                                                compression=self._compression)
                client = make_ready_client(channel, stop_event)
                self._check_remote_heartbeat(client)