def parse_file(self, directory):
    """Walk *directory*, parse every summary file and build an ObservationLog.

    Directories are skipped; a file that fails to parse is logged and
    ignored so one bad file does not abort the whole collection. If the
    objective metric (self.metrics[0]) never shows up in the parsed logs,
    a single "unavailable" placeholder entry is reported instead.
    """
    collected = []
    for path in self.parser.find_all_files(directory):
        if os.path.isdir(path):
            continue
        try:
            self.logger.info(path + " will be parsed.")
            collected.extend(self.parser.parse_summary(path, self.metrics))
        except Exception as err:
            # Best-effort: skip unparsable files, keep going.
            self.logger.warning("Unexpected error: " + str(err))

    # Metrics logs must contain at least one objective metric value.
    # The objective metric is located at index 0 of self.metrics.
    objective = self.metrics[0]
    has_objective = any(entry.metric.name == objective for entry in collected)

    # If the objective metric was not reported, insert the unavailable
    # value in the DB.
    if not has_objective:
        collected = [
            api_pb2.MetricLog(
                time_stamp=rfc3339.rfc3339(datetime.now()),
                metric=api_pb2.Metric(
                    name=objective,
                    value=const.UNAVAILABLE_METRIC_VALUE))
        ]
        self.logger.info(
            "Objective metric {} is not found in training logs, {} value is reported"
            .format(objective, const.UNAVAILABLE_METRIC_VALUE))

    return api_pb2.ObservationLog(metric_logs=collected)
def parse_summary(self, tfefile):
    """Extract metric values from one TensorFlow event file.

    Tags are matched against self.metric_names. A metric name of the
    form "parent_dir/tag" additionally requires the event file's
    directory to end with that parent dir; a plain name is matched
    against the event file's own directory.
    """
    logs = []
    accumulator = EventAccumulator(tfefile, size_guidance={'tensors': 0})
    accumulator.Reload()
    event_dir = os.path.dirname(tfefile)
    for tag in accumulator.Tags()['tensors']:
        for name in self.metric_names:
            # A "dir/tag" metric name pins the expected parent directory.
            if len(name.split("/")) >= 2:
                expected_dir = os.path.dirname(name)
            else:
                expected_dir = os.path.dirname(tfefile)
            tag_matches = tag.startswith(name.split("/")[-1])
            dir_matches = event_dir.endswith(expected_dir)
            if not (tag_matches and dir_matches):
                continue
            for wall_time, step, tensor in accumulator.Tensors(tag):
                logs.append(api_pb2.MetricLog(
                    time_stamp=rfc3339.rfc3339(
                        datetime.fromtimestamp(wall_time)),
                    metric=api_pb2.Metric(
                        name=name,
                        value=str(tf.make_ndarray(tensor)))))
    return logs
def parse_summary(self, tfefile, metrics):
    """Extract metric values from a TF event file via summary_iterator.

    When both the file path and a metric name contain a directory
    component, the summary tag is qualified with the file's parent
    directory before the prefix match against the metric name.
    """
    logs = []
    path_parts = tfefile.split("/")
    for event in tf.train.summary_iterator(tfefile):
        for value in event.summary.value:
            for metric_name in metrics:
                tag = str(value.tag)
                if len(path_parts) >= 2 and len(metric_name.split("/")) >= 2:
                    # Qualify the tag with the file's parent directory.
                    tag = str(path_parts[-2] + "/" + value.tag)
                if not tag.startswith(metric_name):
                    continue
                logs.append(api_pb2.MetricLog(
                    time_stamp=rfc3339.rfc3339(
                        datetime.fromtimestamp(event.wall_time)),
                    metric=api_pb2.Metric(
                        name=metric_name,
                        value=str(value.simple_value))))
    return logs
def register_trial(stub):
    """Register a pre-baked test trial (TEST_TRIAL) with the manager via *stub*.

    Builds a TrialSpec with an objective spec, one parameter assignment
    ("rl" = "0.01"), and a completed status carrying an observed "loss"
    metric, then calls stub.RegisterTrial with a 10-second timeout.
    On failure the error is logged with a traceback and re-raised.
    """
    try:
        obj = api_pb2.ObjectiveSpec(type=1, goal=0.09,
                                    objective_metric_name="loss")
        parameters = api_pb2.TrialSpec.ParameterAssignments(
            assignments=[api_pb2.ParameterAssignment(name="rl", value="0.01")])
        spec = api_pb2.TrialSpec(experiment_name=TEST_EXPERIMENT,
                                 objective=obj,
                                 run_spec="a batch/job resource",
                                 metrics_collector_spec="metrics/collector",
                                 parameter_assignments=parameters)
        observation = api_pb2.Observation(
            metrics=[api_pb2.Metric(name="loss", value="0.54")])
        status = api_pb2.TrialStatus(condition=2,
                                     observation=observation,
                                     start_time="2019-04-28T17:09:15Z",
                                     completion_time="2019-04-28T18:09:15Z")
        t = api_pb2.Trial(name=TEST_TRIAL, status=status, spec=spec)
        stub.RegisterTrial(api_pb2.RegisterTrialRequest(trial=t), 10)
        # Lazy %-style args instead of eager string formatting.
        logger.info("Register trial %s successfully", TEST_TRIAL)
    except Exception:
        # Was a bare "except:", which would also trap KeyboardInterrupt
        # and SystemExit; narrow to Exception, log with traceback, re-raise.
        logger.error("Failed to register trial %s", TEST_TRIAL, exc_info=True)
        raise