def unnest_checkpoints(checkpoints):
    """Return flattened copies of ``checkpoints`` with selected keys unnested.

    Every checkpoint is deep-copied first, so the input objects are never
    modified. For each key of ``UNNEST_KEYS`` present in the copy, the value
    is removed, passed through ``flatten_dict`` and merged back into the top
    level of the copy. The whole copy is then passed through ``flatten_dict``.

    Args:
        checkpoints: Iterable of checkpoint dicts.

    Returns:
        list: One flattened dict per input checkpoint, in input order.
    """
    flattened = []
    for original in checkpoints:
        entry = copy.deepcopy(original)
        for nested_key in UNNEST_KEYS:
            if nested_key in entry:
                try:
                    # The key is popped before flattening, so it is already
                    # gone from the entry if flattening fails.
                    entry.update(flatten_dict(entry.pop(nested_key)))
                except Exception:
                    logger.debug("Failed to flatten dict.")
        flattened.append(flatten_dict(entry))
    return flattened
def _read_experiment(experiment_state, experiment_path):
    """Load the progress table and parameters of every trial of an experiment.

    Checkpoints without a ``logdir`` are skipped. A trial is only loaded when
    its ``progress.csv`` exists and is non-empty.

    Args:
        experiment_state (dict): Experiment state holding a ``"checkpoints"``
            list.
        experiment_path (str): Directory containing the trial directories.

    Returns:
        tuple: ``(progress, params)``; both are dicts keyed by experiment
            tag. ``progress`` holds DataFrames read from ``progress.csv`` and
            ``params`` the parsed content of ``params.json``.
    """
    flat_checkpoints = list(map(flatten_dict, experiment_state["checkpoints"]))

    progress = {}
    params = {}
    # TODO: no real use for exp_directories outside this function, why get it?
    exp_directories = {}

    for checkpoint in flat_checkpoints:
        logdir = checkpoint.get("logdir", None)
        if logdir is None:
            continue

        trial_dir = os.path.join(experiment_path, os.path.basename(logdir))
        tag = checkpoint["experiment_tag"]
        progress_file = os.path.join(trial_dir, "progress.csv")

        # check if file size is > 0 before proceeding
        if not (os.path.isfile(progress_file)
                and os.stat(progress_file).st_size):
            continue

        progress[tag] = pd.read_csv(progress_file)
        exp_directories[tag] = os.path.abspath(trial_dir)

        # Read in the configs for this experiment
        with open(os.path.join(trial_dir, "params.json")) as params_handle:
            params[tag] = json.load(params_handle)

    return progress, params
def _process_trial(self, trial):
    """Fetch one result for ``trial`` and act on the scheduling decision.

    The flattened result is reported to the scheduler and the search
    algorithm, stored on the trial (unless it is a duplicate marker), and the
    trial is checkpointed if needed. Depending on the decision the trial is
    then continued, paused or stopped. Any exception raised in the process is
    logged and passed, as a formatted traceback, to
    ``self._process_trial_failure``.

    Args:
        trial: Trial whose result should be fetched and processed.
    """
    try:
        result = self.trial_executor.fetch_result(trial)

        # TrialScheduler and SearchAlgorithm still receive a
        # notification because there may be special handling for
        # the `on_trial_complete` hook.
        is_duplicate = RESULT_DUPLICATE in result
        if is_duplicate:
            logger.debug("Trial finished without logging 'done'.")
            result = trial.last_result
            result.update(done=True)

        self._total_time += result.get(TIME_THIS_ITER_S, 0)
        flat_result = flatten_dict(result)

        if not trial.should_stop(flat_result):
            with warn_if_slow("scheduler.on_trial_result"):
                decision = self._scheduler_alg.on_trial_result(
                    self, trial, flat_result)
            with warn_if_slow("search_alg.on_trial_result"):
                self._search_alg.on_trial_result(trial.trial_id, flat_result)
            if decision == TrialScheduler.STOP:
                with warn_if_slow("search_alg.on_trial_complete"):
                    self._search_alg.on_trial_complete(
                        trial.trial_id,
                        result=flat_result,
                        early_terminated=True)
        else:
            # Hook into scheduler
            self._scheduler_alg.on_trial_complete(self, trial, flat_result)
            self._search_alg.on_trial_complete(
                trial.trial_id, result=flat_result)
            decision = TrialScheduler.STOP

        if not is_duplicate:
            trial.update_last_result(
                result, terminate=(decision == TrialScheduler.STOP))

        # Checkpoints to disk. This should be checked even if
        # the scheduler decision is STOP or PAUSE. Note that
        # PAUSE only checkpoints to memory and does not update
        # the global checkpoint state.
        force_checkpoint = result.get(SHOULD_CHECKPOINT, False)
        self._checkpoint_trial_if_needed(trial, force=force_checkpoint)

        if decision == TrialScheduler.CONTINUE:
            self.trial_executor.continue_training(trial)
        elif decision == TrialScheduler.PAUSE:
            self.trial_executor.pause_trial(trial)
        elif decision == TrialScheduler.STOP:
            self.trial_executor.export_trial_if_needed(trial)
            self.trial_executor.stop_trial(trial)
        else:
            assert False, "Invalid scheduling decision: {}".format(decision)
    except Exception:
        logger.exception("Error processing event.")
        self._process_trial_failure(trial, traceback.format_exc())
def on_result(self, result):
    """Write the scalar entries of ``result`` as TensorFlow summaries.

    The file writer is created on first use. Bookkeeping keys are dropped,
    the remainder is flattened with "/" as delimiter, and every value whose
    exact type is listed in ``VALID_SUMMARY_TYPES`` is written under the
    ``ray/tune/`` prefix. The writer is flushed afterwards.

    Args:
        result (dict): Training result; must contain ``TRAINING_ITERATION``
            when ``TIMESTEPS_TOTAL`` is missing or falsy.
    """
    if self._file_writer is None:
        from tensorflow.python.eager import context
        self._context = context
        self._file_writer = tf.summary.create_file_writer(self.logdir)

    with tf.device("/CPU:0"), self._context.eager_mode():
        with tf.summary.record_if(True), self._file_writer.as_default():
            step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]

            loggable = result.copy()
            for dropped in ("config", "pid", "timestamp", TIME_TOTAL_S,
                            TRAINING_ITERATION):
                loggable.pop(dropped, None)  # not useful to log these

            prefix = ["ray", "tune"]
            flat_result = flatten_dict(loggable, delimiter="/")
            for attr, value in flat_result.items():
                if type(value) not in VALID_SUMMARY_TYPES:
                    continue
                tag = "/".join(prefix + [attr])
                tf.summary.scalar(tag, value, step=step)
    self._file_writer.flush()
def _generate_trials(self, experiment_spec, output_path=""):
    """Generate trials with configurations obtained from ``_suggest``.

    A fresh trial id is created for each sample and handed to ``_suggest``.
    While ``_suggest`` returns ``None`` for that id, ``None`` is yielded and
    the suggestion is requested again.

    Args:
        experiment_spec (dict): Experiment specification; must contain
            ``"run"``. ``"num_samples"`` defaults to 1.
        output_path (str): Forwarded to ``create_trial_from_spec``.

    Yields:
        Trial objects built from the spec merged with the suggested
        configuration, or ``None`` while no suggestion is available.

    Raises:
        TuneError: If ``"run"`` is missing from ``experiment_spec``.
    """
    if "run" not in experiment_spec:
        raise TuneError("Must specify `run` in {}".format(experiment_spec))

    num_samples = experiment_spec.get("num_samples", 1)
    for _ in range(num_samples):
        trial_id = Trial.generate_id()

        suggested_config = self._suggest(trial_id)
        while suggested_config is None:
            yield None
            suggested_config = self._suggest(trial_id)

        spec = copy.deepcopy(experiment_spec)
        spec["config"] = merge_dicts(spec["config"],
                                     copy.deepcopy(suggested_config))
        resolved = resolve_nested_dict(spec["config"])

        self._counter += 1
        experiment_tag = "{0}_{1}".format(
            str(self._counter), format_vars(resolved))

        yield create_trial_from_spec(
            spec,
            output_path,
            self._parser,
            evaluated_params=flatten_dict(suggested_config),
            experiment_tag=experiment_tag,
            trial_id=trial_id)
def _parse_configs(cfg_path):
    """Load a JSON config file and return its flattened content.

    Args:
        cfg_path (str): Path of the JSON file to read.

    Returns:
        dict: ``flatten_dict`` applied to the parsed JSON, or an empty dict
            when the file cannot be read or parsed (the failure is logged).
    """
    # Bound up front so the failure path returns a value instead of raising
    # UnboundLocalError at the ``return`` below.
    cfg_dict = {}
    try:
        with open(cfg_path) as f:
            cfg_dict = flatten_dict(json.load(f))
    except Exception:
        logger.exception("Config parsing failed.")
    return cfg_dict
def update_last_result(self, result, terminate=False):
    """Record ``result`` as the trial's latest result and update statistics.

    ``result`` is annotated in place with the trial id, the ``done`` flag and
    (when set) the experiment tag. It is printed when ``self.verbose`` is set
    and either ``terminate`` is true or more than ``DEBUG_PRINT_INTERVAL``
    has elapsed since the last print. The result is then forwarded to the
    result logger, and max/min/last values are tracked for every numeric
    entry of the flattened result.

    Args:
        result (dict): Result to record; modified in place.
        terminate (bool): Value stored under ``done``; also forces printing
            in verbose mode.
    """
    result.update(trial_id=self.trial_id, done=terminate)
    if self.experiment_tag:
        result.update(experiment_tag=self.experiment_tag)

    if self.verbose and (
            terminate
            or time.time() - self.last_debug > DEBUG_PRINT_INTERVAL):
        print("Result for {}:".format(self))
        print(" {}".format(pretty_print(result).replace("\n", "\n ")))
        self.last_debug = time.time()

    self.set_location(Location(result.get("node_ip"), result.get("pid")))
    self.last_result = result
    self.last_update_time = time.time()
    self.result_logger.on_result(self.last_result)

    for metric, value in flatten_dict(result).items():
        if not isinstance(value, Number):
            continue
        if metric not in self.metric_analysis:
            # First observation: it is max, min and last at the same time.
            self.metric_analysis[metric] = {
                "max": value,
                "min": value,
                "last": value
            }
            continue
        stats = self.metric_analysis[metric]
        stats["max"] = max(value, stats["max"])
        stats["min"] = min(value, stats["min"])
        stats["last"] = value
def to_tf_values(result, path):
    """Convert the scalar entries of ``result`` into ``tf.Summary.Value``s.

    Args:
        result (dict): Result to convert; flattened with "/" as delimiter.
        path (list): Tag components prepended to each flattened key.

    Returns:
        list: One ``tf.Summary.Value`` per entry whose exact type is listed
            in ``VALID_SUMMARY_TYPES``.
    """
    values = []
    for attr, value in flatten_dict(result, delimiter="/").items():
        if type(value) not in VALID_SUMMARY_TYPES:
            continue
        tag = "/".join(path + [attr])
        values.append(tf.Summary.Value(tag=tag, simple_value=value))
    return values
def _parse_results(res_path):
    """Parse the last line of a result file as JSON and flatten it.

    Args:
        res_path (str): Path of the file; its last line is parsed as JSON.

    Returns:
        dict: The flattened last record, or an empty dict when reading or
            parsing fails. An empty file counts as a failure because no last
            line is ever bound. Failures are logged.
    """
    parsed = {}
    try:
        with open(res_path) as result_file:
            # Get last line in file
            for last_line in result_file:
                pass
        parsed = flatten_dict(json.loads(last_line.strip()))
    except Exception:
        logger.exception("Importing %s failed...Perhaps empty?" % res_path)
    return parsed
def to_tf_values(result, path):
    """Convert the scalar entries of ``result`` into ``tf.Summary.Value``s.

    The accepted value types depend on the ``use_tf150_api`` flag: with it,
    numpy float and int32 scalars are accepted in addition to ``int`` and
    ``float``.

    Args:
        result (dict): Result to convert; flattened with "/" as delimiter.
        path (list): Tag components prepended to each flattened key.

    Returns:
        list: One ``tf.Summary.Value`` per entry of an accepted exact type.
    """
    type_list = [int, float]
    if use_tf150_api:
        type_list = [int, float, np.float32, np.float64, np.int32]

    values = []
    for attr, value in flatten_dict(result, delimiter="/").items():
        if type(value) in type_list:
            tag = "/".join(path + [attr])
            values.append(tf.Summary.Value(tag=tag, simple_value=value))
    return values
def on_result(self, result):
    """Append ``result`` as one row to the CSV file.

    The ``"config"`` entry is dropped and the rest is flattened with "/" as
    delimiter. The CSV writer is created from the keys of the first result;
    the header is written then unless ``self._continuing`` is set. Keys that
    are not among the writer's field names are left out of the row.

    Args:
        result (dict): Training result to log.
    """
    without_config = result.copy()
    without_config.pop("config", None)
    flat = flatten_dict(without_config, delimiter="/")

    if self._csv_out is None:
        self._csv_out = csv.DictWriter(self._file, flat.keys())
        if not self._continuing:
            self._csv_out.writeheader()

    row = {}
    for key, value in flat.items():
        if key in self._csv_out.fieldnames:
            row[key] = value
    self._csv_out.writerow(row)
    self._file.flush()
def _get_trial_info(trial, parameters, metrics):
    """Build one table row describing a trial.

    The row layout is: name | status | loc | params... | metrics...

    Args:
        trial (Trial): Trial to get information for.
        parameters (List[str]): Names of trial parameters to include; looked
            up in the flattened last result under ``CONFIG_PREFIX + name``.
        metrics (List[str]): Names of metrics to include.

    Returns:
        list: Row values; parameters or metrics missing from the last result
            appear as ``None``.
    """
    last = flatten_dict(trial.last_result)
    row = [str(trial), trial.status, str(trial.location)]
    row.extend(last.get(CONFIG_PREFIX + name) for name in parameters)
    row.extend(last.get(name) for name in metrics)
    return row
def list_trials(experiment_path,
                sort=None,
                info_keys=DEFAULT_EXPERIMENT_INFO_KEYS,
                result_keys=DEFAULT_RESULT_KEYS):
    """Lists trials in the directory subtree starting at the given path.

    Args:
        experiment_path (str): Directory where trials are located.
            Corresponds to Experiment.local_dir/Experiment.name.
        sort (str): Key to sort by.
        info_keys (list): Keys that are displayed.
        result_keys (list): Keys of last result that are displayed.

    Raises:
        KeyError: If ``sort`` is given but is not one of the displayed
            columns.
    """
    _check_tabulate()
    experiment_state = _get_experiment_state(
        experiment_path, exit_on_fail=True)

    checkpoint_dicts = [
        flatten_dict(g) for g in experiment_state["checkpoints"]
    ]
    checkpoints_df = pd.DataFrame(checkpoint_dicts)

    # Only keep the requested columns that actually exist in the data.
    result_keys = ["last_result:{}".format(k) for k in result_keys]
    col_keys = [
        k for k in list(info_keys) + result_keys if k in checkpoints_df
    ]
    checkpoints_df = checkpoints_df[col_keys]

    if "last_update_time" in checkpoints_df:
        # NOTE(review): this option name is pandas-version dependent; confirm
        # it exists in the pandas release this project pins.
        with pd.option_context('mode.use_inf_as_null', True):
            datetime_series = checkpoints_df["last_update_time"].dropna()

        datetime_series = datetime_series.apply(
            lambda t: datetime.fromtimestamp(t).strftime(TIMESTAMP_FORMAT))
        checkpoints_df["last_update_time"] = datetime_series

    if "logdir" in checkpoints_df:
        # logdir often too verbose to view in table, so drop experiment_path.
        # regex=False: the path must be matched literally, otherwise
        # characters such as "." or "+" in it are read as regex syntax.
        checkpoints_df["logdir"] = checkpoints_df["logdir"].str.replace(
            experiment_path, '', regex=False)

    if sort:
        if sort not in checkpoints_df:
            raise KeyError("Sort Index '{}' not in: {}".format(
                sort, list(checkpoints_df)))
        checkpoints_df = checkpoints_df.sort_values(by=sort)

    print_format_output(checkpoints_df)
def on_result(self, result):
    """Log the numeric entries of ``result`` to wandb.

    On the first result carrying a non-empty ``"config"``, every config key
    for which ``wandb.config.get`` returns ``None`` is copied into
    ``wandb.config``, and the config is remembered so this happens once.
    Bookkeeping keys are then dropped, the remainder is flattened with "/"
    as delimiter, and only values that are numbers are logged.

    Args:
        result (dict): Training result to log.
    """
    config = result.get("config")
    if config and self._config is None:
        for name, setting in config.items():
            if wandb.config.get(name) is None:
                wandb.config[name] = setting
        self._config = config

    loggable = result.copy()
    for dropped in ("done", "config", "pid", "timestamp"):
        loggable.pop(dropped, None)

    flat = flatten_dict(loggable, delimiter="/")
    metrics = {
        key: value
        for key, value in flat.items()
        if isinstance(value, numbers.Number)
    }
    wandb.log(metrics)
def testNestedResults(self):
    # Checks that nested results reach the scheduler and the search
    # algorithm with (at least) the flattened keys, that the trial keeps the
    # nested form, and that stop criteria must name a full "/"-joined path.
    def create_result(i):
        return {"test": {"1": {"2": {"3": i, "4": False}}}}

    expected_keys = set(flatten_dict(create_result(0)))

    class _MockScheduler(FIFOScheduler):
        results = []

        def on_trial_result(self, trial_runner, trial, result):
            self.results += [result]
            return TrialScheduler.CONTINUE

        def on_trial_complete(self, trial_runner, trial, result):
            self.complete_result = result

    def train(config, reporter):
        for i in range(100):
            reporter(**create_result(i))

    algo = _MockSuggestionAlgorithm()
    scheduler = _MockScheduler()
    analysis = tune.run(
        train,
        scheduler=scheduler,
        search_alg=algo,
        stop={
            "test/1/2/3": 20
        })
    [trial] = analysis.trials

    self.assertEqual(trial.status, Trial.TERMINATED)
    self.assertEqual(trial.last_result["test"]["1"]["2"]["3"], 20)
    self.assertEqual(trial.last_result["test"]["1"]["2"]["4"], False)
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 21)

    self.assertEqual(len(scheduler.results), 20)
    scheduler_ok = all(
        set(result) >= expected_keys for result in scheduler.results)
    self.assertTrue(scheduler_ok)
    self.assertTrue(set(scheduler.complete_result) >= expected_keys)

    self.assertEqual(len(algo.results), 20)
    algo_ok = all(set(result) >= expected_keys for result in algo.results)
    self.assertTrue(algo_ok)

    with self.assertRaises(TuneError):
        [trial] = tune.run(train, stop={"1/2/3": 20})
    with self.assertRaises(TuneError):
        [trial] = tune.run(train, stop={"test": 1}).trials
def on_result(self, result):
    """Write ``result`` to the CSV file, one row per unrolled element.

    The columns named in ``result['unroll_columns']`` are iterated in
    lockstep and each step becomes one CSV row. The remaining entries of the
    result (without ``"config"``) are added to the row whose index equals the
    length of the first unrolled column minus one. The CSV writer is created
    from the flattened keys of the first result; the header is written then
    unless ``self._continuing`` is set.

    Args:
        result (dict): Training result; must contain ``'unroll_columns'``
            and an entry for every column named there.
    """
    tmp = result.copy()
    tmp.pop("config", None)
    flat = flatten_dict(tmp, delimiter="/")

    if self._csv_out is None:
        self._csv_out = csv.DictWriter(self._file, flat.keys())
        if not self._continuing:
            self._csv_out.writeheader()

    unroll_names = tmp['unroll_columns']
    columns_to_unroll = [tmp[name] for name in unroll_names]
    for index, values in enumerate(zip(*columns_to_unroll)):
        row = dict(zip(unroll_names, values))
        if index == len(columns_to_unroll[0]) - 1:
            # Writing the additional information in the last row
            extras = {
                key: value
                for key, value in tmp.items() if key not in unroll_names
            }
            row.update(**extras)
        self._csv_out.writerow(row)
    self._file.flush()
def _get_trial_info(trial, parameters, metrics, include_error_data=False):
    """Build one table row describing a trial.

    The row layout is:
    name | ID | status | loc | # failures | error_file | params... | metrics...
    where the failure count and error file are only present when
    ``include_error_data`` is set.

    Args:
        trial (Trial): Trial to get information for.
        parameters (List[str]): Names of trial parameters to include; looked
            up in the flattened last result under ``CONFIG_PREFIX + name``.
        metrics (List[str]): Names of metrics to include.
        include_error_data (bool): Include error file and # of failures.

    Returns:
        list: Row values; parameters or metrics missing from the last result
            appear as ``None``.
    """
    last = flatten_dict(trial.last_result)
    row = [str(trial), trial.trial_id, trial.status]
    row.append(_location_str(last.get(HOSTNAME), last.get(PID)))
    if include_error_data:
        # TODO(ujvl): File path is too long to display in a single row.
        row.extend([trial.num_failures, trial.error_file])
    row.extend(last.get(CONFIG_PREFIX + name) for name in parameters)
    row.extend(last.get(name) for name in metrics)
    return row
def on_result(self, result):
    """Append ``result`` as one CSV row, pickling selected value types.

    The ``"config"`` entry is dropped and the rest is flattened with "/" as
    delimiter. The CSV writer is created from the keys of the first result;
    the header is written then unless ``self._continuing`` is set. Keys that
    are not among the writer's field names are skipped. Values that are
    instances of ``self.pickle_types`` are pickled and stored as base64 text.

    Args:
        result (dict): Training result to log.
    """
    without_config = result.copy()
    without_config.pop("config", None)
    flat = flatten_dict(without_config, delimiter="/")

    if self._csv_out is None:
        self._csv_out = csv.DictWriter(self._file, flat.keys())
        if not self._continuing:
            self._csv_out.writeheader()

    row = {}
    for key, value in flat.items():
        if key not in self._csv_out.fieldnames:
            continue
        if isinstance(value, self.pickle_types):
            pickled = pickle.dumps(value)
            value = codecs.encode(pickled, "base64").decode()
        row[key] = value
    self._csv_out.writerow(row)
    self._file.flush()
def on_result(self, result):
    """Write the scalar entries of ``result`` as TensorFlow summaries.

    Bookkeeping keys are dropped, the remainder is flattened with "/" as
    delimiter, and every value whose exact type is listed in
    ``VALID_SUMMARY_TYPES`` is written under the ``ray/tune/`` prefix using
    the existing file writer, which is flushed afterwards.

    Args:
        result (dict): Training result; must contain ``TRAINING_ITERATION``
            when ``TIMESTEPS_TOTAL`` is missing or falsy.
    """
    with tf.device("/CPU:0"):
        with self._file_writer.as_default():
            step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]

            loggable = result.copy()
            for dropped in ("config", "pid", "timestamp", TIME_TOTAL_S,
                            TRAINING_ITERATION):
                loggable.pop(dropped, None)  # not useful to log these

            prefix = ["ray", "tune"]
            flat_result = flatten_dict(loggable, delimiter="/")
            for attr, value in flat_result.items():
                if type(value) not in VALID_SUMMARY_TYPES:
                    continue
                tag = "/".join(prefix + [attr])
                tf.summary.scalar(tag, value, step=step)
    self._file_writer.flush()
def on_result(self, result):
    """Write the scalar entries of ``result`` through ``add_scalar``.

    Bookkeeping keys are dropped, the remainder is flattened with "/" as
    delimiter, and every value whose exact type is listed in
    ``VALID_SUMMARY_TYPES`` is written under the ``ray/tune/`` prefix. The
    written tag-to-value mapping is kept in ``self.last_result`` and the
    writer is flushed.

    Args:
        result (dict): Training result; must contain ``TRAINING_ITERATION``
            when ``TIMESTEPS_TOTAL`` is missing or falsy.
    """
    step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]

    loggable = result.copy()
    for dropped in ("config", "pid", "timestamp", TIME_TOTAL_S,
                    TRAINING_ITERATION):
        loggable.pop(dropped, None)  # not useful to log these

    prefix = ["ray", "tune"]
    valid_result = {}
    for attr, value in flatten_dict(loggable, delimiter="/").items():
        if type(value) in VALID_SUMMARY_TYPES:
            valid_result["/".join(prefix + [attr])] = value

    for tag, value in valid_result.items():
        self._file_writer.add_scalar(tag, value, global_step=step)

    self.last_result = valid_result
    self._file_writer.flush()
def on_result(self, result):
    """Log the numeric entries of ``result`` to wandb.

    On the first result carrying a non-empty ``"config"``, the config is
    passed to ``make_dict_items_yaml_representable``, every key for which
    ``wandb.config.get`` returns ``None`` is copied into ``wandb.config``,
    and the config is remembered so this happens once. Bookkeeping keys are
    then dropped, the remainder is flattened with "/" as delimiter, and only
    values that are numbers are logged, using ``"timesteps_total"`` (if
    present) as the step.

    Args:
        result (dict): Training result to log.
    """
    tmp = result.copy()

    config = tmp.get("config")
    if config and self._config is None:
        make_dict_items_yaml_representable(config)
        for name in config.keys():
            if wandb.config.get(name) is None:
                wandb.config[name] = config[name]
        self._config = config

    for dropped in ("done", "config", "pid", "timestamp"):
        tmp.pop(dropped, None)

    flat = flatten_dict(tmp, delimiter="/")
    metrics = {
        key: value
        for key, value in flat.items()
        if isinstance(value, numbers.Number)
    }
    wandb.log(metrics, step=tmp.get("timesteps_total", None))
def _read_experiment(self, experiment_state):
    """Load progress, directories, checkpoint files and params per trial.

    For every checkpoint that has a ``logdir``, the following attributes are
    filled in, keyed by the checkpoint's experiment tag:

    * ``self.progress``: DataFrame read from the trial's ``progress.csv``.
    * ``self.exp_directories``: absolute path of the trial directory.
    * ``self.checkpoint_directories``: path of a ``.pt``/``.pth`` file inside
      the greatest-named ``checkpoint*`` entry, or ``""`` if there is none.
    * ``self.params``: parsed content of the trial's ``params.json``.

    Args:
        experiment_state (dict): Experiment state holding a ``"checkpoints"``
            list.
    """
    import json

    checkpoint_dicts = [
        flatten_dict(g) for g in experiment_state["checkpoints"]
    ]

    for exp in checkpoint_dicts:
        if exp.get("logdir", None) is None:
            continue

        exp_dir = os.path.basename(exp["logdir"])
        tag = exp["experiment_tag"]
        trial_dir = os.path.abspath(
            os.path.join(self.experiment_path, exp_dir))

        self.progress[tag] = pd.read_csv(
            os.path.join(trial_dir, "progress.csv"))
        self.exp_directories[tag] = trial_dir

        # Figure out checkpoint file (.pt or .pth) if it exists. The
        # directory part of each pattern is escaped so that glob characters
        # in a trial directory name (e.g. "[") are matched literally. This
        # avoids changing the process working directory, which would break
        # a relative ``experiment_path`` for every trial after the first.
        checkpoint_file = ""
        checkpoint_dirs = glob.glob(
            os.path.join(glob.escape(trial_dir), "checkpoint*"))
        if checkpoint_dirs:
            # NOTE(review): max() compares names as strings; confirm that
            # this picks the intended checkpoint for the naming scheme used.
            latest = glob.escape(max(checkpoint_dirs))
            candidates = glob.glob(os.path.join(latest, "*.pt"))
            candidates += glob.glob(os.path.join(latest, "*.pth"))
            if candidates:
                checkpoint_file = candidates[0]
        self.checkpoint_directories[tag] = checkpoint_file

        # Read in the configs for this experiment
        with open(os.path.join(trial_dir, "params.json")) as f:
            self.params[tag] = json.load(f)
def on_result(self, result):
    """Write hyperparameters (once) and scalar results as TF summaries.

    The file writer is created on first use. Until ``self._hp_logged`` is
    set, the trial's evaluated parameters (if any) are logged through the
    hparams plugin; a failure there is logged and otherwise ignored.
    Bookkeeping keys are then dropped from the result, the remainder is
    flattened with "/" as delimiter, and every value whose exact type is
    listed in ``VALID_SUMMARY_TYPES`` is written under the ``ray/tune/``
    prefix. The writer is flushed afterwards.

    Args:
        result (dict): Training result; must contain ``TRAINING_ITERATION``
            when ``TIMESTEPS_TOTAL`` is missing or falsy.
    """
    if self._file_writer is None:
        from tensorflow.python.eager import context
        from tensorboard.plugins.hparams import api as hp
        self._context = context
        self._file_writer = tf.summary.create_file_writer(self.logdir)

    with tf.device("/CPU:0"):
        with tf.summary.record_if(True), self._file_writer.as_default():
            step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]
            loggable = result.copy()

            if not self._hp_logged:
                if self.trial and self.trial.evaluated_params:
                    try:
                        hp.hparams(
                            self.trial.evaluated_params,
                            trial_id=self.trial.trial_id)
                    except Exception as exc:
                        logger.error("HParams failed with %s", exc)
                self._hp_logged = True

            for dropped in ("config", "pid", "timestamp", TIME_TOTAL_S,
                            TRAINING_ITERATION):
                loggable.pop(dropped, None)  # not useful to log these

            prefix = ["ray", "tune"]
            flat_result = flatten_dict(loggable, delimiter="/")
            for attr, value in flat_result.items():
                if type(value) not in VALID_SUMMARY_TYPES:
                    continue
                tag = "/".join(prefix + [attr])
                tf.summary.scalar(tag, value, step=step)
    self._file_writer.flush()
def list_trials(experiment_path,
                sort=None,
                output=None,
                filter_op=None,
                info_keys=DEFAULT_EXPERIMENT_INFO_KEYS,
                result_keys=DEFAULT_RESULT_KEYS):
    """Lists trials in the directory subtree starting at the given path.

    Args:
        experiment_path (str): Directory where trials are located.
            Corresponds to Experiment.local_dir/Experiment.name.
        sort (str): Key to sort by.
        output (str): Name of file where output is saved.
        filter_op (str): Filter operation in the format
            "<column> <operator> <value>".
        info_keys (list): Keys that are displayed.
        result_keys (list): Keys of last result that are displayed.

    Raises:
        KeyError: If ``sort`` is given but is not one of the displayed
            columns.
        ValueError: If the filtered column has an unsupported dtype, or
            ``output`` has an unsupported file extension.
    """
    _check_tabulate()
    experiment_state = _get_experiment_state(
        experiment_path, exit_on_fail=True)

    checkpoint_dicts = [
        flatten_dict(g) for g in experiment_state["checkpoints"]
    ]
    checkpoints_df = pd.DataFrame(checkpoint_dicts)

    # Only keep the requested columns that actually exist in the data.
    result_keys = ["last_result:{}".format(k) for k in result_keys]
    col_keys = [
        k for k in list(info_keys) + result_keys if k in checkpoints_df
    ]
    checkpoints_df = checkpoints_df[col_keys]

    if "last_update_time" in checkpoints_df:
        # NOTE(review): this option name is pandas-version dependent; confirm
        # it exists in the pandas release this project pins.
        with pd.option_context("mode.use_inf_as_null", True):
            datetime_series = checkpoints_df["last_update_time"].dropna()

        datetime_series = datetime_series.apply(
            lambda t: datetime.fromtimestamp(t).strftime(TIMESTAMP_FORMAT))
        checkpoints_df["last_update_time"] = datetime_series

    if "logdir" in checkpoints_df:
        # logdir often too verbose to view in table, so drop experiment_path.
        # regex=False: the path must be matched literally, otherwise
        # characters such as "." or "+" in it are read as regex syntax.
        checkpoints_df["logdir"] = checkpoints_df["logdir"].str.replace(
            experiment_path, '', regex=False)

    if filter_op:
        col, op, val = filter_op.split(' ')
        col_type = checkpoints_df[col].dtype
        # Convert the textual value so it compares against the column dtype.
        if is_numeric_dtype(col_type):
            val = float(val)
        elif is_string_dtype(col_type):
            val = str(val)
        # TODO(Andrew): add support for datetime and boolean
        else:
            raise ValueError("Unsupported dtype for '{}': {}".format(
                val, col_type))
        op = OPERATORS[op]
        filtered_index = op(checkpoints_df[col], val)
        checkpoints_df = checkpoints_df[filtered_index]

    if sort:
        if sort not in checkpoints_df:
            raise KeyError("Sort Index '{}' not in: {}".format(
                sort, list(checkpoints_df)))
        checkpoints_df = checkpoints_df.sort_values(by=sort)

    print_format_output(checkpoints_df)

    if output:
        file_extension = os.path.splitext(output)[1].lower()
        if file_extension in (".p", ".pkl", ".pickle"):
            checkpoints_df.to_pickle(output)
        elif file_extension == ".csv":
            checkpoints_df.to_csv(output, index=False)
        else:
            raise ValueError("Unsupported filetype: {}".format(output))
        print("Output saved at:", output)