import bdb
import traceback


def execute(config_path, training, version=None, render=False, debug=False,
            profile=False, random=False):
    # load config
    config = load_config(config_path, version=version, render=render,
                         debug=debug, training=training)

    # run each evaluation
    error = False
    for evaluation_config in config:
        try:
            # create trainer
            trainer = load_trainer(evaluation_config)

            # perform evaluation
            trainer.execute(render=render, profile=profile, random=random)
        except (KeyboardInterrupt, SystemExit, bdb.BdbQuit):
            log_warning("Evaluation halted by request.")
            break
        except Exception as e:
            log_error(f"Evaluation failed: {e}")
            traceback.print_exc()
            error = True
            break

    # allow local runs to keep tensorboard alive
    if config.local and not error:
        if config.get("tensorboard", False):
            log("Experiment complete, Tensorboard still alive. Press any key to terminate...")
        else:
            log("Experiment complete. Press any key to terminate...")
        getch()
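
# Usage sketch (illustrative; the config path below is hypothetical):
#
#   execute("configs/cartpole.yaml", training=True, render=False)
#
# This runs every evaluation generated by the config; a local run with
# `tensorboard` enabled stays alive afterwards so results can be inspected.
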
def _add_samples(self, samples, single=False):
    # normalize all sample values to arrays (a single sample gains a batch axis)
    for key, values in samples.items():
        values = np.array(values)
        if single:
            values = values[np.newaxis]
        samples[key] = values

    # make sure sample counts are equal for all keys
    samples_counts = np.array([len(values) for values in samples.values()])
    samples_count = samples_counts[0]
    assert samples_count > 0
    assert np.all(samples_counts == samples_count)
    original_sample_count = samples_count

    # make sure sample buffers are initialized
    if self.samples is None:
        self.samples = {}
        for key, values in samples.items():
            value_shape = np.shape(values[0])
            value_dtype = values.dtype.base
            if self.size is None:
                # unbounded buffers start empty and grow by concatenation
                samples_shape = (0,) + value_shape
            else:
                # bounded buffers are preallocated to their full size
                samples_shape = (self.size,) + value_shape
            self.samples[key] = np.zeros(samples_shape, dtype=value_dtype)

    # get storage indexes for samples
    if self.size:
        idxs = self._get_slice(samples_count)
        if len(idxs) > 0:
            samples_count = len(idxs)
        else:
            available = self.size - self.sample_count()
            log_warning(f"Unable to store any of the {original_sample_count} samples in buffer"
                        f" (Available: {available} / {self.size})")
            return 0
    self._count += samples_count

    for key, values in samples.items():
        # make sure samples are compatible with this buffer
        if key not in self.samples:
            if key not in self._missing_keys:
                # warn only once per unknown key
                self._missing_keys[key] = True
                log_warning(f"Sample key not found in buffer: {key}")
            continue

        # add sample values for key
        if self.size is None:
            self.samples[key] = np.concatenate([self.samples[key], values])
        else:
            # assign row-wise at the computed indexes (np.put would write into
            # the flattened array, which is wrong for multi-dimensional values)
            self.samples[key][idxs] = values[:samples_count]
    return samples_count
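
# Standalone sketch (illustrative; not part of the original module) of the
# bounded write path above. `_get_slice` is not shown in this module; here it
# is approximated by a plain circular slice so the row-wise assignment can be
# exercised in isolation.
def _bounded_write_sketch():
    import numpy as np
    size = 4
    buf = np.zeros((size, 2))              # bounded buffer of 2-vectors
    batch = np.arange(6.0).reshape(3, 2)   # three incoming samples
    idxs = np.arange(3) % size             # stand-in for _get_slice(3)
    buf[idxs] = batch[:len(idxs)]          # row-wise, as in _add_samples
    return buf                             # rows 0..2 now hold the batch
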
def encipher(self, value):
    if is_collection(value):
        # TODO omit Nones...
        return [self.encipher(subvalue) for subvalue in value]
    elif isinstance(value, str):
        return self.word_to_ids.get(value, None)
    else:
        log_warning(f"Unknown vocabulary word type: {value} ({type(value)})")
        return None
def decipher(self, value):
    if is_collection(value):
        # TODO omit Nones...
        return [self.decipher(subvalue) for subvalue in value]
    elif isinstance(value, (int, np.integer)):
        if value < self.size:
            return self.words[value]
        # out-of-range IDs map to None
        return None
    else:
        log_warning(f"Unknown vocabulary word ID type: {value} ({type(value)})")
        return None
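
# Round-trip sketch (illustrative; not part of the original module). The
# stand-in class below is hypothetical -- the two methods above rely only on
# the `words`, `word_to_ids`, and `size` attributes it provides, plus the
# package helpers `is_collection` / `log_warning` from the surrounding module.
def _vocabulary_round_trip_sketch():
    class _Vocab:
        words = ["the", "quick", "brown", "fox"]            # id -> word
        word_to_ids = {w: i for i, w in enumerate(words)}   # word -> id
        size = len(words)
        encipher = encipher
        decipher = decipher

    vocab = _Vocab()
    assert vocab.encipher(["the", "fox", "jumps"]) == [0, 3, None]  # OOV -> None
    assert vocab.decipher([0, 3]) == ["the", "fox"]
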
def add_simple_summary(self, name, query=None, allow_overwrite=False, **kwargs):
    query = query or DEFAULT_EXPERIMENT_QUERY
    summary_results = self.get_summary_results(query)
    tag = self.summary_scope(name, query)
    if not allow_overwrite and tag in summary_results.simple_summaries:
        log_warning(f"Overwriting simple summary value: {tag} "
                    "(Use set_simple_value to avoid warning.)")
    summary_results.simple_summaries[tag] = tf.Summary.Value(tag=tag, **kwargs)
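
# Usage sketch (illustrative; not part of the original module): the **kwargs
# are forwarded to tf.Summary.Value, so a scalar metric is recorded via its
# simple_value field -- e.g.:
#
#   self.add_simple_summary("loss", simple_value=0.5)
#   # -> summary_results.simple_summaries["<scope>/loss"] =
#   #        tf.Summary.Value(tag="<scope>/loss", simple_value=0.5)
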
def _load_evaluations(self):
    self.evaluations = None
    self.current_evaluation = 0
    self.current_sweep = None

    # get evaluation generation params
    sweeps = self.properties.pop("sweeps", None)
    seeds = self.properties.pop("seeds", None)

    # allow multiple evaluations with different seeds
    if seeds:
        try:
            if sweeps is None:
                # default seed (overridden per evaluation below)
                self.properties["seed"] = 1
                if isinstance(seeds, list):
                    self.evaluations = [[["seed", seed]] for seed in seeds]
                else:
                    seeds = int(seeds)
                    M = 0xffffffff
                    self.evaluations = [[["seed", np.random.randint(M)]]
                                        for _ in range(seeds)]
            else:
                log_warning("Currently 'seeds' is ignored when 'sweeps' config is set.")
        except ValueError:
            log_warning(f"The config parameter 'seeds' must be an int or list. (Found: {seeds})")

    # combine sweeps into evaluations
    if sweeps:
        def _build_combinations(d):
            params = list(d.keys())
            values = list(d.values())
            combinations = itertools.product(*values)
            return [list(zip(params, combination)) for combination in combinations]

        # build sweeps evaluations from dict or list of dicts
        if isinstance(sweeps, list):
            self.evaluations = [c for d in sweeps for c in _build_combinations(d)]
        elif isinstance(sweeps, dict):
            self.evaluations = _build_combinations(sweeps)

    self.num_evaluations = len(self.evaluations) if self.evaluations else 1
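
# Standalone sketch (illustrative; not part of the original module) of how a
# `sweeps` dict expands into per-evaluation [param, value] pairs via the
# cartesian product, mirroring _build_combinations above.
def _expand_sweeps_sketch():
    import itertools
    sweeps = {"lr": [1e-3, 1e-4], "batch_size": [32, 64]}
    params, values = list(sweeps.keys()), list(sweeps.values())
    evaluations = [list(zip(params, combo)) for combo in itertools.product(*values)]
    # -> 4 evaluations: [("lr", 0.001), ("batch_size", 32)],
    #    [("lr", 0.001), ("batch_size", 64)], [("lr", 0.0001), ("batch_size", 32)], ...
    return evaluations
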
def get_gpu_memory():
    mem = MemoryStatus()
    try:
        result = shell_call(["nvidia-smi", "-q", "-d", "MEMORY"], response_type="text")

        # parse the first three lines (Total / Used / Free) of the
        # "FB Memory Usage" block in the nvidia-smi query output
        lines = result.split("FB Memory Usage")[1].split("\n")[1:4]
        for line in lines:
            parts = line.split(":")
            label = parts[0].strip().lower()
            value = parts[1].strip()
            if label == "total":
                mem.total = value
            elif label == "used":
                mem.used = value
            elif label == "free":
                mem.available = value
    except Exception as e:
        log_warning(f"Failed to collect GPU information: {e}")
    return mem
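
# Standalone sketch (illustrative; not part of the original module) of the
# parsing above, run against a representative `nvidia-smi -q -d MEMORY`
# fragment. Note the [1:4] slice assumes Total/Used/Free are the first three
# lines of the block; some newer driver versions insert a "Reserved" line
# there, which would shift what gets parsed.
def _parse_gpu_memory_sketch():
    sample = ("FB Memory Usage\n"
              "    Total                             : 11178 MiB\n"
              "    Used                              : 815 MiB\n"
              "    Free                              : 10363 MiB\n")
    parsed = {}
    for line in sample.split("FB Memory Usage")[1].split("\n")[1:4]:
        parts = line.split(":")
        parsed[parts[0].strip().lower()] = parts[1].strip()
    return parsed   # {"total": "11178 MiB", "used": "815 MiB", "free": "10363 MiB"}
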
def _start_checkpoints(self):
    # check if there is a checkpoint
    var_list = None
    load_dir = os.path.dirname(self.load_path)
    load_checkpoint = tf.train.latest_checkpoint(load_dir)

    # check compatibility
    if self.debugging and load_checkpoint:
        model_variables = tf.get_collection_ref(tf.GraphKeys.GLOBAL_VARIABLES)
        checkpoint_variables = tf.train.list_variables(load_checkpoint)
        model_var_names = set()
        # var_list = set()
        compatible_var_names = set()
        for model_variable in model_variables:
            model_var_name = model_variable.name.split(":")[0]
            model_var_names.add(model_var_name)
            model_var_shape = model_variable.shape
            for checkpoint_var_name, checkpoint_var_shape in checkpoint_variables:
                if model_var_name == checkpoint_var_name:
                    if model_var_shape == checkpoint_var_shape:
                        # var_list.add(model_variable)
                        compatible_var_names.add(model_var_name)
                    break
        missing_var_names = model_var_names.difference(compatible_var_names)
        checkpoint_var_names = {name for name, _ in checkpoint_variables}
        unused_variables = checkpoint_var_names.difference(compatible_var_names)
        if len(missing_var_names) > 0 or len(unused_variables) > 0:
            log_warning(f"\nIncompatible checkpoint file detected: {load_checkpoint}")
            load_checkpoint = None
            if len(missing_var_names) > 0:
                var_str = "\n * ".join(missing_var_names)
                log_warning(f"\nMissing model variables from checkpoint:\n * {var_str}")
            if len(unused_variables) > 0:
                var_str = "\n * ".join(unused_variables)
                log_warning(f"\nUnused checkpoint variables by model:\n * {var_str}")

    # TODO - Saver(max_to_keep=4, keep_checkpoint_every_n_hours=2, ...)
    self.saver = tf.train.Saver(var_list=var_list)

    # prepare save directory
    if self.save_path is not None:
        save_dir = os.path.dirname(self.save_path)
        os.makedirs(save_dir, exist_ok=True)
    self.dirty_meta_graph = True

    # load any previously saved data for the current version
    return self.load(load_checkpoint)
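
# Standalone sketch (illustrative; not part of the original module) of the
# compatibility check above: a checkpoint variable is only loadable when both
# its name and its shape match the model's variable.
def _checkpoint_compatibility_sketch():
    model_vars = {"dense/kernel": (64, 10), "dense/bias": (10,)}
    ckpt_vars = {"dense/kernel": (64, 10), "old/bias": (10,)}
    compatible = {n for n, s in model_vars.items() if ckpt_vars.get(n) == s}
    missing = set(model_vars) - compatible   # {"dense/bias"} -- not in checkpoint
    unused = set(ckpt_vars) - compatible     # {"old/bias"} -- not in model
    return compatible, missing, unused
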
def _start_evaluation(self):
    message = f"Starting Evaluation: {self.current_evaluation + 1} / {self.num_evaluations}"
    warning_message = False
    evaluation_props = {}
    print()

    if self.evaluations:
        # copy params for this evaluation
        self.current_properties = copy.deepcopy(self.properties)
        evaluation_sweep = self.evaluations[self.current_evaluation]
        self.current_sweep = dict(evaluation_sweep)
        evaluation_props["Sweep"] = self.current_sweep

        def _recursive_replace(node, param, value, path=None):
            num_replaced_values = 0
            if isinstance(node, list):
                for i in range(len(node)):
                    v_path = f"{path}[{i}]"
                    num_replaced_values += _recursive_replace(node[i], param, value, v_path)
            elif isinstance(node, dict):
                for k, v in node.items():
                    v_path = k if not path else f"{path}.{k}"
                    if v_path.endswith(param):
                        node[k] = value
                        num_replaced_values += 1
                    else:
                        num_replaced_values += _recursive_replace(v, param, value, v_path)
            return num_replaced_values

        # set appropriate hyperparams for this sweep evaluation
        for param, value in evaluation_sweep:
            replacements = _recursive_replace(self.current_properties, param, value)
            if replacements == 0:
                log_warning(f"Sweeps parameter '{param}' not found in config.")
    else:
        self.current_properties = self.properties

    # debugging info
    debug_props = ", ".join([k for k in self.properties.keys() if k.startswith("debug_")])
    if self.debugging:
        if len(debug_props) > 0:
            evaluation_props["Debug"] = f"Enabled: Using options: {debug_props}"
        else:
            evaluation_props["Debug"] = "Enabled: ALTHOUGH NO CONFIG OPTIONS DETECTED!"
            warning_message = True

        # catch segfaults
        from glearn.utils.debug import debug_faults
        debug_faults()
    else:
        if len(debug_props) > 0:
            # warn about ignored debug options
            evaluation_props["Debug"] = f"Disabled: IGNORING OPTIONS: {debug_props}"
            warning_message = True

    # log evaluation sweep info
    message = {message: evaluation_props}
    table_color = "yellow" if warning_message else "white"
    print_tabular(message, grouped=True, color=table_color, bold=True, show_type=False)
    print()

    # init session
    self._init_session()

    # prepare random seed
    self.seed = self.get("seed", 1)
    tf.set_random_seed(self.seed)
    np.random.seed(self.seed)

    # config batch size
    self.batch_size = self.get("batch_size", 1)

    # load env or dataset
    self.env = None
    self.dataset = None
    if self.has("env"):
        # make env
        self.env = load_env(self.get("env"))
    elif self.has("dataset"):
        # make dataset
        self.dataset = load_dataset(self)
    if self.env is None and self.dataset is None:
        raise Exception("Failed to find training env or dataset in config")

    # prepare log and save/load paths
    self.log_dir = f"{self.root_log_dir}/{self.current_evaluation + 1}"
    self.summary_path = f"{self.tensorboard_path}/{self.current_evaluation + 1}"
    self.save_path = f"{self.log_dir}/checkpoints/model.ckpt"
    self.load_path = self.save_path

    # create render viewer controller
    self.viewer = load_view_controller(self, render=self.rendering)

    # prepare input/output interfaces, and env
    if self.has_dataset:
        self.input = self.dataset.input
        self.output = self.dataset.output
    elif self.has_env:
        self.env.seed(self.seed)
        self.input = Interface(self.env.observation_space)
        # FIXME - network output should determine if stochastic (distribution) or deterministic
        self.output = Interface(self.env.action_space, deterministic=False)

    # start summary logging and tensorboard
    self._start_summaries()

    # print evaluation info
    self.print_info()
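
# Standalone sketch (illustrative; not part of the original module) of the
# suffix matching used by _recursive_replace above: a sweep parameter matches
# any config path that ends with it, so a short name can target a deeply
# nested value. Note the match is a plain string suffix, so a parameter like
# "lr" would also hit a key such as "decay_lr".
def _suffix_match_sketch():
    def replace(node, param, value, path=None):
        count = 0
        if isinstance(node, dict):
            for k, v in node.items():
                v_path = k if not path else f"{path}.{k}"
                if v_path.endswith(param):
                    node[k] = value
                    count += 1
                else:
                    count += replace(v, param, value, v_path)
        return count

    config = {"trainer": {"optimizer": {"lr": 1e-3}}, "decay": 0.99}
    assert replace(config, "optimizer.lr", 1e-4) == 1
    assert config["trainer"]["optimizer"]["lr"] == 1e-4   # nested value swept
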