Example #1
def execute(config_path, training, version=None, render=False, debug=False, profile=False,
            random=False):
    # load config
    config = load_config(config_path, version=version, render=render, debug=debug,
                         training=training)

    # run each evaluation
    error = False
    for evaluation_config in config:
        try:
            # create trainer
            trainer = load_trainer(evaluation_config)

            # perform evaluation
            trainer.execute(render=render, profile=profile, random=random)
        except (KeyboardInterrupt, SystemExit, bdb.BdbQuit):
            log_warning("Evaluation halted by request.")
            break
        except Exception as e:
            log_error(f"Evaluation failed: {e}")
            traceback.print_exc()
            error = True
            break

    # allow local runs to keep tensorboard alive
    if config.local and not error:
        if config.get("tensorboard", False):
            log("Experiment complete, Tensorboard still alive.  Press any key to terminate...")
        else:
            log("Experiment complete.  Press any key to terminate...")
        getch()
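
A minimal invocation sketch for the entry point above; the config path and flag values are hypothetical, and the helpers it calls (load_config, load_trainer, getch) are assumed to come from the surrounding package.

if __name__ == "__main__":
    # hypothetical call: run local training with profiling enabled
    execute("configs/cartpole.yaml", training=True, profile=True)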
Example #2
    def _add_samples(self, samples, single=False):
        for key, values in samples.items():
            values = np.array(values)
            if single:
                values = values[np.newaxis]
            samples[key] = values

        # make sure sample counts are equal for all keys
        samples_counts = np.array(
            [len(values) for values in samples.values()])
        samples_count = samples_counts[0]
        assert samples_count > 0
        assert np.all(samples_counts == samples_count)
        original_sample_count = samples_count

        # make sure sample buffers are initialized
        if self.samples is None:
            self.samples = {}
            for key, values in samples.items():
                value_shape = np.shape(values[0])
                value_dtype = values.dtype.base
                if self.size is None:
                    # unbounded buffers
                    samples_shape = (0, ) + value_shape
                else:
                    # bounded buffers
                    samples_shape = (self.size, ) + value_shape
                self.samples[key] = np.zeros(samples_shape, dtype=value_dtype)

        # get storage indexes for samples
        if self.size is not None:
            idxs = self._get_slice(samples_count)
            if len(idxs) > 0:
                samples_count = len(idxs)
            else:
                available = self.size - self.sample_count()
                log_warning(
                    f"Unable to store any of the {original_sample_count} samples in buffer"
                    f"  |  Available: {available} / {self.size}")
                return 0
        self._count += samples_count

        for key, values in samples.items():
            # make sure samples are compatible with this buffer
            if key not in self.samples:
                if key not in self._missing_keys:
                    self._missing_keys[key] = True
                    log_warning(
                        f"Sample key not found in buffer: {key} {self._missing_keys}"
                    )
                continue

            # add sample values for key
            if self.size is None:
                self.samples[key] = np.concatenate([self.samples[key], values])
            else:
                np.put(self.samples[key], idxs, values[:samples_count])

        return samples_count
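
The method above supports two storage modes; here is a self-contained sketch of just that core idea, assuming scalar samples and a simple modulo ring standing in for _get_slice.

import numpy as np

def store(buffer, values, size=None, count=0):
    values = np.asarray(values)
    if size is None:
        # unbounded buffer: grow by concatenation
        return np.concatenate([buffer, values]), count + len(values)
    # bounded buffer: write into preallocated rows, wrapping around
    idxs = (count + np.arange(len(values))) % size
    buffer[idxs] = values
    return buffer, count + len(values)

buf = np.zeros((4,))
buf, n = store(buf, [1, 2, 3], size=4)
buf, n = store(buf, [4, 5], size=4, count=n)  # wraps over the oldest row
print(buf)  # [5. 2. 3. 4.]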
Example #3
    def encipher(self, value):
        if is_collection(value):
            # TODO omit Nones...
            return [self.encipher(subvalue) for subvalue in value]
        elif isinstance(value, str):
            return self.word_to_ids.get(value, None)
        else:
            log_warning(
                f"Unknown vocabulary word type: {value} ({type(value)})")
            return None
Example #4
    def decipher(self, value):
        if is_collection(value):
            # TODO omit Nones...
            return [self.decipher(subvalue) for subvalue in value]
        elif isinstance(value, int) or np.isscalar(value):
            if value < self.size:
                return self.words[value]
        else:
            log_warning(
                f"Unknown vocabulary word ID type: {value} ({type(value)})")
            return None
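
A round-trip sketch for Examples #3 and #4, using a hypothetical TinyVocabulary stub that supplies the attributes the real class relies on (words, size, word_to_ids); is_collection is replaced with a plain list/tuple check and the warning paths are dropped.

class TinyVocabulary:
    def __init__(self, words):
        self.words = list(words)
        self.size = len(self.words)
        self.word_to_ids = {w: i for i, w in enumerate(self.words)}

    def encipher(self, value):
        if isinstance(value, (list, tuple)):
            return [self.encipher(v) for v in value]
        return self.word_to_ids.get(value, None)

    def decipher(self, value):
        if isinstance(value, (list, tuple)):
            return [self.decipher(v) for v in value]
        return self.words[value] if value < self.size else None

vocab = TinyVocabulary(["the", "cat", "sat"])
print(vocab.encipher(["the", "cat", "dog"]))  # [0, 1, None]  ("dog" is out of vocabulary)
print(vocab.decipher([0, 1, 7]))              # ['the', 'cat', None]  (7 is out of range)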
Example #5
    def add_simple_summary(self,
                           name,
                           query=None,
                           allow_overwrite=False,
                           **kwargs):
        query = query or DEFAULT_EXPERIMENT_QUERY
        summary_results = self.get_summary_results(query)
        tag = self.summary_scope(name, query)

        if not allow_overwrite and tag in summary_results.simple_summaries:
            log_warning(f"Overwriting simple summary value: {tag}  "
                        "(Use set_simple_value to avoid warning.)")

        summary_results.simple_summaries[tag] = tf.Summary.Value(tag=tag,
                                                                 **kwargs)
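
For context, a sketch of the TF1 summary protobuf that the **kwargs above feed into; the tag and scalar are hypothetical, and simple_value is the usual keyword for scalar summaries.

import tensorflow as tf  # TF 1.x API

# a scalar summary entry, like the ones stored in simple_summaries above
value = tf.Summary.Value(tag="evaluation/reward", simple_value=123.4)
summary = tf.Summary(value=[value])
# a tf.summary.FileWriter created elsewhere would then record it:
# writer.add_summary(summary, global_step=step)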
Example #6
    def _load_evaluations(self):
        self.evaluations = None
        self.current_evaluation = 0
        self.current_sweep = None

        # get evaluation generation params
        sweeps = self.properties.pop("sweeps", None)
        seeds = self.properties.pop("seeds", None)

        # allow multiple evaluations with different seeds
        if seeds:
            try:
                if sweeps is None:
                    # default to be overridden
                    self.properties["seed"] = 1

                    if isinstance(seeds, list):
                        self.evaluations = [[["seed", seed]] for seed in seeds]
                    else:
                        seeds = int(seeds)

                        M = 0xffffffff
                        self.evaluations = [[["seed", np.random.randint(M)]] for _ in range(seeds)]
                        self.properties["seed"] = 1
                else:
                    log_warning("Currently 'seeds' is ignored when 'sweeps' config is set")
            except ValueError:
                log_warning(f"The config parameter 'seeds' must be int or list. (Found: {seeds})")

        # combine sweeps into evaluations
        if sweeps:
            def _build_combinations(d):
                params = list(d.keys())
                values = list(d.values())
                combinations = itertools.product(*values)
                return [list(zip(params, combination)) for combination in combinations]

            # build sweeps evaluations from dict or list of dicts
            if isinstance(sweeps, list):
                self.evaluations = [c for d in sweeps for c in _build_combinations(d)]
            elif isinstance(sweeps, dict):
                self.evaluations = _build_combinations(sweeps)

        self.num_evaluations = len(self.evaluations) if self.evaluations else 1
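
A quick illustration of what the _build_combinations helper produces for a small sweeps dict (the hyperparameter names are hypothetical); each evaluation becomes a list of (param, value) pairs drawn from the cartesian product.

import itertools

def build_combinations(d):
    params = list(d.keys())
    values = list(d.values())
    return [list(zip(params, combo)) for combo in itertools.product(*values)]

sweeps = {"learning_rate": [1e-3, 1e-4], "batch_size": [32, 64]}
for evaluation in build_combinations(sweeps):
    print(evaluation)
# [('learning_rate', 0.001), ('batch_size', 32)]
# [('learning_rate', 0.001), ('batch_size', 64)]
# [('learning_rate', 0.0001), ('batch_size', 32)]
# [('learning_rate', 0.0001), ('batch_size', 64)]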
Example #7
def get_gpu_memory():
    mem = MemoryStatus()
    try:
        result = shell_call(['nvidia-smi', '-q', '-d', 'MEMORY'],
                            response_type="text")
        lines = result.split("FB Memory Usage")[1].split("\n")[1:4]
        for line in lines:
            parts = line.split(":")
            label = parts[0].strip().lower()
            value = parts[1].strip()
            if label == "total":
                mem.total = value
            elif label == "used":
                mem.used = value
            elif label == "free":
                mem.available = value
    except Exception as e:
        log_warning(f"Failed to collect GPU information: {e}")
    return mem
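
A self-contained sketch of the parsing above, run against a hard-coded fragment of nvidia-smi -q -d MEMORY output (the figures are invented); the real function obtains this text via shell_call.

sample = """FB Memory Usage
        Total                             : 11441 MiB
        Used                              : 1987 MiB
        Free                              : 9454 MiB
"""

lines = sample.split("FB Memory Usage")[1].split("\n")[1:4]
mem = {}
for line in lines:
    label, value = (part.strip() for part in line.split(":"))
    mem[label.lower()] = value
print(mem)  # {'total': '11441 MiB', 'used': '1987 MiB', 'free': '9454 MiB'}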
Example #8
    def _start_checkpoints(self):
        # check if there is a checkpoint
        var_list = None
        load_dir = os.path.dirname(self.load_path)
        load_checkpoint = tf.train.latest_checkpoint(load_dir)

        # check compatibility
        if self.debugging and load_checkpoint:
            model_variables = tf.get_collection_ref(tf.GraphKeys.GLOBAL_VARIABLES)
            checkpoint_variables = tf.train.list_variables(load_checkpoint)
            model_var_names = set()
            # var_list = set()
            compatible_var_names = set()
            for model_variable in model_variables:
                model_var_name = model_variable.name.split(":")[0]
                model_var_names.add(model_var_name)
                model_var_shape = model_variable.shape
                for checkpoint_var_name, checkpoint_var_shape in checkpoint_variables:
                    if model_var_name == checkpoint_var_name:
                        if model_var_shape == checkpoint_var_shape:
                            # var_list.add(model_variable)
                            compatible_var_names.add(model_var_name)
                            break
            missing_var_names = model_var_names.difference(compatible_var_names)
            checkpoint_var_names = set([name for name, _ in checkpoint_variables])
            unused_variables = checkpoint_var_names.difference(compatible_var_names)
            if len(missing_var_names) > 0 or len(unused_variables) > 0:
                log_warning(f"\nIncompatible checkpoint file detected: {load_checkpoint}")
                load_checkpoint = None
                if len(missing_var_names) > 0:
                    var_str = "\n * ".join(missing_var_names)
                    log_warning(f"\nMissing model variables from checkpoint:\n * {var_str}")
                if len(unused_variables) > 0:
                    var_str = "\n * ".join(unused_variables)
                    log_warning(f"\nUnused checkpoint variables by model:\n * {var_str}")

        # TODO - Saver(max_to_keep=4, keep_checkpoint_every_n_hours=2, ...)
        self.saver = tf.train.Saver(var_list=var_list)

        # prepare save directory
        if self.save_path is not None:
            save_dir = os.path.dirname(self.save_path)
            os.makedirs(save_dir, exist_ok=True)
            self.dirty_meta_graph = True

        # load any previously saved data for the current version
        return self.load(load_checkpoint)
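
The compatibility check above reduces to set arithmetic over variable names and shapes; a TensorFlow-free sketch with hypothetical variable names.

model_vars = {"dense/kernel": (128, 10), "dense/bias": (10,), "conv/kernel": (3, 3, 32)}
checkpoint_vars = {"dense/kernel": (128, 10), "dense/bias": (10,), "old_layer/kernel": (64,)}

compatible = {name for name, shape in model_vars.items()
              if checkpoint_vars.get(name) == shape}
missing = set(model_vars) - compatible      # model variables absent or mismatched in the checkpoint
unused = set(checkpoint_vars) - compatible  # checkpoint variables the model never uses

print(sorted(missing))  # ['conv/kernel']
print(sorted(unused))   # ['old_layer/kernel']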
Example #9
    def _start_evaluation(self):
        message = f"Starting Evaluation: {self.current_evaluation + 1} / {self.num_evaluations}"
        warning_message = False
        evaluation_props = {}
        print()

        if self.evaluations:
            # copy params for this evaluation
            self.current_properties = copy.deepcopy(self.properties)
            evaluation_sweep = self.evaluations[self.current_evaluation]
            self.current_sweep = dict(evaluation_sweep)
            evaluation_props["Sweep"] = self.current_sweep

            def _recursive_replace(node, param, value, path=None):
                num_replaced_values = 0
                if isinstance(node, list):
                    for i in range(len(node)):
                        v_path = f"{path}[{i}]"
                        num_replaced_values += _recursive_replace(node[i], param, value, v_path)
                elif isinstance(node, dict):
                    for k, v in node.items():
                        v_path = k if not path else f"{path}.{k}"
                        if v_path.endswith(param):
                            node[k] = value
                            num_replaced_values += 1
                        else:
                            num_replaced_values += _recursive_replace(v, param, value, v_path)
                return num_replaced_values

            # set appropriate hyperparams for this sweep evaluation
            for param, value in evaluation_sweep:
                replacements = _recursive_replace(self.current_properties, param, value)
                if replacements == 0:
                    log_warning(f"Sweeps parameter '{param}' not found in config.")
        else:
            self.current_properties = self.properties

        # debugging info
        debug_props = ', '.join([k for k in self.properties.keys() if k.startswith("debug_")])
        if self.debugging:
            if len(debug_props) > 0:
                evaluation_props["Debug"] = f"Enabled:  Using options: {debug_props}"
            else:
                evaluation_props["Debug"] = f"Enabled:  ALTHOUGH NO CONFIG OPTIONS DETECTED!"
                warning_message = True

            # catch segfaults
            from glearn.utils.debug import debug_faults
            debug_faults()
        else:
            if len(debug_props) > 0:
                # warn about ignored debug options
                evaluation_props[f"Debug"] = f"Disabled:  IGNORING OPTIONS: {debug_props}"
                warning_message = True

        # log evaluation sweep info
        message = {message: evaluation_props}
        table_color = "yellow" if warning_message else "white"
        print_tabular(message, grouped=True, color=table_color, bold=True, show_type=False)
        print()

        # init session
        self._init_session()

        # prepare random seed
        self.seed = self.get("seed", 1)
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)

        # config batch size
        self.batch_size = self.get("batch_size", 1)

        # load env or dataset
        self.env = None
        self.dataset = None
        if self.has("env"):
            # make env
            self.env = load_env(self.get("env"))
        elif self.has("dataset"):
            # make dataset
            self.dataset = load_dataset(self)
        if self.env is None and self.dataset is None:
            raise Exception("Failed to find training env or dataset in config")

        # prepare log and save/load paths
        self.log_dir = f"{self.root_log_dir}/{self.current_evaluation + 1}"
        self.summary_path = f"{self.tensorboard_path}/{self.current_evaluation + 1}"
        self.save_path = f"{self.log_dir}/checkpoints/model.ckpt"
        self.load_path = self.save_path

        # create render viewer controller
        self.viewer = load_view_controller(self, render=self.rendering)

        # prepare input/output interfaces, and env
        if self.has_dataset:
            self.input = self.dataset.input
            self.output = self.dataset.output
        elif self.has_env:
            self.env.seed(self.seed)

            self.input = Interface(self.env.observation_space)
            # FIXME - network output should determine if stochastic (distribution) or deterministic
            self.output = Interface(self.env.action_space, deterministic=False)

        # start summary logging and tensorboard
        self._start_summaries()

        # print evaluation info
        self.print_info()
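
A standalone sketch of the _recursive_replace traversal above on a hypothetical nested config; a sweep parameter matches any dotted path that ends with it, so 'optimizer.lr' would hit both optimizers below, while the fully qualified 'policy.optimizer.lr' hits just one.

def recursive_replace(node, param, value, path=None):
    replaced = 0
    if isinstance(node, list):
        for i, item in enumerate(node):
            replaced += recursive_replace(item, param, value, f"{path}[{i}]")
    elif isinstance(node, dict):
        for k, v in node.items():
            v_path = k if not path else f"{path}.{k}"
            if v_path.endswith(param):
                node[k] = value
                replaced += 1
            else:
                replaced += recursive_replace(v, param, value, v_path)
    return replaced

config = {"policy": {"optimizer": {"lr": 1e-3}}, "value": {"optimizer": {"lr": 1e-3}}}
print(recursive_replace(config, "policy.optimizer.lr", 5e-4))  # 1
print(config["policy"]["optimizer"]["lr"])  # 0.0005
print(config["value"]["optimizer"]["lr"])   # 0.001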