def __init__(
            self,
            log_dir,
            output_fname='progress.csv',
            debug: bool = False,
            exp_name=None,
            level: int = 1,  # verbosity level
            use_tensor_board=True,
            verbose=True):
        """
        Initialize a Logger.

        Args:
            log_dir (string): A directory for saving results to. If
                ``None``, defaults to a temp directory of the form
                ``/tmp/experiments/somerandomnumber``.

            output_fname (string): Name for the comma-separated-value file
                containing metrics logged throughout a training run.
                Defaults to ``progress.csv``.

            exp_name (string): Experiment name. If you run multiple training
                runs and give them all the same ``exp_name``, the plotter
                will know to group them. (Use case: if you run the same
                hyperparameter configuration with multiple random seeds, you
                should give them all the same ``exp_name``.)

            debug (bool): If ``True``, enable debug output (root process only).

            level (int): Verbosity level; console output is only printed if
                ``level > 0``.

            use_tensor_board (bool): If ``True``, the root process also logs
                scalars to TensorBoard (written to ``<log_dir>/tb``).

            verbose (bool): Whether the root process prints information to the
                console; worker processes never print.
        """
        self.log_dir = log_dir
        self.debug = debug if proc_id() == 0 else False
        self.level = level
        # only the MPI root process is allowed to print information to console
        self.verbose = verbose if proc_id() == 0 else False

        if proc_id() == 0:
            os.makedirs(self.log_dir, exist_ok=True)
            self.output_file = open(osp.join(self.log_dir, output_fname), 'w')
            atexit.register(self.output_file.close)
            print(
                colorize(f"Logging data to {self.output_file.name}",
                         'cyan',
                         bold=True))
        else:
            self.output_file = None

        self.epoch = 0
        self.first_row = True
        self.log_headers = []
        self.log_current_row = {}
        self.exp_name = exp_name
        self.torch_saver_elements = None

        # Setup tensor board logging if enabled and MPI root process
        self.summary_writer = SummaryWriter(os.path.join(self.log_dir, 'tb')) \
            if use_tensor_board and proc_id() == 0 else None
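
A minimal usage sketch of this constructor (the class name ``Logger`` is taken from the docstring; the call site itself is an assumption, not code from the source):

    logger = Logger(log_dir='/tmp/experiments/demo',
                    exp_name='ppo-seed-0',
                    use_tensor_board=True,
                    verbose=True)
    # Only the MPI root process opens progress.csv and the TensorBoard writer;
    # worker processes end up with output_file=None and summary_writer=None.
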
    def torch_save(self, itr=None):
        """
        Saves the PyTorch model (or models).
        """
        if proc_id() == 0:
            self.log('Save model to disk...')
            assert self.torch_saver_elements is not None, \
                "You first have to set up saving with self.setup_torch_saver"
            fpath = osp.join(self.log_dir, 'torch_save')
            fname = 'model' + ('%d' % itr if itr is not None else '') + '.pt'
            fname = osp.join(fpath, fname)
            os.makedirs(fpath, exist_ok=True)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # We are using a non-recommended way of saving PyTorch models,
                # by pickling whole objects (which are dependent on the exact
                # directory structure at the time of saving) as opposed to
                # just saving network weights. This works sufficiently well
                # for the purposes of Spinning Up, but you may want to do
                # something different for your personal PyTorch project.
                # We use a catch_warnings() context to avoid the warnings about
                # not being able to save the source code.
                torch.save(self.torch_saver_elements, fname)
            self.log('Done.')
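
A hedged usage sketch: ``setup_torch_saver`` is referenced by the assertion above but not shown in this snippet, so its exact signature is an assumption.

    logger.setup_torch_saver(actor_critic)  # e.g. a torch.nn.Module to pickle
    logger.torch_save()       # writes <log_dir>/torch_save/model.pt
    logger.torch_save(itr=5)  # writes <log_dir>/torch_save/model5.pt
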
    def save_state(self, state_dict, itr=None):
        """
        Saves the state of an experiment.

        To be clear: this is about saving *state*, not logging diagnostics.
        All diagnostic logging is separate from this function. This function
        will save whatever is in ``state_dict``---usually just a copy of the
        environment---and the most recent parameters for the model you
        previously set up saving for with ``setup_torch_saver``.

        Call with any frequency you prefer. If you only want to maintain a
        single state and overwrite it at each call with the most recent
        version, leave ``itr=None``. If you want to keep all of the states you
        save, provide unique (increasing) values for ``itr``.

        Args:
            state_dict (dict): Dictionary containing essential elements to
                describe the current state of training.

            itr: An int, or None. Current iteration of training.
        """
        if proc_id() == 0:
            fname = 'state.pkl' if itr is None else 'state%d.pkl' % itr
            try:
                joblib.dump(state_dict, osp.join(self.log_dir, fname))
            except Exception:
                self.log('Warning: could not pickle state_dict.', color='red')
            if self.torch_saver_elements is not None:
                self.torch_save(itr)
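
A short sketch of the two saving modes described in the docstring (the dictionary key is illustrative only):

    logger.save_state({'env': env}, itr=None)  # always overwrites state.pkl
    logger.save_state({'env': env}, itr=3)     # keeps a separate state3.pkl
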
    def save_config(self, config):
        """
        Log an experiment configuration.

        Call this once at the top of your experiment, passing in all important
        config vars as a dict. This will serialize the config to JSON, while
        handling anything which can't be serialized in a graceful way (writing
        as informative a string as possible).

        Example use:

        .. code-block:: python

            logger = EpochLogger(**logger_kwargs)
            logger.save_config(locals())
        """
        if proc_id() == 0:  # only root process logs configurations
            config_json = convert_json(config)
            if self.exp_name is not None:
                config_json['exp_name'] = self.exp_name

            output = json.dumps(config_json,
                                separators=(',', ':\t'),
                                indent=4,
                                sort_keys=True)
            if self.verbose and self.level > 0:
                print(colorize('Run with config:', color='yellow', bold=True))
                print(output)
            with open(osp.join(self.log_dir, "config.json"), 'w') as out:
                out.write(output)
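
For reference, the ``json.dumps`` settings used above produce sorted, tab-separated key/value pairs; a small, self-contained illustration:

    import json
    print(json.dumps({'seed': 0, 'exp_name': 'demo'},
                     separators=(',', ':\t'), indent=4, sort_keys=True))
    # {
    #     "exp_name":	"demo",
    #     "seed":	0
    # }
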
    def close(self):
        """Close the opened output file immediately after training to avoid
        exceeding the limit on open file descriptors, i.e. the error:
        OSError: [Errno 24] Too many open files
        """
        if proc_id() == 0:
            self.output_file.close()
    def __init__(self, log_dir, log_costs=True):

        self.log_dir = log_dir
        self.env = None
        self.ac = None
        self.log_costs = log_costs

        # open returns.csv file at the beginning to avoid disk access errors
        # on our HPC servers...
        if mpi_tools.proc_id() == 0:
            os.makedirs(log_dir, exist_ok=True)
            self.ret_file_name = 'returns.csv'
            self.ret_file = open(os.path.join(log_dir, self.ret_file_name),
                                 'w')
            # Register the close function so it runs on normal program termination
            atexit.register(self.ret_file.close)
            if log_costs:
                self.c_file_name = 'costs.csv'
                self.costs_file = open(os.path.join(log_dir, self.c_file_name),
                                       'w')
                atexit.register(self.costs_file.close)
        else:
            self.ret_file_name = None
            self.ret_file = None
            if log_costs:
                self.c_file_name = None
                self.costs_file = None
Example #7
    def test_mpi_matrix_updates(self):
        """ OnlineMeanStd module is updated with a batch of vector inputs,
            i.e. inputs of shape M x N.
            Note that std dev might differ more than 1e-5 when epochs > 10.
        """
        epochs = 20
        cores = 4
        T = 500
        obs_shape = (6, )

        try:
            if mpi_tools.mpi_fork(n=cores):
                sys.exit()
            pid = mpi_tools.proc_id()
            # print(f'Here is process:', mpi_tools.proc_id())

            # === calculation through online updates
            rms = OnlineMeanStd(shape=obs_shape)
            for ep in range(epochs):
                # shape of batch: T x obs_shape
                obs_batch = self.get_data(T, obs_shape[0], ep, pid)
                rms.update(obs_batch)
            mpi_mean = rms.mean.numpy()
            mpi_std = rms.std.numpy()

            # ===== calculate ground truths
            obs_list = [self.get_data(T, obs_shape[0], ep, pid=i)
                        for i in range(cores)
                        for ep in range(epochs)
                        ]
            obs = np.vstack(obs_list)
            gt_mean = np.mean(obs, axis=0)
            gt_std = np.std(obs, axis=0)

            # if mpi_tools.proc_id() == 0:
            #     print('gt_mean:', gt_mean)
            #     print('mpi_mean:', mpi_mean)
            #     print('gt_std:', gt_std)
            #     print('mpi_std:', mpi_std)
            self.assertTrue(np.allclose(mpi_mean, gt_mean))
            self.assertTrue(np.allclose(mpi_std, gt_std))
            self.assertTrue(self.perform_single_pass(rms, obs_shape))

        # necessary to prevent system exit with error...
        except SystemExit:
            print('Join....')
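
The OnlineMeanStd implementation is not part of this snippet. As a rough illustration of what the test checks, here is a plain-NumPy sketch of the standard chunk-merging rule for means and (population) variances (Chan et al.); it is not the module's actual code:

    import numpy as np

    def combine(mean_a, var_a, n_a, mean_b, var_b, n_b):
        """Merge mean/variance statistics of two data chunks."""
        n = n_a + n_b
        delta = mean_b - mean_a
        mean = mean_a + delta * n_b / n
        m2 = var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / n
        return mean, m2 / n, n

    # Merging per-chunk statistics reproduces the full-batch result:
    x = np.random.randn(1000, 6)
    m, v, n = combine(x[:400].mean(0), x[:400].var(0), 400,
                      x[400:].mean(0), x[400:].var(0), 600)
    assert np.allclose(m, x.mean(0)) and np.allclose(v, x.var(0))
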
    def dump_tabular(self) -> None:
        """
        Write all of the diagnostics from the current iteration.

        Writes both to stdout, and to the output file.
        """
        if proc_id() == 0:
            vals = list()
            self.epoch += 1
            # Print formatted information into console
            key_lens = [len(key) for key in self.log_headers]
            max_key_len = max(15, max(key_lens))
            keystr = '%' + '%d' % max_key_len
            fmt = "| " + keystr + "s | %15s |"
            n_slashes = 22 + max_key_len
            print("-" * n_slashes) if self.verbose and self.level > 0 else None
            for key in self.log_headers:
                val = self.log_current_row.get(key, "")
                valstr = "%8.3g" % val if hasattr(val, "__float__") else val
                if self.verbose and self.level > 0:
                    print(fmt % (key, valstr))
                vals.append(val)
            if self.verbose and self.level > 0:
                print("-" * n_slashes, flush=True)

            # Write into the output file (can be any text file format, e.g. CSV)
            if self.output_file is not None:
                if self.first_row:
                    self.output_file.write(",".join(self.log_headers) + "\n")
                self.output_file.write(",".join(map(str, vals)) + "\n")
                self.output_file.flush()

            if self.summary_writer is not None:
                for k, v in zip(self.log_headers, vals):
                    self.summary_writer.add_scalar(k, v, global_step=self.epoch)
                # Flushes the event file to disk. Call this method to make sure
                # that all pending events have been written to disk.
                self.summary_writer.flush()

        # free logged information in all processes...
        self.log_current_row.clear()
        self.first_row = False
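
A hedged data-flow sketch: some helper (e.g. a ``log_tabular``-style method, not shown in this snippet) is assumed to fill ``log_headers`` and ``log_current_row`` before each dump.

    logger.log_headers = ['Epoch', 'EpRet', 'EpLen']
    logger.log_current_row = {'Epoch': 1, 'EpRet': 123.4, 'EpLen': 200}
    logger.dump_tabular()
    # progress.csv now contains:
    # Epoch,EpRet,EpLen
    # 1,123.4,200
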
Example #9
    def test_mpi_vector_updates(self):
        """Test with vector inputs.
        Note that std dev might differ more than 1e-5 when epochs > 10.
        """
        epochs = 10
        cores = 4
        T = 500
        input_shape = (1,)

        try:
            if mpi_tools.mpi_fork(n=cores):
                sys.exit()
            p = mpi_tools.proc_id()
            # print(f'Here is process:', mpi_tools.proc_id())

            # === calculation through online updates
            rms = OnlineMeanStd(shape=input_shape)
            for ep in range(epochs):
                # shape of batch: T x input_shape
                vector_input = self.get_data(T, input_shape[0], ep, p).flatten()
                rms.update(vector_input)
            mpi_mean = rms.mean.numpy()
            mpi_std = rms.std.numpy()

            # ===== calculate ground truths
            obs_list = [self.get_data(T, input_shape[0], ep, pid=i)
                        for i in range(cores)
                        for ep in range(epochs)
                        ]
            obs = np.vstack(obs_list)
            gt_mean = np.mean(obs, axis=0)
            gt_std = np.std(obs, axis=0)

            # if mpi_tools.proc_id() == 0:
            #     print('gt_mean:', gt_mean)
            #     print('mpi_mean:', mpi_mean)
            #     print('gt_std:', gt_std)
            #     print('mpi_std:', mpi_std)
            self.assertTrue(np.allclose(mpi_mean, gt_mean))
            self.assertTrue(np.allclose(mpi_std, gt_std, rtol=1e-2))
            self.assertTrue(self.perform_single_pass(rms, input_shape))

        # necessary to prevent system exit with error...
        except SystemExit:
            print('Join....')
Example #10
    def test_mpi_version_of_algorithms(self):
        """Run all the specified algorithms with MPI."""
        cores = 4
        if mpi_tools.mpi_fork(n=cores):  # forks the current script and uses MPI
            return  # use return instead of sys.exit() to exit test with 'OK'...
        is_root = mpi_tools.proc_id() == 0
        algs = ['iwpg', 'npg', 'trpo', 'lag-trpo', 'pdo', 'cpo']
        for alg in algs:
            try:
                print(f'Run {alg.upper()}') if is_root else None
                ac, env = self.check_alg(alg,
                                         'HopperBulletEnv-v0',
                                         cores=cores)
                self.assertTrue(isinstance(env, gym.Env))
            except NotImplementedError:
                print('No MPI yet supported...') if is_root else None
        # sleep one second so that all console prints can finish...
        time.sleep(1)
    def eval(self, env, ac, num_evaluations):
        """ Evaluate actor critic module for given number of evaluations.
        """
        self.ac = ac
        self.ac.eval()  # disable exploration noise

        if isinstance(env, gym.Env):
            self.env = env
        elif isinstance(env, str):
            self.env = gym.make(env)
        else:
            raise TypeError('Env is not of type: str, gym.Env')

        size = mpi_tools.num_procs()
        num_local_evaluations = num_evaluations // size
        returns = np.zeros(num_local_evaluations, dtype=np.float32)
        costs = np.zeros(num_local_evaluations, dtype=np.float32)
        ep_lengths = np.zeros(num_local_evaluations, dtype=np.float32)

        for i in range(num_local_evaluations):
            returns[i], ep_lengths[i], costs[i] = self.eval_once()
        # Gather returns from all processes
        # Note: only root process owns valid data...
        returns = list(mpi_tools.gather_and_stack(returns))
        costs = list(mpi_tools.gather_and_stack(costs))

        # now write returns as column into output file...
        if mpi_tools.proc_id() == 0:
            self.write_to_file(self.ret_file, contents=returns)
            print('Saved to:', os.path.join(self.log_dir, self.ret_file_name))
            if self.log_costs:
                self.write_to_file(self.costs_file, contents=costs)
            print(f'Mean Ret: {np.mean(returns)} \t'
                  f'Mean EpLen: {np.mean(ep_lengths)} \t'
                  f'Mean Costs: {np.mean(costs)}')

        self.ac.train()  # back to train mode
        return np.array(returns), np.array(ep_lengths), np.array(costs)
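
A small sketch of the work split above (``mpi_tools.num_procs()`` is assumed to return the MPI world size). Note that the integer division silently drops any remainder:

    num_evaluations, size = 10, 4
    num_local_evaluations = num_evaluations // size  # 2 episodes per process
    total_episodes = num_local_evaluations * size    # 8 episodes run, not 10
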
Example #12
                        help=f'Define the init seed, e.g. {random_seed}')
    parser.add_argument('--search',
                        action='store_true',
                        help='If given search over learning rates.')
    parser.add_argument('--log-dir',
                        type=str,
                        default=default_log_dir,
                        help='Define a custom directory for logging.')

    args, unparsed_args = parser.parse_known_args()
    # Use the number of physical cores by default. If hyper-threaded (logical)
    # CPUs should also be used, enable this via use_number_of_threads=True
    use_number_of_threads = args.cores > physical_cores
    if mpi_tools.mpi_fork(args.cores,
                          use_number_of_threads=use_number_of_threads):
        # Re-launches the current script with workers linked by MPI
        sys.exit()
    print('Unknowns:', unparsed_args) if mpi_tools.proc_id() == 0 else None

    model = Model(alg=args.alg,
                  env_id=args.env,
                  log_dir=args.log_dir,
                  init_seed=args.seed,
                  unparsed_args=unparsed_args,
                  use_mpi=not args.no_mpi)
    model.compile(num_runs=args.runs, num_cores=args.cores)
    model.fit()
    model.eval()
    if args.play:
        model.play()
Example #13
    @staticmethod
    def get_data(M, N, epoch, pid=mpi_tools.proc_id()):
        """Returns a data matrix of shape M x N.
        Note: the default for ``pid`` is evaluated once, when the function is
        defined, so the tests pass the process id explicitly.
        """
        start = pid * 10000 + 4 * epoch
        stop = pid * 10000 + M * N + 4 * epoch
        step = 1
        return 0.001 * np.arange(start, stop, step).reshape((M, N))
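
A worked example of the resulting data layout (the values follow directly from the formula above):

    # get_data(2, 3, epoch=0, pid=0) returns
    #   0.001 * np.arange(0, 6).reshape((2, 3))
    #   == [[0.000, 0.001, 0.002],
    #       [0.003, 0.004, 0.005]]
    # Increasing ``epoch`` by one shifts every entry by 0.004, and process
    # ``pid`` shifts every entry by pid * 10, so per-process batches never overlap.
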
Example #14
    def __init__(
        self,
        actor: str,
        ac_kwargs: dict,
        env_id: str,
        epochs: int,
        logger_kwargs: dict,
        adv_estimation_method: str = 'gae',
        alg='iwpg',
        check_freq: int = 25,
        entropy_coef: float = 0.01,
        gamma: float = 0.99,
        lam: float = 0.95,  # GAE scalar
        lam_c: float = 0.95,  # GAE scalar for cost estimation
        max_ep_len: int = 1000,
        max_grad_norm: float = 0.5,
        num_mini_batches: int = 16,  # used for value network training
        optimizer: str = 'Adam',  # policy optimizer
        pi_lr: float = 3e-4,
        steps_per_epoch: int = 32 * 1000,  # number of global steps per epoch
        target_kl: float = 0.01,
        train_pi_iterations: int = 80,
        train_v_iterations: int = 5,
        trust_region='plain',  # used for easy filtering in plot utils
        use_cost_value_function: bool = False,
        use_entropy: bool = False,
        use_exploration_noise_anneal: bool = False,
        use_kl_early_stopping: bool = False,
        use_linear_lr_decay: bool = True,
        use_max_grad_norm: bool = False,
        use_reward_scaling: bool = True,
        use_reward_penalty: bool = False,
        use_shared_weights: bool = False,
        use_standardized_advantages: bool = False,
        use_standardized_obs: bool = True,
        verbose: bool = True,
        vf_lr: float = 1e-3,
        weight_initialization: str = 'kaiming_uniform',
        save_freq: int = 10,
        seed: int = 0,
        video_freq: int = -1,  # set to positive integer for video recording
        **kwargs  # use to log parameters from child classes
    ):
        """

        Parameters
        ----------
        actor
        ac_kwargs
        env_id
        epochs
        logger_kwargs
        adv_estimation_method
        alg
        check_freq
        entropy_coef
        gamma
        lam
        lam_c
        max_ep_len
        max_grad_norm
        num_mini_batches
        optimizer
        pi_lr
        steps_per_epoch
        target_kl
        train_pi_iterations
        train_v_iterations
        trust_region
        use_cost_value_function
        use_entropy
        use_exploration_noise_anneal
        use_kl_early_stopping
        use_linear_lr_decay
        use_max_grad_norm
        use_reward_scaling
        use_reward_penalty
        use_shared_weights
        use_standardized_advantages
        use_standardized_obs
        verbose
        vf_lr
        weight_initialization
        save_freq
        seed
        video_freq
        kwargs
        """

        # Environment calls
        # TODO: call gym.make with **kwargs (to allow customization)
        self.env = env = gym.make(env_id) if isinstance(env_id, str) else env_id
        # Collect information from the environment if it has a time-limit wrapper
        if hasattr(self.env, '_max_episode_steps'):
            max_ep_len = self.env._max_episode_steps

        self.adv_estimation_method = adv_estimation_method
        self.alg = alg
        self.check_freq = check_freq
        self.entropy_coef = entropy_coef if use_entropy else 0.0
        self.epoch = 0  # iterated in learn method
        self.epochs = epochs
        self.lam = lam
        self.local_steps_per_epoch = steps_per_epoch // mpi_tools.num_procs()
        self.logger_kwargs = logger_kwargs
        self.max_ep_len = max_ep_len
        self.max_grad_norm = max_grad_norm
        self.num_mini_batches = num_mini_batches
        self.pi_lr = pi_lr
        self.save_freq = save_freq
        self.seed = seed
        self.steps_per_epoch = steps_per_epoch
        self.target_kl = target_kl
        self.train_pi_iterations = train_pi_iterations
        self.train_v_iterations = train_v_iterations
        self.use_cost_value_function = use_cost_value_function
        self.use_exploration_noise_anneal = use_exploration_noise_anneal
        self.use_kl_early_stopping = use_kl_early_stopping
        self.use_linear_lr_decay = use_linear_lr_decay
        self.use_max_grad_norm = use_max_grad_norm
        self.use_reward_penalty = use_reward_penalty
        self.use_reward_scaling = use_reward_scaling
        self.use_standardized_obs = use_standardized_obs
        self.use_standardized_advantages = use_standardized_advantages
        self.video_freq = video_freq
        self.vf_lr = vf_lr

        # ==== Call assertions....
        self._sanity_checks()

        # === Set up logger and save configuration to disk
        # collect local parameters before creating the logger to avoid unnecessary prints
        self.params = locals()
        self.logger = self._init_logger()
        self.logger.save_config(self.params)

        # === Seeding
        seed += 10000 * mpi_tools.proc_id()
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed=seed)

        # === Setup actor-critic module
        self.ac = core.ActorCriticWithCosts(
            actor_type=actor,
            observation_space=env.observation_space,
            action_space=env.action_space,
            use_standardized_obs=use_standardized_obs,
            use_scaled_rewards=use_reward_scaling,
            use_shared_weights=use_shared_weights,
            weight_initialization=weight_initialization,
            ac_kwargs=ac_kwargs)

        # === set up MPI specifics
        self._init_mpi()

        # === Set up experience buffer
        self.buf = core.Buffer(
            actor_critic=self.ac,
            obs_dim=env.observation_space.shape,
            act_dim=env.action_space.shape,
            size=self.local_steps_per_epoch,
            gamma=gamma,
            lam=lam,
            adv_estimation_method=adv_estimation_method,
            use_scaled_rewards=use_reward_scaling,
            standardize_env_obs=use_standardized_obs,
            standardize_advantages=use_standardized_advantages,
            lam_c=lam_c,
            use_reward_penalty=use_reward_penalty,
        )

        # Set up optimizers for policy and value function
        self.pi_optimizer = core.get_optimizer(optimizer,
                                               module=self.ac.pi,
                                               lr=pi_lr)
        self.vf_optimizer = core.get_optimizer('Adam',
                                               module=self.ac.v,
                                               lr=vf_lr)
        if use_cost_value_function:
            self.cf_optimizer = core.get_optimizer('Adam',
                                                   module=self.ac.c,
                                                   lr=self.vf_lr)
        # Set up video recorder
        self.recorder = self._init_video_recorder()
        # setup scheduler for policy learning rate decay
        self.scheduler = self._init_learning_rate_scheduler()

        # Set up model saving
        self.logger.setup_torch_saver(self.ac)
        self.logger.torch_save()

        # setup statistics
        self.start_time = time.time()
        self.epoch_time = time.time()
        self.loss_pi_before = 0.0
        self.loss_v_before = 0.0
        self.loss_c_before = 0.0
        self.logger.log('Start training.')
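
Using the defaults from the signature, the per-process seeding and step split above work out as in this small sketch:

    seed, steps_per_epoch, num_procs = 0, 32 * 1000, 4
    effective_seeds = [seed + 10000 * pid for pid in range(num_procs)]
    # -> [0, 10000, 20000, 30000]: every process draws from a disjoint seed
    local_steps = steps_per_epoch // num_procs  # 8000 steps collected per process
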
    @staticmethod
    def write_to_file(file, contents: list):
        """Write one value per line to the given (already opened) file."""
        if mpi_tools.proc_id() == 0:
            column = [str(x) for x in contents]
            file.write("\n".join(column) + "\n")
            file.flush()
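
A short usage note: each call appends one newline-terminated column, e.g.

    self.write_to_file(self.ret_file, contents=[100.2, 98.7, 101.5])
    # returns.csv now additionally contains:
    # 100.2
    # 98.7
    # 101.5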