def __init__(
        self,
        log_dir,
        output_fname='progress.csv',
        debug: bool = False,
        exp_name=None,
        level: int = 1,  # verbosity level
        use_tensor_board=True,
        verbose=True):
    """
    Initialize a Logger.

    Args:
        log_dir (string): A directory for saving results to. If
            ``None``, defaults to a temp directory of the form
            ``/tmp/experiments/somerandomnumber``.

        output_fname (string): Name for the comma-separated-value file
            containing metrics logged throughout a training run.
            Defaults to ``progress.csv``.

        exp_name (string): Experiment name. If you run multiple training
            runs and give them all the same ``exp_name``, the plotter
            will know to group them. (Use case: if you run the same
            hyperparameter configuration with multiple random seeds, you
            should give them all the same ``exp_name``.)
    """
    self.log_dir = log_dir
    self.debug = debug if proc_id() == 0 else False
    self.level = level
    # only the MPI root process is allowed to print information to the console
    self.verbose = verbose if proc_id() == 0 else False

    if proc_id() == 0:
        os.makedirs(self.log_dir, exist_ok=True)
        self.output_file = open(osp.join(self.log_dir, output_fname), 'w')
        atexit.register(self.output_file.close)
        print(colorize(f"Logging data to {self.output_file.name}",
                       'cyan', bold=True))
    else:
        self.output_file = None

    self.epoch = 0
    self.first_row = True
    self.log_headers = []
    self.log_current_row = {}
    self.exp_name = exp_name
    self.torch_saver_elements = None

    # Set up TensorBoard logging if enabled and this is the MPI root process
    self.summary_writer = SummaryWriter(os.path.join(self.log_dir, 'tb')) \
        if use_tensor_board and proc_id() == 0 else None
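# A minimal usage sketch (illustrative only; the log directory and config
# values are made up for this example):
#
#     logger = Logger(log_dir='/tmp/experiments/demo', exp_name='demo')
#     logger.save_config({'gamma': 0.99})
#     ...  # fill logger.log_current_row during the epoch
#     logger.dump_tabular()
#     logger.close()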
def torch_save(self, itr=None):
    """
    Saves the PyTorch model (or models).
    """
    if proc_id() == 0:
        self.log('Save model to disk...')
        assert self.torch_saver_elements is not None, \
            "First have to setup saving with self.setup_torch_saver"
        fpath = osp.join(self.log_dir, 'torch_save')
        fname = 'model' + ('%d' % itr if itr is not None else '') + '.pt'
        fname = osp.join(fpath, fname)
        os.makedirs(fpath, exist_ok=True)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # We are using a non-recommended way of saving PyTorch models,
            # by pickling whole objects (which are dependent on the exact
            # directory structure at the time of saving) as opposed to
            # just saving network weights. This works sufficiently well
            # for the purposes of Spinning Up, but you may want to do
            # something different for your personal PyTorch project.
            # We use a catch_warnings() context to avoid the warnings about
            # not being able to save the source code.
            torch.save(self.torch_saver_elements, fname)
        self.log('Done.')
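# Sketch of the intended save workflow (assuming `ac` is the actor-critic
# module to persist; this mirrors the calls made in the algorithm
# constructor below):
#
#     logger.setup_torch_saver(ac)   # register what to save
#     logger.torch_save()            # -> <log_dir>/torch_save/model.pt
#     logger.torch_save(itr=5)       # -> <log_dir>/torch_save/model5.pt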
def save_state(self, state_dict, itr=None):
    """
    Saves the state of an experiment.

    To be clear: this is about saving *state*, not logging diagnostics.
    All diagnostic logging is separate from this function. This function
    will save whatever is in ``state_dict``---usually just a copy of the
    environment---and the most recent parameters for the model you
    previously set up saving for with ``setup_torch_saver``.

    Call with any frequency you prefer. If you only want to maintain a
    single state and overwrite it at each call with the most recent
    version, leave ``itr=None``. If you want to keep all of the states
    you save, provide unique (increasing) values for ``itr``.

    Args:
        state_dict (dict): Dictionary containing essential elements to
            describe the current state of training.

        itr: An int, or None. Current iteration of training.
    """
    if proc_id() == 0:
        fname = 'state.pkl' if itr is None else 'state%d.pkl' % itr
        try:
            joblib.dump(state_dict, osp.join(self.log_dir, fname))
        except Exception:
            self.log('Warning: could not pickle state_dict.', color='red')
        if self.torch_saver_elements is not None:
            self.torch_save(itr)
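# Illustrative call pattern: overwrite a single snapshot on each call, or
# pass an increasing `itr` to keep every snapshot:
#
#     logger.save_state({'env': env})          # -> state.pkl (overwritten)
#     logger.save_state({'env': env}, itr=3)   # -> state3.pkl (kept)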
def save_config(self, config):
    """
    Log an experiment configuration.

    Call this once at the top of your experiment, passing in all important
    config vars as a dict. This will serialize the config to JSON, while
    handling anything which can't be serialized in a graceful way (writing
    as informative a string as possible).

    Example use:

    .. code-block:: python

        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
    """
    if proc_id() == 0:  # only the root process logs configurations
        config_json = convert_json(config)
        if self.exp_name is not None:
            config_json['exp_name'] = self.exp_name
        output = json.dumps(config_json, separators=(',', ':\t'),
                            indent=4, sort_keys=True)
        if self.verbose and self.level > 0:
            print(colorize('Run with config:', color='yellow', bold=True))
            print(output)
        with open(osp.join(self.log_dir, "config.json"), 'w') as out:
            out.write(output)
def close(self):
    """Close the opened output file immediately after training to avoid
    exceeding the limit on open file descriptors.

    Avoids the following error:
        OSError: [Errno 24] Too many open files
    """
    if proc_id() == 0:
        self.output_file.close()
def __init__(self, log_dir, log_costs=True):
    self.log_dir = log_dir
    self.env = None
    self.ac = None
    self.log_costs = log_costs

    # Open returns.csv at start-up to avoid disk access errors
    # on our HPC servers...
    if mpi_tools.proc_id() == 0:
        os.makedirs(log_dir, exist_ok=True)
        self.ret_file_name = 'returns.csv'
        self.ret_file = open(os.path.join(log_dir, self.ret_file_name), 'w')
        # Register the close function to run at normal program termination
        atexit.register(self.ret_file.close)
        if log_costs:
            self.c_file_name = 'costs.csv'
            self.costs_file = open(os.path.join(log_dir, self.c_file_name), 'w')
            atexit.register(self.costs_file.close)
    else:
        self.ret_file_name = None
        self.ret_file = None
        if log_costs:
            self.c_file_name = None
            self.costs_file = None
def test_mpi_matrix_updates(self):
    """OnlineMeanStd module is updated with a batch of matrix inputs,
    i.e. inputs of shape M x N.

    Note that std dev might differ by more than 1e-5 when epochs > 10.
    """
    epochs = 20
    cores = 4
    T = 500
    obs_shape = (6,)

    try:
        if mpi_tools.mpi_fork(n=cores):
            sys.exit()
        pid = mpi_tools.proc_id()

        # === calculation through online updates
        rms = OnlineMeanStd(shape=obs_shape)
        for ep in range(epochs):
            # shape of batch: T x obs_shape
            obs_batch = self.get_data(T, obs_shape[0], ep, pid)
            rms.update(obs_batch)
        mpi_mean = rms.mean.numpy()
        mpi_std = rms.std.numpy()

        # ===== calculate ground truths
        obs_list = [self.get_data(T, obs_shape[0], ep, pid=i)
                    for i in range(cores)
                    for ep in range(epochs)]
        obs = np.vstack(obs_list)
        gt_mean = np.mean(obs, axis=0)
        gt_std = np.std(obs, axis=0)

        self.assertTrue(np.allclose(mpi_mean, gt_mean))
        self.assertTrue(np.allclose(mpi_std, gt_std))
        self.assertTrue(self.perform_single_pass(rms, obs_shape))
    # necessary to prevent system exit with error...
    except SystemExit:
        print('Join....')
def dump_tabular(self) -> None:
    """
    Write all of the diagnostics from the current iteration.

    Writes both to stdout and to the output file.
    """
    if proc_id() == 0:
        vals = list()
        self.epoch += 1
        # Print formatted information to the console
        key_lens = [len(key) for key in self.log_headers]
        max_key_len = max(15, max(key_lens))
        keystr = '%' + '%d' % max_key_len
        fmt = "| " + keystr + "s | %15s |"
        n_slashes = 22 + max_key_len
        if self.verbose and self.level > 0:
            print("-" * n_slashes)
        for key in self.log_headers:
            val = self.log_current_row.get(key, "")
            valstr = "%8.3g" % val if hasattr(val, "__float__") else val
            if self.verbose and self.level > 0:
                print(fmt % (key, valstr))
            vals.append(val)
        if self.verbose and self.level > 0:
            print("-" * n_slashes, flush=True)

        # Write to the output file (can be any text file format, e.g. CSV)
        if self.output_file is not None:
            if self.first_row:
                self.output_file.write(",".join(self.log_headers) + "\n")
            self.output_file.write(",".join(map(str, vals)) + "\n")
            self.output_file.flush()

        if self.summary_writer is not None:
            for k, v in zip(self.log_headers, vals):
                self.summary_writer.add_scalar(k, v, global_step=self.epoch)
            # Flush the event file to disk to make sure that all pending
            # events have been written.
            self.summary_writer.flush()

    # free logged information in all processes...
    self.log_current_row.clear()
    self.first_row = False
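# Low-level sketch of one logging iteration (normally the headers and row
# values are filled by the logger's higher-level helpers; setting the shown
# attributes directly is for illustration only):
#
#     logger.log_headers = ['Epoch', 'EpRet']
#     logger.log_current_row = {'Epoch': 1, 'EpRet': 123.4}
#     logger.dump_tabular()   # prints the table, appends one CSV row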
def test_mpi_vector_updates(self):
    """Test with vector inputs.

    Note that std dev might differ by more than 1e-5 when epochs > 10.
    """
    epochs = 10
    cores = 4
    T = 500
    input_shape = (1,)

    try:
        if mpi_tools.mpi_fork(n=cores):
            sys.exit()
        p = mpi_tools.proc_id()

        # === calculation through online updates
        rms = OnlineMeanStd(shape=input_shape)
        for ep in range(epochs):
            # shape of batch: T x input_shape
            vector_input = self.get_data(T, input_shape[0], ep, p).flatten()
            rms.update(vector_input)
        mpi_mean = rms.mean.numpy()
        mpi_std = rms.std.numpy()

        # ===== calculate ground truths
        obs_list = [self.get_data(T, input_shape[0], ep, pid=i)
                    for i in range(cores)
                    for ep in range(epochs)]
        obs = np.vstack(obs_list)
        gt_mean = np.mean(obs, axis=0)
        gt_std = np.std(obs, axis=0)

        self.assertTrue(np.allclose(mpi_mean, gt_mean))
        self.assertTrue(np.allclose(mpi_std, gt_std, rtol=1e-2))
        self.assertTrue(self.perform_single_pass(rms, input_shape))
    # necessary to prevent system exit with error...
    except SystemExit:
        print('Join....')
def test_mpi_version_of_algorithms(self):
    """Run all the specified algorithms with MPI."""
    cores = 4
    if mpi_tools.mpi_fork(n=cores):  # forks the current script with MPI
        # use return instead of sys.exit() to exit the test with 'OK'...
        return
    is_root = mpi_tools.proc_id() == 0
    algs = ['iwpg', 'npg', 'trpo', 'lag-trpo', 'pdo', 'cpo']
    for alg in algs:
        try:
            if is_root:
                print(f'Run {alg.upper()}')
            ac, env = self.check_alg(alg, 'HopperBulletEnv-v0', cores=cores)
            self.assertTrue(isinstance(env, gym.Env))
        except NotImplementedError:
            if is_root:
                print('No MPI yet supported...')
        else:
            # sleep one second to finish all console prints...
            time.sleep(1)
def eval(self, env, ac, num_evaluations):
    """Evaluate the actor-critic module for a given number of evaluations."""
    self.ac = ac
    self.ac.eval()  # disable exploration noise

    if isinstance(env, gym.Env):
        self.env = env
    elif isinstance(env, str):
        self.env = gym.make(env)
    else:
        raise TypeError('Env is not of type: str, gym.Env')

    size = mpi_tools.num_procs()
    num_local_evaluations = num_evaluations // size
    returns = np.zeros(num_local_evaluations, dtype=np.float32)
    costs = np.zeros(num_local_evaluations, dtype=np.float32)
    ep_lengths = np.zeros(num_local_evaluations, dtype=np.float32)

    for i in range(num_local_evaluations):
        returns[i], ep_lengths[i], costs[i] = self.eval_once()
    # Gather returns from all processes
    # Note: only the root process owns valid data...
    returns = list(mpi_tools.gather_and_stack(returns))
    costs = list(mpi_tools.gather_and_stack(costs))

    # now write returns as a column into the output file...
    if mpi_tools.proc_id() == 0:
        self.write_to_file(self.ret_file, contents=returns)
        print('Saved to:', os.path.join(self.log_dir, self.ret_file_name))
        if self.log_costs:
            self.write_to_file(self.costs_file, contents=costs)
        print(f'Mean Ret: {np.mean(returns)} \t'
              f'Mean EpLen: {np.mean(ep_lengths)} \t'
              f'Mean Costs: {np.mean(costs)}')

    self.ac.train()  # back to train mode
    return np.array(returns), np.array(ep_lengths), np.array(costs)
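# Hypothetical usage sketch (the evaluator class name is assumed here; only
# the constructor and eval() signatures in this module are taken as given,
# and `ac` stands for a trained actor-critic module):
#
#     evaluator = EnvironmentEvaluator(log_dir='/tmp/eval', log_costs=True)
#     rets, lens, costs = evaluator.eval(env='HopperBulletEnv-v0',
#                                        ac=ac, num_evaluations=32)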
                    help=f'Define the init seed, e.g. {random_seed}')
parser.add_argument('--search', action='store_true',
                    help='If given, search over learning rates.')
parser.add_argument('--log-dir', type=str, default=default_log_dir,
                    help='Define a custom directory for logging.')
args, unparsed_args = parser.parse_known_args()

# Use the number of physical cores as default. To also use hardware-threaded
# CPUs, enable this with use_number_of_threads=True.
use_number_of_threads = args.cores > physical_cores
if mpi_tools.mpi_fork(args.cores,
                      use_number_of_threads=use_number_of_threads):
    # Re-launches the current script with workers linked by MPI
    sys.exit()
if mpi_tools.proc_id() == 0:
    print('Unknowns:', unparsed_args)

model = Model(alg=args.alg,
              env_id=args.env,
              log_dir=args.log_dir,
              init_seed=args.seed,
              unparsed_args=unparsed_args,
              use_mpi=not args.no_mpi)
model.compile(num_runs=args.runs, num_cores=args.cores)
model.fit()
model.eval()
if args.play:
    model.play()
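# Example invocation (the script name is a placeholder; --alg, --env and
# --cores are assumed flags inferred from the args.* attributes used above;
# mpirun is not needed manually, since mpi_fork() re-launches the script):
#
#     $ python train.py --alg trpo --env HopperBulletEnv-v0 --cores 4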
def get_data(M, N, epoch, pid=None):
    """Returns a deterministic data matrix of shape M x N."""
    # Note: default arguments are evaluated once at definition time, so the
    # process id is resolved inside the function rather than in the signature.
    pid = mpi_tools.proc_id() if pid is None else pid
    start = pid * 10000 + 4 * epoch
    stop = pid * 10000 + M * N + 4 * epoch
    step = 1
    return 0.001 * np.arange(start, stop, step).reshape((M, N))
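# Worked example of the deterministic pattern: with M=2, N=3, epoch=1,
# pid=0 the range is arange(4, 10), so
#
#     get_data(2, 3, epoch=1, pid=0)
#     # -> array([[0.004, 0.005, 0.006],
#     #           [0.007, 0.008, 0.009]])
#
# Distinct pids shift the values by pid * 10 (i.e. pid * 10000 * 0.001),
# which lets each MPI worker produce a disjoint, reproducible data slice.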
def __init__(
        self,
        actor: str,
        ac_kwargs: dict,
        env_id: str,
        epochs: int,
        logger_kwargs: dict,
        adv_estimation_method: str = 'gae',
        alg='iwpg',
        check_freq: int = 25,
        entropy_coef: float = 0.01,
        gamma: float = 0.99,
        lam: float = 0.95,  # GAE scalar
        lam_c: float = 0.95,  # GAE scalar for cost estimation
        max_ep_len: int = 1000,
        max_grad_norm: float = 0.5,
        num_mini_batches: int = 16,  # used for value network training
        optimizer: str = 'Adam',  # policy optimizer
        pi_lr: float = 3e-4,
        steps_per_epoch: int = 32 * 1000,  # number of global steps per epoch
        target_kl: float = 0.01,
        train_pi_iterations: int = 80,
        train_v_iterations: int = 5,
        trust_region='plain',  # used for easy filtering in plot utils
        use_cost_value_function: bool = False,
        use_entropy: bool = False,
        use_exploration_noise_anneal: bool = False,
        use_kl_early_stopping: bool = False,
        use_linear_lr_decay: bool = True,
        use_max_grad_norm: bool = False,
        use_reward_scaling: bool = True,
        use_reward_penalty: bool = False,
        use_shared_weights: bool = False,
        use_standardized_advantages: bool = False,
        use_standardized_obs: bool = True,
        verbose: bool = True,
        vf_lr: float = 1e-3,
        weight_initialization: str = 'kaiming_uniform',
        save_freq: int = 10,
        seed: int = 0,
        video_freq: int = -1,  # set to a positive integer for video recording
        **kwargs  # used to log parameters from child classes
):
    """
    Parameters
    ----------
    actor
    ac_kwargs
    env_id
    epochs
    logger_kwargs
    adv_estimation_method
    alg
    check_freq
    entropy_coef
    gamma
    lam
    lam_c
    max_ep_len
    max_grad_norm
    num_mini_batches
    optimizer
    pi_lr
    steps_per_epoch
    target_kl
    train_pi_iterations
    train_v_iterations
    trust_region
    use_cost_value_function
    use_entropy
    use_exploration_noise_anneal
    use_kl_early_stopping
    use_linear_lr_decay
    use_max_grad_norm
    use_reward_scaling
    use_reward_penalty
    use_shared_weights
    use_standardized_advantages
    use_standardized_obs
    verbose
    vf_lr
    weight_initialization
    save_freq
    seed
    video_freq
    kwargs
    """
    # Environment calls
    # TODO: call gym.make with **kwargs (to allow customization)
    self.env = env = gym.make(env_id) if isinstance(env_id, str) else env_id
    # Collect information from the environment if it has a time wrapper
    if hasattr(self.env, '_max_episode_steps'):
        max_ep_len = self.env._max_episode_steps

    self.adv_estimation_method = adv_estimation_method
    self.alg = alg
    self.check_freq = check_freq
    self.entropy_coef = entropy_coef if use_entropy else 0.0
    self.epoch = 0  # iterated in the learn method
    self.epochs = epochs
    self.lam = lam
    self.local_steps_per_epoch = steps_per_epoch // mpi_tools.num_procs()
    self.logger_kwargs = logger_kwargs
    self.max_ep_len = max_ep_len
    self.max_grad_norm = max_grad_norm
    self.num_mini_batches = num_mini_batches
    self.pi_lr = pi_lr
    self.save_freq = save_freq
    self.seed = seed
    self.steps_per_epoch = steps_per_epoch
    self.target_kl = target_kl
    self.train_pi_iterations = train_pi_iterations
    self.train_v_iterations = train_v_iterations
    self.use_cost_value_function = use_cost_value_function
    self.use_exploration_noise_anneal = use_exploration_noise_anneal
    self.use_kl_early_stopping = use_kl_early_stopping
    self.use_linear_lr_decay = use_linear_lr_decay
    self.use_max_grad_norm = use_max_grad_norm
    self.use_reward_penalty = use_reward_penalty
    self.use_reward_scaling = use_reward_scaling
    self.use_standardized_obs = use_standardized_obs
    self.use_standardized_advantages = use_standardized_advantages
    self.video_freq = video_freq
    self.vf_lr = vf_lr

    # ==== Call assertions...
    self._sanity_checks()

    # === Set up logger and save configuration to disk
    # get local parameters before the logger instance to avoid
    # unnecessary prints
    self.params = locals()
    self.logger = self._init_logger()
    self.logger.save_config(self.params)

    # === Seeding
    seed += 10000 * mpi_tools.proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)
    self.env.seed(seed=seed)

    # === Set up actor-critic module
    self.ac = core.ActorCriticWithCosts(
        actor_type=actor,
        observation_space=env.observation_space,
        action_space=env.action_space,
        use_standardized_obs=use_standardized_obs,
        use_scaled_rewards=use_reward_scaling,
        use_shared_weights=use_shared_weights,
        weight_initialization=weight_initialization,
        ac_kwargs=ac_kwargs)

    # === Set up MPI specifics
    self._init_mpi()

    # === Set up experience buffer
    self.buf = core.Buffer(
        actor_critic=self.ac,
        obs_dim=env.observation_space.shape,
        act_dim=env.action_space.shape,
        size=self.local_steps_per_epoch,
        gamma=gamma,
        lam=lam,
        adv_estimation_method=adv_estimation_method,
        use_scaled_rewards=use_reward_scaling,
        standardize_env_obs=use_standardized_obs,
        standardize_advantages=use_standardized_advantages,
        lam_c=lam_c,
        use_reward_penalty=use_reward_penalty,
    )

    # === Set up optimizers for policy and value function
    self.pi_optimizer = core.get_optimizer(optimizer, module=self.ac.pi,
                                           lr=pi_lr)
    self.vf_optimizer = core.get_optimizer('Adam', module=self.ac.v,
                                           lr=vf_lr)
    if use_cost_value_function:
        self.cf_optimizer = core.get_optimizer('Adam', module=self.ac.c,
                                               lr=self.vf_lr)

    # === Set up video recorder
    self.recorder = self._init_video_recorder()

    # === Set up scheduler for policy learning rate decay
    self.scheduler = self._init_learning_rate_scheduler()

    # === Set up model saving
    self.logger.setup_torch_saver(self.ac)
    self.logger.torch_save()

    # === Set up statistics
    self.start_time = time.time()
    self.epoch_time = time.time()
    self.loss_pi_before = 0.0
    self.loss_v_before = 0.0
    self.loss_c_before = 0.0
    self.logger.log('Start with training.')
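# Hedged construction sketch (the class name `PolicyGradientAlgorithm` is a
# placeholder for whatever class this constructor belongs to; the keyword
# values are illustrative, and all other parameters keep their defaults):
#
#     algo = PolicyGradientAlgorithm(
#         actor='mlp',   # assumed actor type
#         ac_kwargs=dict(hidden_sizes=(64, 64)),
#         env_id='HopperBulletEnv-v0',
#         epochs=100,
#         logger_kwargs=dict(log_dir='/tmp/experiments/demo'),
#     )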
@staticmethod
def write_to_file(file, contents: list):
    """Write the given contents as one column into the open file handle."""
    if mpi_tools.proc_id() == 0:
        column = [str(x) for x in contents]
        file.write("\n".join(column) + "\n")
        file.flush()