def main(args, base_dir):
    """Execute multiple training operations."""
    for i in range(args.n_training):
        # value of the next seed
        seed = args.seed + i

        # create a save directory folder (if it doesn't exist)
        dir_name = os.path.join(
            base_dir,
            '{}/{}'.format(args.env_name, strftime("%Y-%m-%d-%H:%M:%S")))
        ensure_dir(dir_name)

        # get the hyperparameters
        hp = get_hyperparameters(args, FeedForwardPolicy)

        # add the seed for logging purposes
        params_with_extra = hp.copy()
        params_with_extra['seed'] = seed
        params_with_extra['env_name'] = args.env_name
        params_with_extra['policy_name'] = "FeedForwardPolicy"

        # add the hyperparameters to the folder
        with open(os.path.join(dir_name, 'hyperparameters.json'), 'w') as f:
            json.dump(params_with_extra, f, sort_keys=True, indent=4)

        run_exp(env=args.env_name,
                hp=hp,
                steps=args.total_steps,
                dir_name=dir_name,
                evaluate=args.evaluate,
                seed=seed,
                eval_interval=args.eval_interval,
                log_interval=args.log_interval,
                save_interval=args.save_interval)
def main(args, base_dir):
    """Execute multiple training operations."""
    for i in range(args.n_training):
        # value of the next seed
        seed = args.seed + i

        # The time when the current experiment started.
        now = strftime("%Y-%m-%d-%H:%M:%S")

        # Create a save directory folder (if it doesn't exist).
        if args.log_dir is not None:
            dir_name = args.log_dir
        else:
            dir_name = os.path.join(base_dir, '{}/{}'.format(
                args.env_name, now))
        ensure_dir(dir_name)

        # Get the policy class.
        if args.alg == "TD3":
            from hbaselines.fcnet.td3 import FeedForwardPolicy
        elif args.alg == "SAC":
            from hbaselines.fcnet.sac import FeedForwardPolicy
        elif args.alg == "PPO":
            from hbaselines.fcnet.ppo import FeedForwardPolicy
        elif args.alg == "TRPO":
            from hbaselines.fcnet.trpo import FeedForwardPolicy
        else:
            raise ValueError("Unknown algorithm: {}".format(args.alg))

        # Get the hyperparameters.
        hp = get_hyperparameters(args, FeedForwardPolicy)

        # Add the seed for logging purposes.
        params_with_extra = hp.copy()
        params_with_extra['seed'] = seed
        params_with_extra['env_name'] = args.env_name
        params_with_extra['policy_name'] = "FeedForwardPolicy"
        params_with_extra['algorithm'] = args.alg
        params_with_extra['date/time'] = now

        # Add the hyperparameters to the folder.
        with open(os.path.join(dir_name, 'hyperparameters.json'), 'w') as f:
            json.dump(params_with_extra, f, sort_keys=True, indent=4)

        run_exp(
            env=args.env_name,
            policy=FeedForwardPolicy,
            hp=hp,
            dir_name=dir_name,
            evaluate=args.evaluate,
            seed=seed,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            save_interval=args.save_interval,
            initial_exploration_steps=args.initial_exploration_steps,
            ckpt_path=args.ckpt_path,
        )
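
# Illustrative sketch (not part of the original module): a minimal entry
# point for main() above. `parse_options` is a hypothetical helper assumed
# to return an argparse-style namespace with the attributes main() reads
# (n_training, seed, env_name, alg, log_dir, evaluate, total_steps,
# eval_interval, log_interval, save_interval, initial_exploration_steps,
# ckpt_path).
if __name__ == '__main__':
    import sys
    main(parse_options(sys.argv[1:]), base_dir="data")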
def learn(self,
          total_timesteps,
          log_dir=None,
          seed=None,
          log_interval=2000,
          eval_interval=50000,
          save_interval=10000,
          initial_exploration_steps=10000):
    """Perform the complete training operation.

    Parameters
    ----------
    total_timesteps : int
        the total number of samples to train on
    log_dir : str
        the directory where the training and evaluation statistics, as
        well as the tensorboard log, should be stored
    seed : int or None
        the initial seed for training, if None: keep current seed
    log_interval : int
        the number of training steps before logging training results
    eval_interval : int
        number of simulation steps in the training environment before an
        evaluation is performed
    save_interval : int
        number of simulation steps in the training environment before the
        model is saved
    initial_exploration_steps : int
        number of timesteps that the policy is run before training to
        initialize the replay buffer with samples
    """
    # Create a saver object.
    self.saver = tf.compat.v1.train.Saver(
        self.trainable_vars,
        max_to_keep=total_timesteps // save_interval)

    # Make sure that the log directory exists, and if not, make it.
    ensure_dir(log_dir)
    ensure_dir(os.path.join(log_dir, "checkpoints"))

    # Create a tensorboard object for logging.
    save_path = os.path.join(log_dir, "tb_log")
    writer = tf.compat.v1.summary.FileWriter(save_path)

    # file path for training and evaluation results
    train_filepath = os.path.join(log_dir, "train.csv")
    eval_filepath = os.path.join(log_dir, "eval.csv")

    # Setup the seed value.
    random.seed(seed)
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)

    if self.verbose >= 2:
        print('Using agent with the following configuration:')
        print(str(self.__dict__.items()))

    eval_steps_incr = 0
    save_steps_incr = 0
    start_time = time.time()

    with self.sess.as_default(), self.graph.as_default():
        # Prepare everything.
        obs = self.env.reset()
        self.obs, self.all_obs = self._get_obs(obs)

        # Add the fingerprint term, if needed.
        self.obs = self._add_fingerprint(
            self.obs, self.total_steps, total_timesteps)

        # Collect preliminary random samples.
        print("Collecting initial exploration samples...")
        self._collect_samples(total_timesteps,
                              run_steps=initial_exploration_steps,
                              random_actions=True)
        print("Done!")

        # Reset total statistics variables.
        self.episodes = 0
        self.total_steps = 0
        self.episode_rew_history = deque(maxlen=100)

        while True:
            # Reset epoch-specific variables.
            self.epoch_episodes = 0
            self.epoch_episode_steps = []
            self.epoch_episode_rewards = []

            for _ in range(round(log_interval / self.nb_rollout_steps)):
                # If the required number of time steps has been met,
                # terminate training.
                if self.total_steps >= total_timesteps:
                    return

                # Perform rollouts.
                self._collect_samples(total_timesteps)

                # Train.
                self._train()

            # Log statistics.
            self._log_training(train_filepath, start_time)

            # Evaluate.
            if self.eval_env is not None and \
                    (self.total_steps - eval_steps_incr) >= eval_interval:
                eval_steps_incr += eval_interval

                # Run the evaluation operations over the evaluation
                # env(s). Note that multiple evaluation envs can be
                # provided.
                if isinstance(self.eval_env, list):
                    eval_rewards = []
                    eval_successes = []
                    eval_info = []
                    for env in self.eval_env:
                        rew, suc, inf = \
                            self._evaluate(total_timesteps, env)
                        eval_rewards.append(rew)
                        eval_successes.append(suc)
                        eval_info.append(inf)
                else:
                    eval_rewards, eval_successes, eval_info = \
                        self._evaluate(total_timesteps, self.eval_env)

                # Log the evaluation statistics.
                self._log_eval(eval_filepath, start_time, eval_rewards,
                               eval_successes, eval_info)

            # Run and store summary.
            if writer is not None:
                td_map = self.policy_tf.get_td_map()

                # Check if td_map is empty.
                if not td_map:
                    break

                td_map.update({
                    self.rew_ph: np.mean(self.epoch_episode_rewards),
                    self.rew_history_ph: np.mean(self.episode_rew_history),
                })
                summary = self.sess.run(self.summary, td_map)
                writer.add_summary(summary, self.total_steps)

            # Save a checkpoint of the model.
            if (self.total_steps - save_steps_incr) >= save_interval:
                save_steps_incr += save_interval
                self.save(os.path.join(log_dir, "checkpoints/itr"))

            # Update the epoch count.
            self.epoch += 1
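
# Illustrative sketch (not part of the original file): driving the learn()
# method above directly. The class and module paths below are assumptions
# based on the surrounding code (an off-policy algorithm class wrapping
# FeedForwardPolicy), not confirmed imports; the keyword arguments mirror
# the signature documented above.
from hbaselines.algorithms import OffPolicyRLAlgorithm
from hbaselines.fcnet.td3 import FeedForwardPolicy

alg = OffPolicyRLAlgorithm(policy=FeedForwardPolicy, env="AntMaze")
alg.learn(
    total_timesteps=1000000,
    log_dir="data/AntMaze/example-run",
    seed=1,
    log_interval=2000,
    eval_interval=50000,
    save_interval=10000,
    initial_exploration_steps=10000,
)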
def learn(self,
          total_timesteps,
          log_dir=None,
          seed=None,
          log_interval=2000,
          eval_interval=50000,
          save_interval=10000,
          start_timesteps=50000):
    """Return a trained model.

    Parameters
    ----------
    total_timesteps : int
        the total number of samples to train on
    log_dir : str
        the directory where the training and evaluation statistics, as
        well as the tensorboard log, should be stored
    seed : int or None
        the initial seed for training, if None: keep current seed
    log_interval : int
        the number of training steps before logging training results
    eval_interval : int
        number of simulation steps in the training environment before an
        evaluation is performed
    save_interval : int
        number of simulation steps in the training environment before the
        model is saved
    start_timesteps : int, optional
        number of timesteps that the policy is run before training to
        initialize the replay buffer with samples
    """
    # Create a saver object.
    self.saver = tf.compat.v1.train.Saver(
        self.trainable_vars,
        max_to_keep=total_timesteps // save_interval)

    # Make sure that the log directory exists, and if not, make it.
    ensure_dir(log_dir)
    ensure_dir(os.path.join(log_dir, "checkpoints"))

    # Create a tensorboard object for logging.
    save_path = os.path.join(log_dir, "tb_log")
    writer = tf.compat.v1.summary.FileWriter(save_path)

    # file path for training and evaluation results
    train_filepath = os.path.join(log_dir, "train.csv")
    eval_filepath = os.path.join(log_dir, "eval.csv")

    # Setup the seed value.
    random.seed(seed)
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)

    if self.verbose >= 2:
        print('Using agent with the following configuration:')
        print(str(self.__dict__.items()))

    eval_steps_incr = 0
    save_steps_incr = 0
    start_time = time.time()

    with self.sess.as_default(), self.graph.as_default():
        # Restore the non-communication variables and the communication
        # variables from two separate (hard-coded) checkpoints.
        variables = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)
        variables_to_restore = [
            v for v in variables
            if v.name.split('/')[1] != 'communication'
        ]
        model_path_0 = os.path.join(
            '/home/cil-c42/Projects/h-baselines/data/goal-conditioned-com-mlppr5-0.05intrinsic-mesdim8/AntMaze/2020-01-16-23:40:52',
            "checkpoints/itr-250000")

        message_variables = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)
        message_variables_to_restore = [
            v for v in message_variables
            if v.name.split('/')[1] == 'communication'
        ]
        model_path = os.path.join(
            '/home/cil-c42/Projects/h-baselines/data/goal-conditioned-com-mlppr5-0.05intrinsic-mesdim8/AntMaze/2020-01-16-23:40:52',
            "checkpoints/itr-1000000")

        saver_0 = tf.compat.v1.train.Saver(variables_to_restore)
        saver_0.restore(self.sess, model_path_0)
        saver = tf.compat.v1.train.Saver(message_variables_to_restore)
        saver.restore(self.sess, model_path)
        # self.policy_tf.initialize()
        print('load--------------------------------')

        # Prepare everything.
        self.obs = self.env.reset()

        # Add the fingerprint term, if needed.
        self.obs = self._add_fingerprint(
            self.obs, self.total_steps, total_timesteps)

        self.message = [np.zeros(shape=(8,))]

        # Collect preliminary random samples.
        print("Collecting pre-samples...")
        self._collect_samples(
            total_timesteps,
            run_steps=start_timesteps,
            # random_actions=True)
            random_actions=False)
        print("Done!")

        # Reset total statistics variables.
        self.episodes = 0
        self.total_steps = 0
        self.episode_rewards_history = deque(maxlen=100)

        while True:
            # Reset epoch-specific variables.
            self.epoch_episodes = 0
            self.epoch_actions = []
            self.epoch_q1s = []
            self.epoch_q2s = []
            self.epoch_actor_losses = []
            self.epoch_critic_losses = []
            self.epoch_episode_rewards = []
            self.epoch_episode_steps = []
            self.epoch_cg_losses = []
            self.epoch_dynamic_losses = []

            for _ in range(log_interval):
                # If the required number of time steps has been met,
                # terminate training.
                if self.total_steps >= total_timesteps:
                    return

                # Perform rollouts.
                self._collect_samples(total_timesteps)

                # Train.
                self._train()

            # Log statistics.
            self._log_training(train_filepath, start_time)

            # Evaluate.
            if self.eval_env is not None and \
                    (self.total_steps - eval_steps_incr) >= eval_interval:
                eval_steps_incr += eval_interval

                # Run the evaluation operations over the evaluation
                # env(s). Note that multiple evaluation envs can be
                # provided.
                if isinstance(self.eval_env, list):
                    eval_rewards = []
                    eval_successes = []
                    eval_info = []
                    for env in self.eval_env:
                        rew, suc, inf = \
                            self._evaluate(total_timesteps, env)
                        eval_rewards.append(rew)
                        eval_successes.append(suc)
                        eval_info.append(inf)
                else:
                    eval_rewards, eval_successes, eval_info = \
                        self._evaluate(total_timesteps, self.eval_env)

                # Log the evaluation statistics.
                self._log_eval(eval_filepath, start_time, eval_rewards,
                               eval_successes, eval_info)

            # Run and store summary.
            if writer is not None:
                td_map = self.policy_tf.get_td_map()

                # Check if td_map is empty.
                if td_map:
                    td_map.update({
                        self.rew_ph:
                            np.mean(self.epoch_episode_rewards),
                        self.rew_history_ph:
                            np.mean(self.episode_rewards_history),
                    })
                    summary = self.sess.run(self.summary, td_map)
                    writer.add_summary(summary, self.total_steps)

            # Save a checkpoint of the model.
            if (self.total_steps - save_steps_incr) >= save_interval:
                save_steps_incr += save_interval
                self.save(os.path.join(log_dir, "checkpoints/itr"))

            # Update the epoch count.
            self.epoch += 1
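
# Illustrative sketch (not part of the original file): the hard-coded
# two-checkpoint restore in the modified learn() above could be factored
# into a helper. `actor_ckpt` and `message_ckpt` are hypothetical argument
# names; the variable filter mirrors the 'communication' scope check used
# above.
def restore_split_checkpoints(sess, actor_ckpt, message_ckpt):
    """Restore communication and non-communication variables separately."""
    variables = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)
    comm_vars = [v for v in variables
                 if v.name.split('/')[1] == 'communication']
    other_vars = [v for v in variables
                  if v.name.split('/')[1] != 'communication']
    tf.compat.v1.train.Saver(other_vars).restore(sess, actor_ckpt)
    tf.compat.v1.train.Saver(comm_vars).restore(sess, message_ckpt)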