def train(self): """ Performs startup, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``, logging diagnostics at the specified interval. """ n_itr = self.startup() if self.pretrain != 'None': status_file_read = open(self.log_dir + '/last_itr.txt', 'r') starting_itr = int(status_file_read.read().split('\n')[-2]) else: starting_itr = 0 with open(self.log_dir + '/last_itr.txt', 'a') as status_file: # for restart purposes for itr in range(starting_itr, n_itr): logger.set_iteration(itr) with logger.prefix(f"itr #{itr} "): self.agent.sample_mode( itr) # Might not be this agent sampling. samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) opt_info, layer_info = self.algo.optimize_agent( itr, samples) self.store_diagnostics(itr, traj_infos, opt_info) if (itr + 1) % self.log_interval_itrs == 0: status_file.write(str(itr) + '\n') self.log_diagnostics(itr) self.log_weights(layer_info) self.shutdown()
def train(self): """ Performs startup, evaluates the initial agent, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``. Pauses to evaluate the agent at the specified log interval. """ n_itr = self.startup() with logger.prefix(f"itr #0 "): eval_traj_infos, eval_time = self.evaluate_agent(0) self.log_diagnostics(0, eval_traj_infos, eval_time) for itr in range(n_itr): logger.set_iteration(itr) with logger.prefix(f"itr #{itr} "): if self.transfer and self.transfer_iter == itr: self.sampler.transfer( self.transfer_arg) # Transfer if doing eval_traj_infos, eval_time = self.evaluate_agent( itr) # Eval self.log_diagnostics(itr, eval_traj_infos, eval_time) self.agent.sample_mode(itr) samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) opt_info = self.algo.optimize_agent(itr, samples) self.store_diagnostics(itr, traj_infos, opt_info) if (itr + 1) % self.log_interval_itrs == 0: eval_traj_infos, eval_time = self.evaluate_agent(itr) self.log_diagnostics(itr, eval_traj_infos, eval_time) self.shutdown()
def train(self): """ Performs startup, evaluates the initial agent, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``. Pauses to evaluate the agent at the specified log interval. """ n_itr = self.startup() with logger.prefix(f"itr #0 "): eval_traj_infos, eval_time = self.evaluate_agent(0) self.log_diagnostics(0, eval_traj_infos, eval_time) if self.pretrain != 'None': status_file_read = open(self.log_dir + '/last_itr.txt', 'r') starting_itr = int(status_file_read.read().split('\n')[-2]) else: starting_itr = 0 with open(self.log_dir + '/last_itr.txt', 'a') as status_file: # for restart purposes if self.pretrain != 'None': starting_itr = int(status_file.read().split('\n')[-2]) else: starting_itr = 0 for itr in range(starting_itr, n_itr): logger.set_iteration(itr) with logger.prefix(f"itr #{itr} "): self.agent.sample_mode(itr) samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) opt_info = self.algo.optimize_agent(itr, samples) self.store_diagnostics(itr, traj_infos, opt_info) if (itr + 1) % self.log_interval_itrs == 0: eval_traj_infos, eval_time = self.evaluate_agent(itr) self.log_diagnostics(itr, eval_traj_infos, eval_time) self.shutdown()
def train(self): """ Performs startup, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``, logging diagnostics at the specified interval. """ n_itr = self.startup() for itr in range(n_itr): logger.set_iteration(itr) with logger.prefix(f"itr #{itr} "): if self.transfer and self.transfer_iter == itr: self.sampler.transfer( self.transfer_arg) # Transfer if doing self._traj_infos.clear() # Clear trajectory information self._transfer_start(itr, opt_info) self.agent.sample_mode( itr) # Might not be this agent sampling. samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) opt_info = self.algo.optimize_agent(itr, samples) self.store_diagnostics(itr, traj_infos, opt_info) if (itr + 1) % self.log_interval_itrs == 0: self.log_diagnostics(itr) if self.n_episodes is not None and self._cum_completed_trajs >= self.n_episodes: break self.shutdown()
def train(self): """ Performs startup, evaluates the initial agent, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``. Pauses to evaluate the agent at the specified log interval. """ n_itr = self.startup() with logger.prefix(f"itr #0 "): eval_traj_infos, eval_time = self.evaluate_agent(0) self.log_diagnostics(0, eval_traj_infos, eval_time) for itr in range(n_itr): logger.set_iteration(itr) with logger.prefix(f"itr #{itr} "): self.agent.sample_mode(itr) samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) opt_info = self.algo.optimize_agent(itr, samples) self.store_diagnostics(itr, traj_infos, opt_info) if self.prioritized_level_replay: for traj_info, value_error in zip(traj_infos, opt_info.valueLoss): self.level_replay.update_seed_score( traj_info.seed, value_error) seeds = [ self.level_replay.sample() for _ in range(self.sampler.n_worker) ] self.sampler.set_env_seeds(seeds) if (itr + 1) % self.log_interval_itrs == 0: eval_traj_infos, eval_time = self.evaluate_agent(itr) self.log_diagnostics(itr, eval_traj_infos, eval_time) self.shutdown()
def train(self): """ Run the optimizer in a loop. Check whether enough new samples have been generated, and throttle down if necessary at each iteration. Log at an interval in the number of sampler iterations, not optimizer iterations. """ throttle_itr, delta_throttle_itr = self.startup() throttle_time = 0. sampler_itr = itr = 0 if self._eval: while self.ctrl.sampler_itr.value < 1: # Sampler does eval first. time.sleep(THROTTLE_WAIT) traj_infos = drain_queue(self.traj_infos_queue, n_sentinel=1) self.store_diagnostics(0, 0, traj_infos, ()) self.log_diagnostics(0, 0, 0) log_counter = 0 while True: # Run until sampler hits n_steps and sets ctrl.quit=True. logger.set_iteration(itr) with logger.prefix(f"opt_itr #{itr} "): while self.ctrl.sampler_itr.value < throttle_itr: if self.ctrl.quit.value: break time.sleep(THROTTLE_WAIT) throttle_time += THROTTLE_WAIT if self.ctrl.quit.value: break if self.ctrl.opt_throttle is not None: self.ctrl.opt_throttle.wait() throttle_itr += delta_throttle_itr opt_info = self.algo.optimize_agent( itr, sampler_itr=self.ctrl.sampler_itr.value) self.agent.send_shared_memory() # To sampler. sampler_itr = self.ctrl.sampler_itr.value traj_infos = (list() if self._eval else drain_queue( self.traj_infos_queue)) self.store_diagnostics(itr, sampler_itr, traj_infos, opt_info) if (sampler_itr // self.log_interval_itrs > log_counter): if self._eval: with self.ctrl.sampler_itr.get_lock(): traj_infos = drain_queue(self.traj_infos_queue, n_sentinel=1) self.store_diagnostics(itr, sampler_itr, traj_infos, ()) self.log_diagnostics(itr, sampler_itr, throttle_time) log_counter += 1 throttle_time = 0. itr += 1 # Final log: sampler_itr = self.ctrl.sampler_itr.value traj_infos = drain_queue(self.traj_infos_queue) if traj_infos or not self._eval: self.store_diagnostics(itr, sampler_itr, traj_infos, ()) self.log_diagnostics(itr, sampler_itr, throttle_time) self.shutdown()
def train(self): """ Performs startup, evaluates the initial agent, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``. Pauses to evaluate the agent at the specified log interval. """ n_itr = self.startup() with logger.prefix(f"itr #0 "): player_eval_traj_infos, observer_eval_traj_infos, eval_time = self.evaluate_agent( 0) self.log_diagnostics(0, player_eval_traj_infos, observer_eval_traj_infos, eval_time) for itr in range(n_itr): logger.set_iteration(itr) with logger.prefix(f"itr #{itr} "): self.agent.sample_mode(itr) player_samples, player_traj_infos, observer_samples, observer_traj_infos = self.sampler.obtain_samples( itr) self.agent.train_mode(itr) player_opt_info = () observer_opt_info = () if self.alt_train: if self.agent.train_mask[0] and (itr % 2 == 0): player_opt_info = self.player_algo.optimize_agent( itr // 2, player_samples) elif self.agent.train_mask[1]: observer_opt_info = self.observer_algo.optimize_agent( itr // 2, observer_samples) else: if self.agent.train_mask[0]: player_opt_info = self.player_algo.optimize_agent( itr, player_samples) if self.agent.train_mask[1]: observer_opt_info = self.observer_algo.optimize_agent( itr, observer_samples) self.store_diagnostics(itr, player_traj_infos, observer_traj_infos, player_opt_info, observer_opt_info) if (itr + 1) % self.log_interval_itrs == 0: player_eval_traj_infos, observer_eval_traj_infos, eval_time = self.evaluate_agent( itr) if self.wandb_log: self.wandb_logging( itr, player_traj_infos=player_eval_traj_infos, observer_traj_infos=observer_eval_traj_infos) self.log_diagnostics(itr, player_eval_traj_infos, observer_eval_traj_infos, eval_time) self.shutdown()
def train(self):
    """
    Performs startup, then runs the optimizer for ``n_updates`` steps,
    pausing to run validation at the specified log interval.
    """
    self.startup()
    self.algo.train()
    for itr in range(self.n_updates):
        logger.set_iteration(itr)
        with logger.prefix(f"itr #{itr} "):
            opt_info = self.algo.optimize(itr)  # Perform one update.
            self.store_diagnostics(itr, opt_info)
            if (itr + 1) % self.log_interval_updates == 0:
                self.algo.eval()  # Switch to inference mode for validation.
                val_info = self.algo.validation(itr)
                self.log_diagnostics(itr, val_info)
                self.algo.train()  # Back to training mode.
    self.shutdown()
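# The loop above assumes an algo object exposing train/eval mode switches, a
# per-iteration `optimize`, and a `validation` pass. A skeletal sketch of
# that interface (method names match the calls above; bodies are
# placeholders, not the actual implementation):
class SupervisedAlgo:
    def train(self):
        """Put the model in training mode (e.g., enable dropout)."""
    def eval(self):
        """Put the model in inference mode."""
    def optimize(self, itr):
        """Run one gradient update; return opt_info for store_diagnostics."""
        return {}
    def validation(self, itr):
        """Run the validation set; return metrics for log_diagnostics."""
        return {}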
def train(self): """ Performs startup, evaluates the initial agent, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``. Pauses to evaluate the agent at the specified log interval. """ n_itr = self.startup() with logger.prefix(f"itr #0 "): eval_traj_infos, eval_time = self.evaluate_agent(0) self.log_diagnostics(0, eval_traj_infos, eval_time) for itr in range(n_itr): logger.set_iteration(self.get_cum_steps(itr)) with logger.prefix(f"itr #{itr} "): self.agent.sample_mode(itr) samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) opt_info = self.algo.optimize_agent(itr, samples) self.store_diagnostics(itr, traj_infos, opt_info) if (itr + 1) % self.log_interval_itrs == 0: eval_traj_infos, eval_time = self.evaluate_agent(itr) p_error = self.sampler.env_kwargs['error_rate'] for traj_info in eval_traj_infos: traj_info['p_error'] = p_error self.log_diagnostics(itr, eval_traj_infos, eval_time) avg_lifetime = np.nanmean( np.array([x['lifetime'] for x in eval_traj_infos])) if avg_lifetime > (1 / p_error): if p_error < 0.010: p_error = 0.011 self.sampler.env_kwargs['error_rate'] = p_error self.sampler.eval_env_kwargs[ 'error_rate'] = p_error print(f'new p error is {p_error}', flush=True) self.shutdown() self.startup() else: print( f'didnt change p_error - currently at {p_error}' ) # for env in self.sampler.collector.envs + self.sampler.eval_collector.envs: # env.p_phys = new_p_error # env.p_meas = new_p_error print(f'training end due to n_itr', flush=True) self.shutdown()
def train(self): """ Performs startup, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``, logging diagnostics at the specified interval. """ n_itr = self.startup() for itr in range(n_itr): logger.set_iteration(itr) with logger.prefix(f"itr #{itr} "): self.agent.sample_mode(itr) # Might not be this agent sampling. samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) opt_info = self.algo.optimize_agent(itr, samples) self.store_diagnostics(itr, traj_infos, opt_info) if (itr + 1) % self.log_interval_itrs == 0: self.log_diagnostics(itr) self.shutdown()
def train(self): """ Performs startup, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``, logging diagnostics at the specified interval. """ n_itr = self.startup() for itr in range(n_itr): logger.set_iteration(itr) with logger.prefix(f"itr #{itr} "): self.agent.sample_mode( itr) # Might not be this agent sampling. player_samples, player_traj_infos, observer_samples, observer_traj_infos = self.sampler.obtain_samples( itr) self.agent.train_mode(itr) player_opt_info = () observer_opt_info = () if self.alt_train: if self.agent.train_mask[0] and (itr % 2 == 0): player_opt_info = self.player_algo.optimize_agent( itr // 2, player_samples) elif self.agent.train_mask[1]: observer_opt_info = self.observer_algo.optimize_agent( itr // 2, observer_samples) else: if self.agent.train_mask[0]: player_opt_info = self.player_algo.optimize_agent( itr, player_samples) if self.agent.train_mask[1]: observer_opt_info = self.observer_algo.optimize_agent( itr, observer_samples) self.store_diagnostics(itr, player_traj_infos, observer_traj_infos, player_opt_info, observer_opt_info) if (itr + 1) % self.log_interval_itrs == 0: if self.wandb_log: self.wandb_logging(itr) self.log_diagnostics(itr) self.shutdown()
def train(self): """ Performs startup, then loops by alternating between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``, logging diagnostics at the specified interval. """ n_itr = self.startup() for itr in range(n_itr): logger.set_iteration(itr) with logger.prefix(f"itr #{itr} "): if itr % 200 == 0: # try to log distribution gradient norm of agent # gradients = [] policy_gradients = [] value_gradients = [] all_value_diffs = [] all_ratios = [] all_rewards = [] all_unnorm_rewards = [] num_iters = 100 all_returns = [] for i in range(num_iters): samples, traj_infos = self.sampler.obtain_samples(itr) returns = [ti.Return for ti in traj_infos] all_returns.extend(returns) # mb_grads, value_diffs, ratios, norm_rewards, unnorm_rewards = self.algo.compute_minibatch_gradients(samples) # gradients.extend(mb_grads) p_grads, v_grads, value_diffs, ratios, norm_rewards, unnorm_rewards = self.algo.compute_minibatch_gradients( samples) policy_gradients.extend(p_grads) value_gradients.extend(v_grads) all_value_diffs.extend(value_diffs) all_rewards.extend(norm_rewards.numpy()) all_unnorm_rewards.extend(unnorm_rewards.numpy()) # all_ratios.extend(ratios) # print('ratios', all_ratios) if i % 10 == 0: print('grad', i) # average all gradients # mean_gradient = np.mean(np.array(gradients), axis=0) mean_p_grad = np.mean(np.array(policy_gradients), axis=0) mean_v_grad = np.mean(np.array(value_gradients), axis=0) # compute gradient noises p_cosines = [] p_grad_noise_norms = [] v_cosines = [] v_grad_noise_norms = [] for i in range(num_iters): p_noise = np.linalg.norm(policy_gradients[i] - mean_p_grad) v_noise = np.linalg.norm(value_gradients[i] - mean_v_grad) p_grad_noise_norms.append(p_noise) v_grad_noise_norms.append(v_noise) p_cos = -1 * ( cosine(policy_gradients[i], mean_p_grad) - 1) v_cos = -1 * (cosine(value_gradients[i], mean_v_grad) - 1) p_cosines.append(p_cos) v_cosines.append(v_cos) print(p_grad_noise_norms) print(v_grad_noise_norms) np.save( '/home/vincent/repos/rlpyt/log/policy_gradnoisenorms' + str(itr) + '_' + str(self.seed), p_grad_noise_norms) np.save( '/home/vincent/repos/rlpyt/log/value_gradnoisenorms' + str(itr) + '_' + str(self.seed), v_grad_noise_norms) np.save( '/home/vincent/repos/rlpyt/log/policy_cosines' + str(itr) + '_' + str(self.seed), p_cosines) np.save( '/home/vincent/repos/rlpyt/log/value_cosines' + str(itr) + '_' + str(self.seed), v_cosines) np.save( '/home/vincent/repos/rlpyt/log/value_diffs' + str(itr) + '_' + str(self.seed), value_diffs) np.save( '/home/vincent/repos/rlpyt/log/norm_rewards' + str(itr) + '_' + str(self.seed), all_rewards) np.save( '/home/vincent/repos/rlpyt/log/unnorm_rewards' + str(itr) + '_' + str(self.seed), all_unnorm_rewards) np.save( '/home/vincent/repos/rlpyt/log/returns' + str(itr) + '_' + str(self.seed), all_returns) self.agent.sample_mode( itr) # Might not be this agent sampling. samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) opt_info, ratios, rews = self.algo.optimize_agent(itr, samples) if itr % 100 == 0: np.save( '/home/vincent/repos/rlpyt/log/adv_ratios' + str(itr) + '_' + str(self.seed), ratios) # np.save('/home/vincent/repos/rlpyt/log/rewards' + str(itr) + '_' + str(self.seed), # rews.numpy()) self.store_diagnostics(itr, traj_infos, opt_info) if (itr + 1) % self.log_interval_itrs == 0: self.log_diagnostics(itr) self.shutdown()
def train(self, return_buffer=False, check_running=True):
    """
    Performs startup, evaluates the initial agent, then loops by alternating
    between ``sampler.obtain_samples()`` and ``algo.optimize_agent()``.
    Pauses to evaluate the agent at the specified log interval.
    """
    best_itr = 0
    # initial_pi_loss = None
    # initial_q_loss = None
    # min_save_itr = self.min_save_args['min_save_itr']
    # min_save_pi_loss_ratio = self.min_save_args['min_save_pi_loss_ratio']
    # min_save_q_loss_ratio = self.min_save_args['min_save_q_loss_ratio']
    eval_reward_avg_all = []
    n_itr = self.startup()
    # Evaluate first - initialize initial and best eval reward (before running
    # random exploration) - load policy first, and then reset.
    with logger.prefix(f"itr eval"):
        self.agent.load_state_dict()
        eval_traj_infos, eval_time = self.evaluate_agent(itr=0)
        ini_eval_reward_avg = self.get_eval_reward(eval_traj_infos)
        # self.log_diagnostics(0, eval_traj_infos, eval_time)
        self.agent.reset_model()
        print(f'Initial eval reward: {ini_eval_reward_avg}')
        logger.log(f'Initial eval reward: {ini_eval_reward_avg}')
    # Initialize best reward.
    best_eval_reward_avg = ini_eval_reward_avg
    # best_eval_reward_avg = -1000  # dummy
    for itr in range(n_itr):
        logger.set_iteration(itr)
        with logger.prefix(f"itr #{itr} "):
            self.agent.sample_mode(itr)
            samples, traj_infos = self.sampler.obtain_samples(itr)
            self.agent.train_mode(itr)
            opt_info = self.algo.optimize_agent(itr, samples)
            save_cur = False
            # Detect min_itr_learn (random exploration with untrained policy):
            # no pi-loss entries means the algo has not started learning yet.
            min_itr_learn = len(opt_info.piLoss) == 0
            self.store_diagnostics(itr, traj_infos, opt_info)
            # It is possible that save_cur is never satisfied in any itr; then
            # do not update the policy for this retrain.
            if (itr + 1) % self.log_interval_itrs == 0:
                eval_traj_infos, eval_time = self.evaluate_agent(itr)
                eval_reward_avg = self.get_eval_reward(eval_traj_infos)
                # Do not save at initial itrs.
                if not min_itr_learn:
                    eval_reward_avg_all += [eval_reward_avg]
                    eval_reward_window = \
                        eval_reward_avg_all[-self.running_window_size:]
                    # Get running average.
                    if len(eval_reward_avg_all) >= self.running_window_size:
                        running_avg = np.mean(eval_reward_window)
                    else:
                        running_avg = -1000  # dummy
                    # Get running std from sums of powers:
                    # var = (n*sum(x^2) - sum(x)^2) / (n*(n-1)).
                    s0 = len(eval_reward_window)
                    s1 = sum(eval_reward_window)
                    s2 = sum(a * a for a in eval_reward_window)
                    running_std = (np.sqrt((s0 * s2 - s1 * s1) / (s0 * (s0 - 1)))
                        if s0 > 1 else float('inf'))  # Guard 1-sample window.
                    # Determine whether to save the current snapshot.
                    if (check_running
                            and (running_avg - ini_eval_reward_avg) > 0
                            and eval_reward_avg > best_eval_reward_avg
                            and running_std < self.running_std_thres):
                        best_eval_reward_avg = eval_reward_avg
                        best_itr = itr
                        save_cur = True
                    elif (not check_running
                            and eval_reward_avg > best_eval_reward_avg):
                        best_eval_reward_avg = eval_reward_avg
                        best_itr = itr
                        save_cur = True
                self.log_diagnostics(itr, eval_traj_infos, eval_time, save_cur)
                if (itr + 1) % 10 == 0:
                    logger.log(f'Average eval reward: {eval_reward_avg}')
                    print(f'Average eval reward at itr {itr}: {eval_reward_avg}')
    self.shutdown()
    if return_buffer:
        return best_itr, self.algo.replay_buffer_dict()
    else:
        return best_itr
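# The running-std computation above uses the sums-of-powers identity for the
# unbiased sample variance, which summarizes a window in a single pass:
# var = (n*sum(x^2) - (sum(x))^2) / (n*(n-1)). A quick check against numpy's
# two-pass version:
import numpy as np

def window_std(window):
    n = len(window)
    s1 = sum(window)
    s2 = sum(x * x for x in window)
    return np.sqrt((n * s2 - s1 * s1) / (n * (n - 1)))

window = [3.0, 1.0, 4.0, 1.0, 5.0]
assert np.isclose(window_std(window), np.std(window, ddof=1))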