def train_step(self, N,
               env=None,
               sample_mode='trajectories',
               horizon=1e6,
               gamma=0.995,
               gae_lambda=0.97,
               num_cpu='max',
               env_kwargs=None,
               ):

    # Clean up input arguments
    env = self.env.env_id if env is None else env
    if sample_mode != 'trajectories' and sample_mode != 'samples':
        print("sample_mode in NPG must be either 'trajectories' or 'samples'")
        quit()

    ts = timer.time()

    if sample_mode == 'trajectories':
        input_dict = dict(num_traj=N, env=env, policy=self.policy, horizon=horizon,
                          base_seed=self.seed, num_cpu=num_cpu, env_kwargs=env_kwargs)
        paths = trajectory_sampler.sample_paths(**input_dict)
    elif sample_mode == 'samples':
        input_dict = dict(num_samples=N, env=env, policy=self.policy, horizon=horizon,
                          base_seed=self.seed, num_cpu=num_cpu, env_kwargs=env_kwargs)
        paths = trajectory_sampler.sample_data_batch(**input_dict)

    if self.save_logs:
        self.logger.log_kv('time_sampling', timer.time() - ts)

    self.seed = self.seed + N if self.seed is not None else self.seed

    # compute returns
    process_samples.compute_returns(paths, gamma)
    # compute advantages
    process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda)
    # train from paths
    eval_statistics = self.train_from_paths(paths)
    eval_statistics.append(N)

    # log number of samples
    if self.save_logs:
        num_samples = np.sum([p["rewards"].shape[0] for p in paths])
        self.logger.log_kv('num_samples', num_samples)

    # fit baseline
    if self.save_logs:
        ts = timer.time()
        error_before, error_after = self.baseline.fit(paths, return_errors=True)
        self.logger.log_kv('time_VF', timer.time() - ts)
        self.logger.log_kv('VF_error_before', error_before)
        self.logger.log_kv('VF_error_after', error_after)
    else:
        self.baseline.fit(paths)

    return eval_statistics
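# --- Minimal usage sketch (not part of the listings above). It assumes the standard
# --- mjrl package layout; the environment id, seeds, and hyperparameters below are
# --- placeholders, not values taken from the source.
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG

e = GymEnv('mjrl_swimmer-v0')            # placeholder id; any registered env works
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=123)
baseline = MLPBaseline(e.spec, learn_rate=1e-3)
agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=123, save_logs=True)

# one policy update from 10 freshly sampled trajectories
stats = agent.train_step(N=10, sample_mode='trajectories', gamma=0.995, gae_lambda=0.97)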
def train_agent(job_name, agent,
                seed=0,
                niter=101,
                gamma=0.995,
                gae_lambda=None,
                num_cpu=1,
                sample_mode='trajectories',
                num_traj=50,
                num_samples=50000,  # has precedence, used with sample_mode = 'samples'
                save_freq=10,
                evaluation_rollouts=None,
                plot_keys=None,
                irl_kwargs=None,
                env_kwargs=None,
                temperature_decay=0.95,
                temperature_min=0,
                temperature_max=0,
                training_folder='Runs',
                should_fresh_start=False,
                run_no=None,
                fixed_evaluation_init_states=False):

    np.random.seed(seed)
    print("Job name:", job_name)
    training_path = os.path.join(training_folder, job_name)
    if plot_keys is None:
        plot_keys = ['stoc_pol_mean']
    if run_no is not None:
        training_path = check_run_folders(training_path, run_no)
    if not os.path.isdir(training_path):
        os.makedirs(training_path)
    previous_dir = os.getcwd()
    os.chdir(training_path)  # important! we are now in the directory to save data
    if os.path.isdir('iterations') == False:
        os.mkdir('iterations')
    if os.path.isdir('logs') == False and agent.save_logs == True:
        os.mkdir('logs')

    best_policy = copy.deepcopy(agent.policy)
    mean_evaluation_pol_performance = 0.0

    if isinstance(env_kwargs, dict):
        e = GymEnv(agent.env.env_id, **env_kwargs)
    else:
        e = GymEnv(agent.env.env_id)

    i_start = _load_latest_policy_and_logs(agent,
                                           policy_dir='iterations',
                                           logs_dir='logs',
                                           should_fresh_start=should_fresh_start)
    train_curve = agent.global_status['best_perf'] * np.ones(niter)

    def save_progress():
        if agent.save_logs:
            agent.logger.save_log('logs/')
            make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
        checkpoint_file = 'checkpoint_%i.pickle' % i
        pickle.dump(agent.checkpoint, open('iterations/' + checkpoint_file, 'wb'))
        # check if agent has a custom save_checkpoint function defined; if so, use it
        save_checkpoint_funct = getattr(agent, "save_checkpoint", None)
        if save_checkpoint_funct:
            save_checkpoint_funct(path='iterations/', iteration=i)
        pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))

    if i_start:
        print("Resuming from an existing job folder ...")

    for i in range(i_start, niter):
        print("......................................................................................")
        if run_no is not None:
            print("ITERATION : %i, RUN : %i " % (i, run_no))
        else:
            print("ITERATION : %i " % i)

        new_temperature = (temperature_max - temperature_min) * (temperature_decay ** i) + temperature_min
        if new_temperature < 0 or temperature_max == 0:
            new_temperature = 0
        agent.policy.set_temperature(new_temperature)
        if agent.save_logs:
            agent.logger.log_kv('temperature', new_temperature)

        if train_curve[i - 1] > agent.global_status['best_perf']:
            best_policy = copy.deepcopy(agent.policy)
            agent.global_status['best_perf'] = train_curve[i - 1]

        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N, itr=i, sample_mode=sample_mode, gamma=gamma,
                    gae_lambda=gae_lambda, num_cpu=num_cpu, env_kwargs=env_kwargs)

        # calculate the number of policy updates (used for IRL)
        policy_updates_count = calculate_policy_update_count(i, irl_kwargs)
        if irl_kwargs is not None:
            args['return_paths'] = True
        sampler_paths = []

        # do policy update
        for j in range(policy_updates_count):
            output = agent.train_step(**args)
            if isinstance(output, tuple):
                sampler_paths.extend(output[1])
                stats = output[0]
            else:
                stats = output
            if j == 0:
                train_curve[i] = stats[0]
            else:
                train_curve[i] = train_curve[i] + (1 / (1 + j) * (stats[0] - train_curve[i]))

        if agent.save_logs:
            agent.logger.log_kv('iteration', i)

        # IRL discriminator update
        if irl_kwargs is not None:
            agent.fit_irl(sampler_paths,
                          main_loop_step=i,
                          main_loop_percentage=i / niter,
                          num_cpu=num_cpu,
                          policy_updates_count=policy_updates_count)

        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths(num_traj=evaluation_rollouts,
                                      policy=agent.policy,
                                      num_cpu=num_cpu,
                                      env=e.env_id,
                                      eval_mode=True,
                                      base_seed=seed,
                                      env_kwargs=env_kwargs,
                                      fixed_init_states=fixed_evaluation_init_states)
            if hasattr(agent, "irl_model"):
                eval_paths = agent.eval_irl(eval_paths, training_paths_from_policy=False)
            mean_evaluation_pol_performance = np.mean(
                [np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_evaluation_pol_performance)
                eval_success_rate = e.env.evaluate_success(eval_paths)
                agent.logger.log_kv('eval_success_rate', eval_success_rate)

        if agent.save_logs:
            agent.logger.align_rows()

        if i % save_freq == 0 and i > 0:
            save_progress()

        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " %
              (timer.asctime(timer.localtime(timer.time())), i, train_curve[i],
               mean_evaluation_pol_performance, agent.global_status['best_perf']))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" %
                          (i, train_curve[i], mean_evaluation_pol_performance,
                           agent.global_status['best_perf']))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
                                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    if i_start < niter:
        save_progress()
    else:
        print("Requested iteration number equal to the found checkpoint iteration count. All done, exiting.")
    os.chdir(previous_dir)
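# --- Hedged call sketch (not part of the listings above): launching the training loop
# --- for an `agent` built for this codebase. The job name and hyperparameters are
# --- placeholders, not values taken from the source.
train_agent(job_name='swimmer_irl_exp',
            agent=agent,
            seed=123,
            niter=101,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
            sample_mode='trajectories',
            num_traj=50,
            save_freq=10,
            evaluation_rollouts=5,
            training_folder='Runs')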
            seed=SEED,
            # hvp_sample_frac=job_data['hvp_frac'],
            normalized_step_size=job_data['step_size'],
            save_logs=True)

paths = []
for outer_iter in range(job_data['num_iter']):

    ts = timer.time()
    print("================> ITERATION : %i " % outer_iter)
    print("Getting interaction data from real dynamics ...")
    if outer_iter == 0:
        iter_paths = trajectory_sampler.sample_paths(
            job_data['n_init_paths'], agent.env, agent.policy,
            eval_mode=False, base_seed=SEED)
    else:
        iter_paths = sample_paths(num_traj=job_data['paths_per_iter'],
                                  env=agent.env, policy=agent.policy,
                                  eval_mode=False, base_seed=SEED + outer_iter)

    # reset the environment (good for hardware)
    e.reset()
    for p in iter_paths:
        paths.append(p)
def train_agent(job_name, agent,
                seed=0,
                niter=101,
                gamma=0.995,
                gae_lambda=None,
                num_cpu=1,
                sample_mode='trajectories',
                num_traj=50,
                num_samples=50000,  # has precedence, used with sample_mode = 'samples'
                save_freq=10,
                evaluation_rollouts=None,
                plot_keys=['stoc_pol_mean'],
                ):

    np.random.seed(seed)
    if os.path.isdir(job_name) == False:
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)  # important! we are now in the directory to save data
    if os.path.isdir('iterations') == False:
        os.mkdir('iterations')
    if os.path.isdir('logs') == False and agent.save_logs == True:
        os.mkdir('logs')
    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf * np.ones(niter)
    mean_pol_perf = 0.0
    e = GymEnv(agent.env.env_id)

    for i in range(niter):
        print("......................................................................................")
        print("ITERATION : %i " % i)

        if train_curve[i - 1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i - 1]

        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N, sample_mode=sample_mode, gamma=gamma,
                    gae_lambda=gae_lambda, num_cpu=num_cpu)
        stats = agent.train_step(**args)
        train_curve[i] = stats[0]

        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths(num_traj=evaluation_rollouts,
                                      policy=agent.policy,
                                      num_cpu=num_cpu,
                                      env=e.env_id,
                                      eval_mode=True,
                                      base_seed=seed)
            mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)

        if i % save_freq == 0 and i > 0:
            if agent.save_logs:
                agent.logger.save_log('logs/')
                make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
            policy_file = 'policy_%i.pickle' % i
            baseline_file = 'baseline_%i.pickle' % i
            pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
            pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb'))
            pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))

        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " %
              (timer.asctime(timer.localtime(timer.time())),
               i, train_curve[i], mean_pol_perf, best_perf))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" %
                          (i, train_curve[i], mean_pol_perf, best_perf))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
                                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
    os.chdir(previous_dir)
            save_freq=5,
            evaluation_rollouts=None)
print("========================================")
print("Expert policy training complete !!!")
print("========================================")
print("time taken = %f" % (timer.time() - ts))
print("========================================")

# ------------------------------
# Get demonstrations
print("========================================")
print("Collecting expert demonstrations")
print("========================================")
expert_pol = pickle.load(open('swimmer_exp1/iterations/best_policy.pickle', 'rb'))
demo_paths = sample_paths(num_traj=5, policy=expert_pol, env=e.env_id)

# ------------------------------
# Train BC
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=64, lr=1e-3)  # will use Adam by default
ts = timer.time()
print("========================================")
print("Running BC with expert demonstrations")
print("========================================")
bc_agent.train()
print("========================================")
print("BC training complete !!!")
print("time taken = %f" % (timer.time() - ts))
print("========================================")
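# --- Hedged follow-up sketch (not in the original script): score the cloned policy by
# --- sampling a few evaluation rollouts and averaging the returns.
bc_eval_paths = sample_paths(num_traj=5, policy=policy, env=e.env_id,
                             eval_mode=True, base_seed=SEED)
bc_score = np.mean([np.sum(p['rewards']) for p in bc_eval_paths])
print("BC policy mean return = %f" % bc_score)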
def train_step(self, N, itr,
               env=None,
               sample_mode='trajectories',
               horizon=1e6,
               gamma=0.995,
               gae_lambda=0.97,
               num_cpu='max',
               env_kwargs=None,
               return_paths=False,
               ):

    # Clean up input arguments
    env = self.env.env_id if env is None else env
    if sample_mode != 'trajectories' and sample_mode != 'samples':
        print("sample_mode in NPG must be either 'trajectories' or 'samples'")
        quit()

    ts = timer.time()

    if sample_mode == 'trajectories':
        input_dict = dict(num_traj=N, env=env, policy=self.policy, horizon=horizon,
                          base_seed=self.seed, num_cpu=num_cpu, env_kwargs=env_kwargs)
        paths = trajectory_sampler.sample_paths(**input_dict)
        if self.augmentation is not None:
            paths = self.augmentation.augment_paths(
                paths, num_cpu=num_cpu,
                augment_times=self.direct_learning_augment_samples_count)
    elif sample_mode == 'samples':
        input_dict = dict(num_samples=N, env=env, policy=self.policy, horizon=horizon,
                          base_seed=self.seed, num_cpu=num_cpu, env_kwargs=env_kwargs)
        paths = trajectory_sampler.sample_data_batch(**input_dict)
    else:
        raise ValueError(
            "sample_mode has to be either trajectories or samples, given: %s" % sample_mode)

    if self.save_logs:
        self.logger.log_kv('time_sampling', timer.time() - ts)

    self.seed = self.seed + N if self.seed is not None else self.seed

    if return_paths:
        original_paths = paths.copy()
    if self.dump_paths:
        self.fusion.save_itr_paths(itr=itr, paths=paths)
    if hasattr(self, "irl_model"):
        paths = self.eval_irl(paths)
        if hasattr(self, "demo_paths") and self.demo_paths is not None:
            self.demo_paths = self.eval_irl(self.demo_paths,
                                            training_paths_from_policy=False)

    # compute returns
    process_samples.compute_returns(paths, gamma)
    # compute advantages
    process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda)
    # train from paths
    eval_statistics = self.train_from_paths(paths)
    eval_statistics.append(N)

    # log number of samples
    if self.save_logs:
        num_samples = np.sum([p["rewards"].shape[0] for p in paths])
        self.logger.log_kv('num_samples', num_samples)

    # fit baseline
    if self.save_logs:
        ts = timer.time()
        error_before, error_after = self.baseline.fit(paths, return_errors=True)
        self.logger.log_kv('time_VF', timer.time() - ts)
        self.logger.log_kv('VF_error_before', error_before)
        self.logger.log_kv('VF_error_after', error_after)
    else:
        self.baseline.fit(paths)

    if return_paths:
        return eval_statistics, original_paths
    else:
        return eval_statistics
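# --- Hedged usage sketch (not part of the listing above): with return_paths=True the
# --- freshly sampled trajectories come back alongside the statistics, mirroring how
# --- train_agent above collects them for the IRL discriminator update. The numeric
# --- arguments are placeholders.
stats, sampled_paths = agent.train_step(N=50, itr=0, sample_mode='trajectories',
                                        gamma=0.995, gae_lambda=0.97,
                                        num_cpu=4, return_paths=True)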