def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ, batch_size):
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    # aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim, env_name)
    scaler.resume()
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    episode = 0
    capture = False
    while episode < num_episodes:
        if VideoSave and not capture:
            env.ScreenCapture(5)
            capture = True
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    # capture = False
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        # if episode > 600 and not capture:
        #     env.ScreenCapture(5)
        #     capture = True
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
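The helpers add_disc_sum_rew and add_gae are called above but not defined in this section. A minimal sketch of what they are assumed to do, based on how the same quantities are computed explicitly in the Experiment classes below (TD residuals discounted by gamma * lam); the trajectory key names 'rewards', 'values', 'disc_sum_rew', and 'advantages' are assumptions, not confirmed from the original helpers:

import numpy as np

def discount(x, gamma):
    # backward discounted cumulative sum: y[t] = x[t] + gamma * y[t+1]
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

def add_disc_sum_rew(trajectories, gamma):
    # attach the discounted sum of rewards to each trajectory (sketch)
    for t in trajectories:
        t['disc_sum_rew'] = discount(t['rewards'], gamma)

def add_gae(trajectories, gamma, lam):
    # attach GAE advantages: discounted sum of TD residuals (sketch);
    # assumes add_value() already stored V(s_t) under 'values'
    for t in trajectories:
        values = t['values']
        rewards = t['rewards']
        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), with V = 0 past the end
        deltas = rewards + gamma * np.append(values[1:], 0) - values
        t['advantages'] = discount(deltas, gamma * lam)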
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(self.env,
                                                       ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0]  # + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.episodes = 20
        self.killer = GracefulKiller()
        # self.policy = ProximalPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target,
        #                              discount=discount, lamb=lamb)
        self.policy = NoTracePolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)
        # self.value_func = ValueFunc(self.obs_dim, discount=discount, lamb=1)

        if not show:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        print('fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                # obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        # print(observation_samples.shape)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        Collect data only (no training).
        :return: (observes, actions, rewards) for one episode
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            # obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            obs = obs_new
            step += 0.003
        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor * sum + i)
            sum = factor * sum + i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards}
            # scale rewards
            if self.discount < 0.999:
                rewards = rewards * (1 - self.discount)
            trajectory['values'] = self.value_func.predict(observes)
            trajectory['mc_return'] = self.discounted_sum(rewards, self.discount)
            trajectory['td_residual'] = rewards + self.discount * np.append(trajectory['values'][1:], 0) \
                - trajectory['values']
            trajectory['gae'] = self.discounted_sum(trajectory['td_residual'], self.discount * self.lamb)
            trajectories.append(trajectory)
        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            i += len(trajectories)
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            mc_returns = np.concatenate([t['mc_return'] for t in trajectories])
            # advantages = np.concatenate([t['td_residual'] for t in trajectories])
            advantages = np.concatenate([t['gae'] for t in trajectories])

            # normalize advantage estimates
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            value_func_loss = self.value_func.update(observes, mc_returns)
            policy_loss, kl, entropy, beta = self.policy.update(observes, actions, advantages)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')

            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1) % 20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from + 'policy/policy.pl', tensor_name='',
        #                                       all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
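Most of the loops in this section poll killer.kill_now to offer a clean exit on Ctrl-C, but GracefulKiller itself is not shown. A minimal sketch of the assumed signal-handler pattern:

import signal

class GracefulKiller:
    """Set kill_now when SIGINT/SIGTERM arrives, so the training loop can
    finish the current batch and ask before exiting (sketch, assumed interface)."""

    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True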
class GeneratorAgentPure(object): def __init__(self, env, policy_function, value_function, discriminator, gamma, lam, init_qpos, init_qvel, logger=None): self.env = env self.obs_dim = env.observation_space.shape[0] self.act_dim = env.action_space.shape[0] self.policy = policy_function self.value = value_function self.discriminator = discriminator self.gamma = gamma self.lam = lam self.init_qpos = init_qpos self.init_qvel = init_qvel self.scaler = Scaler(self.obs_dim) # logger self.logger = logger # set scaler's scale and offset by collecting 5 episodes self.collect(timesteps=2048) def discount(self, x, gamma): return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1] def get_random(self): idx = np.random.randint(low=0, high=self.init_qpos.shape[1], size=1) return np.squeeze(self.init_qpos[:, idx]), np.squeeze(self.init_qvel[:, idx]) def collect(self, timesteps): trajectories = [] trew_stat = [] scale, offset = self.scaler.get() self.logger.log('scale_offset', [scale, offset]) buffer_time = 0 while buffer_time < timesteps: unscaled_obs, scaled_obs, actions, rewards = [], [], [], [] egocentric = [] done = False obs = self.env.reset() qpos, qvel = self.get_random() # we are setting initial qpos and qvel from expert self.env.set_state(qpos, qvel) timestep = 0 while not done and timestep < 1000: obs = obs.astype(np.float32).reshape(1, -1) unscaled_obs.append(obs) obs = (obs - offset) * scale scaled_obs.append(obs) acts = self.policy.sample(obs) actions.append(acts.astype(np.float32).reshape(1, -1)) obs, rew, done, _ = self.env.step(acts) rewards.append(rew) timestep += 1 buffer_time += 1 # statistics trew_stat.append(np.sum(rewards)) # episode info traj_obs = np.concatenate(scaled_obs) traj_unscaled_obs = np.concatenate(unscaled_obs) traj_acts = np.concatenate(actions) #traj_rews = np.array(rewards, dtype=np.float64) traj_rews = np.squeeze( self.discriminator.get_rewards(traj_unscaled_obs, traj_acts)) # scale rewards using running std of the experiment # traj_scaled_rews = traj_rews * np.squeeze(rew_scale) traj_scaled_rews = traj_rews # calculate discount sum of rewards traj_disc_rews = self.discount(traj_scaled_rews, self.gamma) # calculate advantages traj_values = self.value.predict(traj_obs) deltas = traj_scaled_rews - traj_values + np.append( traj_values[1:] * self.gamma, 0) traj_advantages = self.discount(deltas, self.gamma * self.lam) trajectory = { 'observations': traj_obs, 'actions': traj_acts, 'tdlam': traj_disc_rews, 'advantages': traj_advantages, 'unscaled_obs': traj_unscaled_obs } trajectories.append(trajectory) # update observation scaler uns_obs = np.concatenate([t['unscaled_obs'] for t in trajectories]) self.scaler.update(uns_obs) # update rewards scaler #uns_rews = np.concatenate([t['unscaled_rews'] for t in trajectories]) #self.rew_scaler.update(uns_rews) observations = np.concatenate( [t['observations'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) tdlam = np.concatenate([t['tdlam'] for t in trajectories]) advantages = np.concatenate([t['advantages'] for t in trajectories]) advantages = (advantages - np.mean(advantages)) / np.std(advantages) # check stats print('mean_trew: %f' % np.mean(trew_stat)) self.logger.log('trew_stat', np.mean(trew_stat)) return observations, uns_obs, actions, tdlam, advantages
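The discount method above computes discounted returns with scipy.signal.lfilter on the reversed sequence; it is equivalent to the explicit backward recursion used by discounted_sum elsewhere in this section. A small sanity check with illustrative values:

import numpy as np
import scipy.signal

def discount_lfilter(x, gamma):
    # y[t] = x[t] + gamma * y[t+1], computed by filtering the reversed signal
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

def discount_loop(x, gamma):
    out, running = [], 0.0
    for v in reversed(x):
        running = v + gamma * running
        out.append(running)
    return np.array(out[::-1])

rewards = np.array([1.0, 0.0, 2.0, 1.0])
assert np.allclose(discount_lfilter(rewards, 0.99), discount_loop(rewards, 0.99))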
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef, use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj, load_model): env, obs_dim, act_dim = init_gym(env_name) set_global_seeds(seed) env.seed(seed) env._max_episode_steps = max_timesteps obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories aigym_path = os.path.join('log-files/', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ, epochs, phi_epochs, policy_size=policy_size, phi_hidden_sizes=phi_hs, reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj) run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) # run a few to init scaler episode = 0 for i in range(2000): print("sampling and training at %s iteration\n" % (i)) trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) num_traj = len(trajectories) episode += len(trajectories) add_value(trajectories, val_func) add_disc_sum_rew(trajectories, gamma) add_gae(trajectories, gamma, lam) observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) policy.update(load_model, observes, actions, advantages, use_lr_adjust, ada_kl_penalty, c=0.) # update policy val_func.fit(observes, disc_sum_rew) # Save models policy.save_policy() val_func.save_val_func() refine_scaler = False if refine_scaler == True: run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps, mode=load_model) # run a few to refine scaler with open('models/scaler/scaler.pkl', 'wb') as output: pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL) logger.log("saved model")
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(env_name, obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False scale, offset = scaler.get() data = {'SCALE': scale, 'OFFSET': offset} directory_to_store_data = '../saved_models/' + env_name + '/' if not os.path.exists(directory_to_store_data): os.makedirs(directory_to_store_data) file_name = directory_to_store_data + 'scale_and_offset.pkl' with open(file_name, 'wb') as f: pickle.dump(data, f) logger.close() policy.close_sess() val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() #TODO Change init_gym for one of my functions env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace( ":", "_") # create unique directories logger = Logger(logname=env_name, now=now) pathFolder = logger.pathFolder #Change wrappers.Monitor for a class of mine that controls de simulation #Creo que el wrapper no sirve de nada para mi ejemplo #env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ) #Esto es para alimentar con el optimo trajectories = initiatePolicyWithOptimum(env, policy, scaler, logger) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) print(actions.shape) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function # No estoy seguro de si esto es necesario ya # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess(pathFolder) val_func.close_sess(pathFolder)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, max_frames, load_and_run): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) print("Env loaded.") obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) # env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) if load_and_run: val_func.load_weights() while True: run_episode(env, policy, scaler, animate=True) exit() episode = 0 print("Running episodes...") while episode < num_episodes: episode_startime = time.time() trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size, max_frames=max_frames) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs try: add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) except Exception as e: print(e) print('skipping...') continue # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False print("Batch took %i seconds to run." % (time.time() - episode_startime)) if not episode % 1000: val_func.save_weights() run_episode(env, policy, scaler, animate=True) logger.close() policy.close_sess() val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, net_size_factor, noise_bias, weight, use_ppoclip): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") # create unique directories now = datetime.now().strftime("%b-%d_%H:%M:%S") + "_single" logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) # env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) if weight == "None": val_func = NNValueFunction(obs_dim, net_size_factor=net_size_factor) policy = None if use_ppoclip == "False": policy = Policy(obs_dim, act_dim, kl_targ, net_size_factor=net_size_factor, noise_bias=noise_bias) elif use_ppoclip == "True": policy = PolicyClip(obs_dim, act_dim, kl_targ, net_size_factor=net_size_factor, noise_bias=noise_bias) #assert False, "Not tested" else: assert False, "Unreachable" else: token = weight.split(".") token[-3] = token[-3][:-5] + "value" weight_2 = ".".join(token) val_func = NNValueFunctionContinue(weight_2, obs_dim, net_size_factor=net_size_factor) policy = PolicyContinue(weight, obs_dim, act_dim, kl_targ, net_size_factor=net_size_factor, noise_bias=noise_bias) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger, scaler) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() # with open("test_dump", 'w') as f: # pickle.dump(policy, f) policy.close_sess() val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar,
         weights_path, init_episode, experiment_name, resume, augment=False):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    logger = Logger(logname=env_name, sub_dir=experiment_name)
    aigym_path = os.path.join('results', env_name, experiment_name)
    if resume:
        weights_path = aigym_path
        ckpt = tf.train.get_checkpoint_state(weights_path)
        init_episode = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim = 45
    # obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # env = wrappers.Monitor(env, aigym_path, force=True)
    if augment:
        obs_dim *= 2
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, weights_path)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, 5, augment)
    episode = init_episode
    while episode <= num_episodes:
        if episode % 1000 == 0:
            # record one episode
            record(env_name, aigym_path, policy, scaler, augment)
            policy.save(aigym_path, episode)
        trajectories = run_policy(env, policy, scaler, logger, batch_size, augment)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    # record one last episode
    record(env_name, aigym_path, policy, scaler, augment)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
gamma = 0.995
lam = 0.98
batch_size = 5

env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
act_dim = env.action_space.shape[0]

# sess = tf.Session()
policy = Policy(obs_dim, act_dim)
val_func = NNValueFunction(obs_dim)
# sess.run(tf.compat.v1.initializers.global_variables())

now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
logger = Logger(logname=env_name, now=now)
scaler = Scaler(obs_dim)
run_policy(env, policy, scaler, logger, episodes=5)

episode = 0
while episode < num_episodes:
    trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
    episode += len(trajectories)
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories, val_func, gamma, lam)
    policy.update(observes, actions, advantages, logger)
    val_func.fit(observes, disc_sum_rew, logger)
    logger.log({
        '_Episode': episode,
    })
    logger.write(display=True)
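Every variant in this section normalizes observations with a Scaler whose get() returns (scale, offset) such that obs_scaled = (obs - offset) * scale, and whose update() ingests batches of raw observations. The class itself is not included, and its constructors differ between snippets (some take env_name and expose resume()/save()). A minimal running mean/variance sketch consistent with the get()/update() interface; the 1/(3*std) scaling is an assumption:

import numpy as np

class Scaler(object):
    """Running mean/std estimator for observation normalization (sketch)."""

    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.m = 0  # number of samples seen so far
        self.first_pass = True

    def update(self, x):
        # x: 2-D array of raw observations, shape (n, obs_dim)
        if self.first_pass:
            self.means = np.mean(x, axis=0)
            self.vars = np.var(x, axis=0)
            self.m = x.shape[0]
            self.first_pass = False
        else:
            n = x.shape[0]
            new_means = (self.means * self.m + np.sum(x, axis=0)) / (self.m + n)
            # combine variances via E[x^2] - E[x]^2
            self.vars = ((self.m * (self.vars + self.means ** 2) + np.sum(x ** 2, axis=0))
                         / (self.m + n) - new_means ** 2)
            self.vars = np.maximum(0.0, self.vars)
            self.means = new_means
            self.m += n

    def get(self):
        # scale chosen so that roughly 3 standard deviations map to 1 (assumed convention)
        return 1.0 / (np.sqrt(self.vars) + 0.1) / 3, self.means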
class Experiment:

    def __init__(self, discount, num_iterations, lamb, animate, kl_target, **kwargs):
        self.env_name = 'RoboschoolHumanoidFlagrun-v1'
        self.env = gym.make(self.env_name)
        gym.spaces.seed(1234)  # for reproducibility
        self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate

        self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)  # 1000000 is the size used in the paper
        self.episodes = 20  # larger batches of episodes can reduce variance
        self.killer = GracefulKiller()

        self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if 'show' in kwargs and not kwargs['show']:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('Observation dimension:', self.obs_dim)
        print('Action dimension:', self.act_dim)

        # The use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        Collect observations from 5 episodes to initialize the Scaler.
        :return: a properly initialized scaler
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Transform and update the scaler on the fly.
        :param obs: Raw observation
        :return: normalized observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        Collect a trajectory of (obs, act, reward, obs_next).
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature to normalized observation
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            obs = obs_new
            step += 0.003
        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        """
        Discounted sum of return or advantage estimates along a trajectory.
        :param l: a list containing the values to be discounted and summed
        :param factor: discount factor in the disc_sum case, or discount*lambda for GAE
        :return: discounted sum of l with regard to factor
        """
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor * sum + i)
            sum = factor * sum + i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        Gather a batch of trajectory samples.
        :param episodes: size of batch
        :return: a batch of samples
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards,
                          'scaled_rewards': rewards * (1 - self.discount)}
            trajectories.append(trajectory)
        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            print('buffer size:', self.buffer.size())

            i += len(trajectories)

            # For E=20, T=50, the total number of samples would be 1000.
            # In the future this needs to account for non-uniform time steps per episode,
            # e.g. in the Hopper-v2 environment not every episode has the same number of steps.
            # E = len(trajectories)
            # num_samples = np.sum([len(t['rewards']) for t in trajectories])
            gradient_steps = np.sum([len(t['rewards']) for t in trajectories])

            """train critic"""
            # train on all samples in the buffer, to the extreme:
            # self.critic.fit(self.policy, self.buffer, epochs=20, num_samples=self.buffer.size())
            # train on some minibatches of samples only:
            critic_loss_mean, critic_loss_std = self.critic.another_fit_func(self.policy, self.buffer,
                                                                             gradient_steps)

            """calculation of episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([self.discounted_sum(t['scaled_rewards'], self.discount)
                                         for t in trajectories])

            """using current batch of samples to update baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)

            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # Is it really legitimate to insert 0 at the last obs?
                t['td_residual'] = t['scaled_rewards'] + self.discount * np.append(t['values'][1:], 0) \
                    - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])

            """normalize advantage estimates -- crucial step"""
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            """compute control variate"""
            cv = self.critic.get_contorl_variate(self.policy, observes, actions)
            # cv must not be centered
            # cv = (cv - cv.mean()) / (cv.std() + 1e-6)

            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages * cv]

            """center learning signal"""
            # check that advantages and CV should be of size E*T
            # eta controls the on-off of the control variate
            learning_signal = advantages - eta * cv
            # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6)

            """controlled Taylor eval term"""
            ctrl_taylor = np.concatenate([[eta[i] * act] for i, act in
                                          enumerate(self.critic.get_taylor_eval(self.policy, observes))])

            """policy update"""
            ppo_loss, ddpg_loss, kl, entropy, beta = self.policy.update(observes, actions,
                                                                        learning_signal, ctrl_taylor)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # save training statistics
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['critic_loss'] = critic_loss_mean
            log['policy_ppo_loss'] = ppo_loss
            log['policy_ddpg_loss'] = ddpg_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['critic_loss', 'policy_ppo_loss', 'policy_ddpg_loss', 'value_func_loss',
                        'kl', 'entropy', 'beta']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')

            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled
            self.log_file.flush()

            # save model weights if stopped early
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.critic.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        """
        Load all function approximators plus a Scaler; the replay buffer is not restored.
        :param load_from: directory containing saved weights
        """
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from + 'policy/policy.pl', tensor_name='',
        #                                       all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/')
        self.value_func.load(load_from + 'value_func/')
        self.critic.load(load_from + 'critic/')
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)

    def demonstrate_agent(self, load_from):
        """
        Simply run the policy without training.
        :param load_from: directory containing saved weights
        """
        self.load_model(load_from)
        while True:
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
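A hypothetical driver for the Q-Prop Experiment class above; the argument values are illustrative only, and OUTPATH is assumed to be a module-level output directory defined elsewhere in the original code:

if __name__ == '__main__':
    # hypothetical entry point with illustrative hyperparameters
    expr = Experiment(discount=0.995,
                      num_iterations=2000,
                      lamb=0.98,
                      animate=False,
                      kl_target=0.003,
                      show=False)   # show=False enables log/weight saving to OUTPATH
    expr.run_expr()
    # to replay a trained agent instead of training:
    # expr.demonstrate_agent('path/to/saved/run/')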
class Agent:

    def __init__(self, env, name, chief=None):
        assert name == 'chief' or 'worker' in name
        if 'worker' in name:
            assert chief is not None
            self.chief = chief
        else:
            self.scaler = Scaler(Config.data.state_dim)
        self.name = name
        self.env = env
        self.sess = None
        self.coord = None

        with tf.variable_scope(name):
            self._build_graph()

    def _build_graph(self):
        self.actor = Actor()
        self.critic = Critic()
        if 'worker' in self.name:
            self._build_update_op()

    def _build_update_op(self):
        global_step = tf.train.get_global_step()
        tf.assign_add(global_step, 1, name='global_step_add')

        with tf.variable_scope('sync'):
            with tf.variable_scope('pull'):
                pull_a_params_op = [
                    actor_param.assign(chief_param)
                    for actor_param, chief_param in zip(self.actor.params, self.chief.actor.params)
                ]
                pull_c_params_op = [
                    critic_param.assign(chief_param)
                    for critic_param, chief_param in zip(self.critic.params, self.chief.critic.params)
                ]
                self.pull_op = tf.group(pull_a_params_op + pull_c_params_op)

            with tf.variable_scope('push'):
                update_a_op = self.chief.actor.optimizer.apply_gradients(
                    zip(self.actor.grads, self.chief.actor.params))
                update_c_op = self.chief.critic.optimizer.apply_gradients(
                    zip(self.critic.grads, self.chief.critic.params))
                self.update_op = tf.group([update_a_op, update_c_op])

    def init_scaler(self, init_episode=5):
        for e in range(init_episode):
            observation = self.env.reset()
            states = []
            done = False
            count = 0
            while not done:
                states.append(observation)
                action = self.choose_action(observation)
                next_observation, reward, done, info = self.env.step(action)
                observation = next_observation

                if Config.train.get('max_episode_steps', None):
                    count += 1
                    if count == Config.train.max_episode_steps:
                        break
            self.scaler.update(np.array(states))

    def update_chief(self, states, actions, target_v):
        feed_dict = {self.critic.states: states}
        value = self.sess.run(self.critic.value, feed_dict)
        td_error = np.array(target_v) - value
        feed_dict = {
            self.critic.states: states,
            self.critic.target_v: target_v,
            self.actor.states: states,
            self.actor.actions: actions,
            self.actor.td_error: td_error
        }
        self.sess.run([
            self.critic.loss, self.update_op,
            self.name + '/global_step_add:0'
        ], feed_dict)

    def pull_params(self):
        self.sess.run(self.pull_op)

    def cal_target_v(self, done, next_observation, rewards):
        if done:
            next_value = 0
        else:
            next_value = self.sess.run(
                self.critic.value,
                {self.critic.states: [self.chief.scaler.normalize(next_observation)]})[0, 0]
        target_v = []
        for reward in rewards[::-1]:
            next_value = reward + Config.train.reward_decay * next_value
            target_v.append([next_value])
        target_v.reverse()
        return target_v

    def choose_action(self, observation):
        if Config.data.action_type == 'discrete':
            policy = self.sess.run(self.actor.policy, {self.actor.states: [observation]})[0]
            action = np.random.choice(range(Config.data.action_num), p=policy)
        else:
            action = self.sess.run(self.actor.sample, {self.actor.states: [observation]})
        return action

    def eval(self, animate=False):
        assert self.name == 'chief'
        observation = self.env.reset()
        ep_reward = 0
        count = 0
        done = False
        while not done:
            if animate:
                self.env.render()
            action = self.choose_action(self.scaler.normalize(observation))
            next_observation, reward, done, info = self.env.step(action)
            ep_reward += reward
            observation = next_observation

            if Config.train.get('max_episode_steps', None):
                count += 1
                if count == Config.train.max_episode_steps:
                    break
        return ep_reward

    def work(self):
        total_step = 0
        states, actions, rewards, unscaled_states = [], [], [], []
        self.pull_params()

        while not self.coord.should_stop():
            observation = self.env.reset()
            ep_reward = 0
            done = False
            count = 0
            while not done:
                unscaled_states.append(observation)
                observation = self.chief.scaler.normalize(observation)
                states.append(observation)
                action = self.choose_action(observation)
                next_observation, reward, done, info = self.env.step(action)
                total_step += 1
                ep_reward += reward
                actions.append(action)
                rewards.append(reward)

                if total_step % Config.train.update_n_iter == 0 or done:
                    target_v = self.cal_target_v(done, next_observation, rewards)
                    self.update_chief(states, actions, target_v)
                    self.chief.scaler.update(np.array(unscaled_states))
                    states, actions, rewards, unscaled_states = [], [], [], []
                    self.pull_params()

                observation = next_observation

                if Config.train.get('max_episode_steps', None):
                    count += 1
                    if count == Config.train.max_episode_steps:
                        break
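The work() loop above relies on self.sess and self.coord being assigned externally. A hedged sketch of how a chief plus workers might be wired together with Python threads; make_env is a placeholder, and Config, Actor, and Critic are assumed to be defined as in the class above:

import threading
import tensorflow as tf

def train(num_workers=4):
    # chief holds the shared scaler and the parameters workers pull from / push to
    chief = Agent(make_env(), 'chief')
    tf.train.create_global_step()  # workers' update op increments this
    workers = [Agent(make_env(), 'worker_%d' % i, chief=chief)
               for i in range(num_workers)]

    sess = tf.Session()
    coord = tf.train.Coordinator()
    sess.run(tf.global_variables_initializer())

    chief.sess = sess
    chief.init_scaler()  # seed the shared scaler with a few random-policy episodes

    threads = []
    for w in workers:
        w.sess, w.coord = sess, coord
        t = threading.Thread(target=w.work)
        t.start()
        threads.append(t)
    coord.join(threads)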
def main2(env_name, num_episodes, gamma, lam, kl_targ, batch_size, net_size_factor, noise_bias,
          weight, use_ppoclip):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    global alive_coef, progress_coef, threshold1, threshold2, change_rate
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    now = datetime.now().strftime("%b-%d_%H:%M:%S") + "_multi_hop_{},{},{}".format(
        change_rate, threshold1, threshold2)
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)

    if weight == "None":
        val_func = NNValueFunction(obs_dim, net_size_factor=net_size_factor,
                                   alive_coef=alive_coef, progress_coef=progress_coef,
                                   reward_dim=reward_dim)
        policy = Policy(obs_dim, act_dim, kl_targ, net_size_factor=net_size_factor,
                        noise_bias=noise_bias)
    else:
        token = weight.split(".")
        token[-3] = token[-3][:-5] + "value"
        weight_2 = ".".join(token)
        # assert False, "unreachable"
        val_func = NNValueFunctionContinue(weight_2, obs_dim, net_size_factor=net_size_factor,
                                           alive_coef=alive_coef, progress_coef=progress_coef)
        policy = PolicyContinue(weight, obs_dim, act_dim, kl_targ,
                                net_size_factor=net_size_factor, noise_bias=noise_bias)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    flag1 = False
    flag2 = False
    flag3 = False
    reward_queue = []
    queue_num = 100
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger, scaler)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

        alive_sum = 0
        progr_sum = 0
        for t in trajectories:
            tmp_rewards = t['orig_rewards']
            tmp_rewards = np.sum(tmp_rewards, axis=0)
            alive_sum += tmp_rewards[0]
            progr_sum += tmp_rewards[1]

        reward_queue.append(np.mean([t['rewards'].sum() for t in trajectories]))
        reward_queue = reward_queue[-queue_num:]
        reward_std = np.std(np.array(reward_queue))
        print("Reward std by {} episode : {}".format(queue_num, reward_std))

        if alive_sum >= 5000:
            flag3 = True

        if (flag3 and alive_sum > progr_sum * threshold1) or flag1:
            flag1 = True
            alive_coef -= change_rate
            progress_coef += change_rate
            val_func.alive_coef = float(alive_coef)
            val_func.progress_coef = float(progress_coef)
            if alive_sum < progr_sum * threshold2:
                flag1 = False

        if progr_sum > alive_sum * threshold1 or flag2:
            flag2 = True
            alive_coef += change_rate
            progress_coef -= change_rate
            val_func.alive_coef = float(alive_coef)
            val_func.progress_coef = float(progress_coef)
            if progr_sum < alive_sum * threshold2:
                flag2 = False

        print(alive_sum, progr_sum)
        logger.log_model_3({
            "alive_coef": alive_coef,
            "progress_coef": progress_coef,
            "alive_sum": alive_sum,
            "progr_sum": progr_sum
        })

    logger.close()
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
    """Main training loop with a shared (master) policy and N_WORKERS local worker copies."""
    ##################
    #  shared policy #
    ##################
    tic = time.clock()

    manager = MPManager()
    manager.start()

    shared_env, shared_obs_dim, shared_act_dim = init_gym(env_name)
    shared_obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    shared_logger = Logger(logname=env_name, now=now + "-Master")
    shared_aigym_path = os.path.join('./video', env_name, now + "-Master")
    # env = wrappers.Monitor(env, aigym_path, force=True)
    shared_scaler = Scaler(shared_obs_dim)

    shared_val_func = NNValueFunction(shared_obs_dim, hid1_mult, -1, None)
    shared_policy = Policy(shared_obs_dim, shared_act_dim, kl_targ, hid1_mult, policy_logvar, -1, None)

    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON,
                                  clip_norm=GRAD_NORM_CLIP,
                                  device=device)

    # local policy declarations
    env_a = [None] * N_WORKERS
    obs_dim_a = [None] * N_WORKERS
    act_dim_a = [None] * N_WORKERS
    logger_a = [None] * N_WORKERS
    aigym_path_a = [None] * N_WORKERS
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    val_func_a = [None] * N_WORKERS
    policy_a = [None] * N_WORKERS
    scaler_a = [None] * N_WORKERS

    for i in range(N_WORKERS):
        env_a[i], obs_dim_a[i], act_dim_a[i] = init_gym(env_name)
        obs_dim_a[i] += 1  # add 1 to obs dimension for time step feature (see run_episode())
        logger_a[i] = Logger(logname=env_name, now=now + "-" + str(i))
        aigym_path_a[i] = os.path.join('./video', env_name, now + "-" + str(i))
        # env_a[i] = wrappers.Monitor(env, aigym_path, force=True)
        scaler_a[i] = Scaler(obs_dim_a[i])
        val_func_a[i] = NNValueFunction(obs_dim_a[i], hid1_mult, i, shared_val_func)
        val_func_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_val_func.get_vars(), val_func_a[i].gradients)
        policy_a[i] = Policy(obs_dim_a[i], act_dim_a[i], kl_targ, hid1_mult, policy_logvar, i, shared_policy)
        policy_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_policy.get_vars(), policy_a[i].gradients)

    # init tensorflow
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))
    init = tf.global_variables_initializer()

    # start session
    sess.run(init)

    # init shared scaler / policy
    run_policy(sess, shared_env, shared_policy, shared_scaler, shared_logger, episodes=5)

    def single_work(thread_idx):
        """Training loop for one worker thread.

        Args:
            thread_idx: index of the worker; selects its env, policy, value function,
                scaler and logger from the per-worker lists created above.
        """
        env = env_a[thread_idx]
        policy = policy_a[thread_idx]
        # obs_dim = obs_dim_a[thread_idx]
        # act_dim = act_dim_a[thread_idx]
        logger = logger_a[thread_idx]
        aigym_path = aigym_path_a[thread_idx]
        scaler = scaler_a[thread_idx]
        val_func = val_func_a[thread_idx]

        print("=== start thread " + str(policy.get_thread_idx()) + " " + policy.get_scope() + " ===")
        print(shared_policy.get_vars())
        print(policy.get_vars())

        # run a few episodes of untrained policy to initialize scaler:
        # run_policy(sess, env, policy, scaler, logger, episodes=5)
        # policy.sync(shared_policy)
        # val_func.sync(shared_val_func)

        episode = 0
        while episode < num_episodes:
            # copy global vars into the local copies
            sess.run(policy.sync)
            sess.run(val_func.sync)

            # compute new model on the local policy
            trajectories = run_policy(sess, env, policy, scaler, logger, episodes=batch_size)
            episode += len(trajectories)
            add_value(sess, trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            # add various stats to training log:
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode,
                            time.clock() - tic)
            policy.update(sess, observes, actions, advantages, logger)  # update policy
            val_func.fit(sess, observes, disc_sum_rew, logger)  # update value function

            # cur_learning_rate = self._anneal_learning_rate(global_t)
            feed_dict = {
                policy.old_log_vars_ph: policy.old_log_vars_np,
                policy.old_means_ph: policy.old_means_np,
                policy.obs_ph: observes,
                policy.act_ph: actions,
                policy.advantages_ph: advantages,
                policy.beta_ph: policy.beta,
                policy.lr_ph: policy.lr,
                policy.eta_ph: policy.eta,
                learning_rate_input: policy.lr
            }
            sess.run(policy.apply_gradients, feed_dict)
            shared_policy.update(sess, observes, actions, advantages, shared_logger)

            feed_dict = {
                val_func.obs_ph: observes,
                val_func.val_ph: disc_sum_rew,
                learning_rate_input: val_func.lr
            }
            sess.run(val_func.apply_gradients, feed_dict)
            shared_val_func.fit(sess, observes, disc_sum_rew, shared_logger)

            shared_logger.log({'_Time': time.clock() - tic})
            logger.write(display=True)  # write logger results to file and stdout

        logger.close()
        # end of single_work

    train_threads = []
    for i in range(N_WORKERS):
        train_threads.append(threading.Thread(target=single_work, args=(i,)))

    [t.start() for t in train_threads]
    [t.join() for t in train_threads]

    saver = tf.train.Saver()
    for i in range(N_WORKERS):
        logger_a[i].close()
    # path = os.path.join('log-files', env_name, now + '-Master', 'checkpoint')
    # saver.save(sess, path)

    sess.close()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, task_identity): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ print('Training started for ' + env_name + ' and task_identity ' + str(task_identity)) killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name=env_name, task_identity=task_identity) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, env_name, task_identity) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False scale, offset = scaler.get() #scale_and_offset_data = {'scale': scale, 'offset': offset} #scale_and_offset_file = 'scale_and_offset_file_' + env_name + '_' + task_identity + '.pkl' #with open(scale_and_offset_file, 'wb') as f: # pickle.dump(scale_and_offset_data, f) #### Saving expert trajectories after sufficient training has been made ## Visualization #aigym_path = os.path.join(VIDEO_LOGS_DIRECTORY, env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) trajectories = run_policy(env, policy, scaler, logger, episodes=DEMONSTRATOR_EPISODES_TO_LOG) data_to_store = { DEMONSTRATOR_TRAJECTORY_KEY: trajectories, SCALE_KEY: scale, OFFSET_KEY: offset } directory_to_store_trajectories = './../' + DEMONSTRATOR_TRAJECTORIES_DIRECTORY if not os.path.exists(directory_to_store_trajectories): os.makedirs(directory_to_store_trajectories) file_to_store_trajectories = directory_to_store_trajectories + env_name + '_' + task_identity + '.pkl' with open(file_to_store_trajectories, "wb") as f: pickle.dump(data_to_store, f) logger.close() policy.close_sess() val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, init_logvar): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension) init_logvar: natural log of initial policy variance """ print('load model (l)?') loading = input('') pybullet.connect(pybullet.DIRECT) killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) # print('obs_dim') # 45 for HumanoidFlagrunBulletEnv-v0, HumanoidFlagrunHarderBulletEnv-v0 # print(obs_dim) # print('act_dim') # 17 for HumanoidFlagrunBelletEnv-v0, HumanoidFlagrunHarderBulletEnv-v0 # print(act_dim) # input('') now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult, loading) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_logvar) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) policy_model = policy.get_trpo_policy_model() valNN_model = val_func.get_valNN_model() lr = val_func.get_lr() if loading == 'l': policy_model.load_weights('pol_weights.h5') pol_weights = policy_model.get_weights() print('pol_weights') print(pol_weights) input('') loading == 'n' save_weights_flag = 1 episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) if episode <= batch_size: if loading == 'l': traj = open('trajectories.obj', 'rb') trajectories = pickle.load(traj) traj.close() print('342') input('') elif episode == num_episodes-batch_size: traj = open('trajectories.obj','wb') pickle.dump(trajectories,traj) traj.close() print('348') input('') add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy if episode > 50: policy_model = policy.get_trpo_policy_model() print('about to save model') input('') policy_model.save('policy_model') val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() if save_weights_flag == 1: valNN_model.save('val_weights.h5') policy_weights = policy_model.get_weights() print('policy_weights') print(policy_weights) input('') # policy_model.save_weights('pol_weights.hdf5') policy_model.save_weights('pol_weights.h5')
def main(env_name, num_episodes, gamma, lamda, kl_targ, batch_size, hid1_mult, init_pol_logvar, animate,\ save_video, num_episodes_sim, task_params, task_name): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lamda: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) init_pol_logvar: natural log of initial policy variance save_video: Boolean determining if videos of the agent will be saved num_episodes_sim: Number of episodes to simulate/save videos for task_params: list of parameters to modify each environment for a different task task_name: name user assigns to the task being used to modify the environment """ # **************** Environment Initialization and Paths *************** env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) # Paths print("\n\n---- PATHS: ----") now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) # logger object aigym_path = os.path.join('./videos', env_name, task_name, now) # videos folders agent_path = os.path.join('agents', env_name, now) # agent / policy folders os.makedirs(agent_path) print("Path for Saved Videos: {}".format(aigym_path)) print("Path for Saved Agents: {}\n".format(agent_path)) # Initialize Policy, Value Networks and Scaler scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_pol_logvar) run_policy(env, policy, scaler, logger, episodes=5) # run some episodes to initialize scaler # Start Trainning animate = True if animate == "True" else False save_video = True if save_video == "True" else False saver_perc = int( num_episodes * 0.02) # determinines when the agent and video should be saved saver_offset = saver_perc killer = GracefulKiller() episode = 0 while episode < num_episodes: # Obtain 'batch_size' trajectories and add additional intermediate calculations trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size, animate=animate) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lamda) # calculate advantage # Concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # Logging Stats log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) # Update Policy and Value Networks policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout # Store Policy, Value Network and Scaler: every 20% of total episodes or in first/last episode if episode >= saver_offset or episode >= num_episodes or episode <= batch_size or killer.kill_now: # TODO: Make saving agent/video a method so that it can be called in killer.kill_now saver_offset += saver_perc policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format( agent_path, episode)) # Save Policy Network val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format( agent_path, episode)) # Save 
Value Network pickle.dump( scaler, open("{}/scaler_ep_{}.p".format(agent_path, episode), 'wb')) print("---- Saved Agent at Episode {} ----".format(episode)) # Save video of current agent/policy if save_video: print("---- Saving Video at Episode {} ----".format(episode)) _ = sim_agent( env, policy, scaler, num_episodes_sim, save_video=True, out_dir=aigym_path + "/vid_ep_{}/{}_{}".format(episode, task_name, task)) env.close() # closes window open by monitor wrapper env, _, _ = init_gym( env_name ) # Recreate env as it is killed when saving videos print("\n\n") # If Ctrl + C is Pressed, ask user if Trainning shall be terminated if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False # Terminate Sessions env.close() logger.close() policy.close_sess() val_func.close_sess()
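# The checkpointing above fires every `saver_perc` episodes (2% of the total) as well as on the
# first and last batch. A minimal sketch of just that cadence logic, detached from the TensorFlow
# saving; the episode counts are made up.
num_episodes, batch_size = 5000, 20
saver_perc = int(num_episodes * 0.02)   # save roughly every 2% of training
saver_offset = saver_perc

episode = 0
while episode < num_episodes:
    episode += batch_size               # one batch of rollouts
    if episode >= saver_offset or episode >= num_episodes or episode <= batch_size:
        saver_offset += saver_perc
        print("would save agent/video checkpoint at episode", episode)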
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name, False) if time_state: obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H-%M-%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim, env_name) val_func = NNValueFunction(obs_dim, env_name, True) arg = [obs_dim, act_dim, kl_targ, time_state, env_name] policy = Policy(obs_dim, act_dim, kl_targ, env_name, True) episode = 0 #progresses = None while episode < num_episodes: trajectories, progress = run_policy(env, policy, scaler, logger, arg, episodes=batch_size) #TODO change init setup try: progresses except: progresses = progress else: progresses = np.concatenate([progresses, progress], 1) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout scaler.save() if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False path = os.path.join('savedmodel/' + env_name) path = os.path.join(path, 'prog.dat') progresses.dump(path) logger.close() policy.close_sess() val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, print_results, risk_targ): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now_utc = datetime.utcnow() # create unique directories now = str(now_utc.day) + '-' + now_utc.strftime('%b') + '-' + str( now_utc.year) + '_' + str( ((now_utc.hour - 4) % 24)) + '.' + str(now_utc.minute) + '.' + str( now_utc.second) # adjust for Montreal Time Zone logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, risk_targ, 'CVaR', batch_size, 1) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 kl_terms = np.array([]) beta_terms = np.array([]) if print_results: rew_graph = np.array([]) mean_rew_graph = np.array([]) #big_li_rew_nodisc0 = np.array([]) while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes #predicted_values_0 = [t['values'][0] for t in trajectories] add_disc_sum_rew( trajectories, gamma, scaler.mean_rew, np.sqrt(scaler.var_rew)) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam, scaler.mean_rew, np.sqrt(scaler.var_rew)) # calculate advantage nodisc0 = -0.0001 * np.array( [t['rewards'].sum() for t in trajectories]) # scaled for gradients print(nodisc0) disc0 = [t['disc_sum_rew'][0] for t in trajectories] print('scaled sum rewards', nodisc0) observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) lamb = policy.update(observes, actions, advantages, nodisc0, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout kl_terms = np.append(kl_terms, policy.check_kl) x1 = list(range(1, (len(kl_terms) + 1))) rewards = plt.plot(x1, kl_terms) plt.title('RAPPO') plt.xlabel("Episode") plt.ylabel("KL Divergence") plt.savefig("KL_curve.png") plt.close("KL_curve.png") beta_terms = np.append(beta_terms, policy.beta) x2 = list(range(1, (len(beta_terms) + 1))) mean_rewards = plt.plot(x2, beta_terms) plt.title('RAPPO') plt.xlabel("Batch") plt.ylabel("Beta Lagrange Multiplier") plt.savefig("lagrange_beta_curve.png") plt.close("lagrange_beta_curve.png") if killer.kill_now: if input('Terminate training (y/[n])? 
') == 'y': break killer.kill_now = False if print_results: rew_graph = np.append(rew_graph, disc0) x1 = list(range(1, (len(rew_graph) + 1))) rewards = plt.plot(x1, rew_graph) plt.title('RAPPO') plt.xlabel("Episode") plt.ylabel("Discounted sum of rewards") plt.savefig("learning_curve.png") plt.close() mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0)) x2 = list(range(1, (len(mean_rew_graph) + 1))) mean_rewards = plt.plot(x2, mean_rew_graph) plt.title('RAPPO') plt.xlabel("Batch") plt.ylabel("Mean of Last Batch") plt.savefig("learning_curve2.png") plt.close() if print_results: tr = run_policy(env, policy, scaler, logger, episodes=1000) sum_rewww = [t['rewards'].sum() for t in tr] hist_dat = np.array(sum_rewww) fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2) plt.title('RAPPO') plt.xlabel("Sum of Rewards") plt.ylabel("Frequency") plt.savefig("RA_ppo.png") plt.close() with open('sum_rew_final_policy.pkl', 'wb') as f: pickle.dump(sum_rewww, f) logger.final_log() logger.close() policy.close_sess() val_func.close_sess()
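# The risk-aware variant above targets CVaR (risk_targ, 'CVaR'). For reference, a tiny NumPy
# sketch of the empirical CVaR of a batch of episode returns (the mean of the worst alpha
# fraction); this only illustrates the quantity itself, not the constrained policy update
# performed by the Policy class above.
import numpy as np

def empirical_cvar(returns, alpha=0.1):
    """Mean of the worst `alpha` fraction of returns (lower tail)."""
    returns = np.sort(np.asarray(returns, dtype=np.float64))
    k = max(1, int(np.ceil(alpha * len(returns))))
    return returns[:k].mean()

batch_returns = np.random.normal(loc=100.0, scale=30.0, size=1000)
print("mean return :", batch_returns.mean())
print("CVaR (10%)  :", empirical_cvar(batch_returns, alpha=0.1))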
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, **kwargs): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ memory = deque([]) memory_size = kwargs['memory_size'] killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) target_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # kl_targ = 0? explore_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # run a few episodes of untrained policy to initialize scaler: run_policy(env, target_policy, scaler, logger, episodes=5, fix_drct_dist=0) run_policy(env, explore_policy, scaler, logger, episodes=5, fix_drct_dist=0) episode = 0 fix_drct_dist_range = (0.3, 0) while episode < num_episodes: # save model if episode % 200 == 0: save_path = target_policy.saver.save( target_policy.sess, "/home/csc63182/testspace/models/halfcheetah-trpo/model-%d.ckpt" % (episode)) # run a few episodes fix_drct_dist = ( (episode * fix_drct_dist_range[1]) + (num_episodes - episode) * fix_drct_dist_range[0]) / num_episodes target_trajectories = run_policy(env, target_policy, scaler, logger, episodes=batch_size, fix_drct_dist=0) explore_trajectories = run_policy(env, explore_policy, scaler, logger, episodes=batch_size, fix_drct_dist=fix_drct_dist) # Add to memory n_explore = max(0, int(batch_size * (1 - episode / num_episodes)) - 1) trajectories = target_trajectories + explore_trajectories[:n_explore] episode += batch_size memory += trajectories while len(memory) > memory_size: memory.popleft() # train explore network add_value(explore_trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(explore_trajectories, gamma) # calculated discounted sum of Rs add_gae(explore_trajectories, gamma, lam) # calculate advantage observes, actions, advantages, disc_sum_rew = build_train_set( explore_trajectories) explore_policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function # train target network # re-sample trajectories trajectories = sample(memory, batch_size) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) target_policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if 
killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False with open('rewards_%s.txt' % kwargs['log_postfix'], 'w') as f: for reward in rewards_record: f.write('%f\n' % reward) plt.plot((np.arange(len(rewards_record)) + 1) * batch_size, rewards_record) plt.savefig('learning_curve_%s.png' % kwargs['log_postfix']) logger.close() explore_policy.close_sess() target_policy.close_sess() val_func.close_sess()
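# The explore/target scheme above keeps a bounded memory of trajectories and re-samples training
# batches from it. A minimal sketch of that buffer, assuming trajectories are plain dicts; sizes
# are arbitrary.
from collections import deque
from random import sample

memory_size = 200
memory = deque()

for batch in range(30):
    trajectories = [{'id': (batch, i)} for i in range(20)]   # stand-in rollouts
    memory += trajectories
    while len(memory) > memory_size:
        memory.popleft()                                      # drop the oldest rollouts first

train_batch = sample(list(memory), 20)                        # re-sample a batch for the update
print(len(memory), len(train_batch))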
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
    """ Main training loop

    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)
    episode = 0
    # Initialize reward list (to keep track of improvements)
    avg_rew_list = []
    while episode < num_episodes:
        print(episode)
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function
        avg_rew_list.append(avg_rewards(trajectories))
        # Save models (value_func, policy, scaler) and average rewards every 20000 episodes
        if not episode % 20000:
            print("Saving models")
            policy.save(episode)
            val_func.save(episode)
            f = open("models/scaler-" + str(episode) + ".pkl", 'wb')
            pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
            f.close()
            f2 = open("models/rewards-" + str(episode) + ".pkl", 'wb')
            pickle.dump(deepcopy(avg_rew_list), f2, pickle.HIGHEST_PROTOCOL)
            f2.close()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # Show animation at the end of training
    while True:
        obs = env.reset()
        step = 0.0
        scale, offset = scaler.get()
        scale[-1] = 1.0
        offset[-1] = 0.0
        done = False
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)
            obs = (obs - offset) * scale
            action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            env.render1()
            env.render2()
            step += 1e-3
    policy.close_sess()
    val_func.close_sess()
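# In the animation loop above, the appended time-step feature is deliberately excluded from
# normalisation by forcing its scale to 1 and offset to 0. A short sketch of that convention with
# made-up scaler statistics.
import numpy as np

obs_dim = 4                                   # includes the extra time-step slot
scale = np.full(obs_dim, 0.5)                 # stand-in for Scaler.get() output
offset = np.full(obs_dim, 2.0)
scale[-1], offset[-1] = 1.0, 0.0              # leave the time feature untouched

step = 0.0
raw_obs = np.array([1.0, 2.0, 3.0])
obs = np.append(raw_obs.reshape(1, -1), [[step]], axis=1)   # add time step feature
obs_scaled = (obs - offset) * scale
print(obs_scaled)                             # last column equals `step`, unscaled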
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, clipping_range): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, clipping_range) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False if episode % 100 == 0: policy.save_sess() logger.close() policy.close_sess() val_func.close_sess()
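# This variant passes a `clipping_range` to the Policy, i.e. a PPO-style clipped surrogate
# objective. A NumPy sketch of that loss on toy data; the real update presumably runs on
# TensorFlow tensors inside Policy.update, so this is only an illustration of the formula.
import numpy as np

def ppo_clip_loss(logp_new, logp_old, advantages, clip_eps=0.2):
    ratio = np.exp(logp_new - logp_old)                     # pi_new / pi_old
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    return -np.mean(np.minimum(ratio * advantages, clipped * advantages))

logp_old = np.random.randn(64)
logp_new = logp_old + 0.05 * np.random.randn(64)
advantages = np.random.randn(64)
print(ppo_clip_loss(logp_new, logp_old, advantages))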
def main(env_name, num_iterations, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, coef, use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_iterations: maximum number of iterations to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance coef: coefficient of Stein control variate use_lr_adjust: whether adjust lr based on kl ada_kl_penalty: whether adjust kl penalty max_timesteps: maximum time steps per trajectory reg_scale: regularization coefficient policy_size: policy network size phi_obj: FitQ or MinVar """ env, obs_dim, act_dim = init_gym(env_name) set_global_seeds(seed) env.seed(seed) env._max_episode_steps = max_timesteps obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H:%M:%S") aigym_path = os.path.join('log-files/', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, epochs, phi_epochs, policy_size=policy_size, phi_hidden_sizes=phi_hs, c_ph=coef, reg_scale=reg_scale, lr_phi=phi_lr, phi_obj=phi_obj) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, batch_size=1000, max_timesteps=max_timesteps) for _ in range(num_iterations): logger.log("\n#Training Iter %d" % (_)) logger.log("Draw Samples..") trajectories = run_policy(env, policy, scaler, batch_size=batch_size, max_timesteps=max_timesteps) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew) logger.log("Starting Training...") policy.update(observes, actions, advantages, \ use_lr_adjust, ada_kl_penalty) # update policy val_func.fit(observes, disc_sum_rew) # update value function logger.log('--------------------------------\n') policy.close_sess() val_func.close_sess()
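# The Stein control-variate variant above (coef, phi_obj = FitQ / MinVar) builds on the generic
# control-variate idea: subtract a correlated, known-mean term from a Monte Carlo estimate to
# reduce its variance without changing its expectation. A toy NumPy illustration of that idea
# only, not of the Stein construction itself.
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=100_000)
f = np.exp(x)                       # quantity whose mean we estimate
g = x                               # control variate with known mean E[g] = 0

c = np.cov(f, g)[0, 1] / np.var(g)  # variance-optimal coefficient
controlled = f - c * (g - 0.0)      # same expectation, lower variance

print("plain estimate     :", f.mean(), " var:", f.var())
print("controlled estimate:", controlled.mean(), " var:", controlled.var())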
class Experiment: def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show): self.env_name = env_name self.env = gym.make(env_name) if env_name == "FetchReach-v0": self.env = gym.wrappers.FlattenDictWrapper( self.env, ['observation', 'desired_goal', 'achieved_goal']) gym.spaces.seed(1234) self.obs_dim = self.env.observation_space.shape[ 0] + 1 # adding time step as feature self.act_dim = self.env.action_space.shape[0] self.discount = discount self.num_iterations = num_iterations self.lamb = lamb self.animate = animate self.buffer = Buffer(50000, self.obs_dim, self.act_dim) self.episodes = 20 self.killer = GracefulKiller() self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20) self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH) # using MC return would be more helpful self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10) if not show: # save copies of file shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH) shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH) shutil.copy(inspect.getfile(self.__class__), OUTPATH) self.log_file = open(OUTPATH + 'log.csv', 'w') self.write_header = True print('observation dimension:', self.obs_dim) print('action dimension:', self.act_dim) # Use of a scaler is crucial self.scaler = Scaler(self.obs_dim) self.init_scaler() def init_scaler(self): """ 5 episodes empirically determined. :return: """ print('Fitting scaler') observation_samples = [] for i in range(5): observation = [] obs = self.env.reset() observation.append(obs) obs = obs.astype(np.float64).reshape((1, -1)) done = False step = 0 while not done: obs = np.append(obs, [[step]], axis=1) # add time step feature action = self.policy.get_sample(obs).reshape( (1, -1)).astype(np.float64) if self.env_name == "FetchReach-v0": obs_new, reward, done, _ = self.env.step( action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) observation.append(obs_new) obs = obs_new.astype(np.float64).reshape((1, -1)) step += 1e-3 observation_samples.append(observation) observation_samples = np.concatenate(observation_samples, axis=0) self.scaler.update(observation_samples) def normalize_obs(self, obs): """ transform and update on the fly. 
:param obs: :return: """ scale, offset = self.scaler.get() obs_scaled = (obs - offset) * scale self.scaler.update(obs.astype(np.float64).reshape((1, -1))) return obs_scaled def run_one_episode(self): """ collect data only :param save: :param train_policy: :param train_value_func: :param animate: :return: """ obs = self.env.reset() observes, actions, rewards = [], [], [] done = False step = 0 while not done: if self.animate: self.env.render() obs = obs.astype(np.float64).reshape((1, -1)) obs = self.normalize_obs(obs) obs = np.append(obs, [[step]], axis=1) # add time step feature observes.append(obs) action = self.policy.get_sample(obs).reshape( (1, -1)).astype(np.float64) actions.append(action) if self.env_name == "FetchReach-v0": obs_new, reward, done, _ = self.env.step(action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) if not isinstance(reward, float): reward = np.asscalar(reward) rewards.append(reward) obs = obs_new step += 0.003 return np.concatenate(observes), np.concatenate(actions), np.array( rewards) def discounted_sum(self, l, factor): discounted = [] sum = 0 for i in reversed(l): discounted.append(factor * sum + i) sum = factor * sum + i return np.array(list(reversed(discounted))) def run_policy(self, episodes): """ gather a batch of samples. :param episodes: :return: """ trajectories = [] for e in range(episodes): observes, actions, rewards = self.run_one_episode() trajectory = { 'observes': observes, 'actions': actions, 'rewards': rewards, 'scaled_rewards': rewards * (1 - self.discount) } trajectories.append(trajectory) return trajectories def run_expr(self): ep_steps = [] ep_rewards = [] ep_entropy = [] i = 0 while i < self.num_iterations: trajectories = self.run_policy(20) # add to experience replay buffer self.buffer.append(trajectories) print('buffer size:', self.buffer.size()) i += len(trajectories) # for E=20, T=50, the total number of samples would be 1000 # In future needs to account for not uniform time steps per episode. # e.g. in Hopper-v2 environment not every episode has same time steps # E = len(trajectories) # num_samples = np.sum([len(t['rewards']) for t in trajectories]) gradient_steps = np.sum([len(t['rewards']) for t in trajectories]) if self.env_name == "FetchReach-v0": assert (gradient_steps == 20 * 50) """train critic""" # train all samples in the buffer, to the extreme # self.critic.fit(self.policy, self.buffer, epochs=10, num_samples=self.buffer.size()) # train some samples minibatches only self.critic.another_fit_func(self.policy, self.buffer, 5000) """calculation of episodic discounted return only needs rewards""" mc_returns = np.concatenate([ self.discounted_sum(t['scaled_rewards'], self.discount) for t in trajectories ]) """using current batch of samples to update baseline""" observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) value_func_loss = self.value_func.update(observes, mc_returns) """compute GAE""" for t in trajectories: t['values'] = self.value_func.predict(t['observes']) # IS it really legitimate to insert 0 at the last obs? 
t['td_residual'] = t[ 'scaled_rewards'] + self.discount * np.append( t['values'][1:], 0) - t['values'] t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb) advantages = np.concatenate([t['gae'] for t in trajectories]) advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) """compute control variate""" "" cv = self.critic.get_contorl_variate(self.policy, observes, actions) """conservative control variate""" eta = [1 if i > 0 else 0 for i in advantages * cv] """center learning signal""" # check that advantages and CV should be of size E*T # eta controls the on-off of control variate learning_signal = advantages - eta * cv # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6) """controlled taylor eval term""" ctrl_taylor = np.concatenate( [[eta[i] * act] for i, act in enumerate( self.critic.get_taylor_eval(self.policy, observes))]) policy_loss, kl, entropy, beta = self.policy.update( observes, actions, learning_signal, ctrl_taylor) # normalize advantage estimates # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) avg_rewards = np.sum( np.concatenate([t['rewards'] for t in trajectories])) / self.episodes avg_timesteps = np.average( [len(t['rewards']) for t in trajectories]) log = {} # compute statistics such as mean and std log['steps'] = avg_timesteps log['rewards'] = avg_rewards log['policy_loss'] = policy_loss log['kl'] = kl log['entropy'] = entropy log['value_func_loss'] = value_func_loss log['beta'] = beta # display print('episode: ', i) print('average steps: {0}, average rewards: {1}'.format( log['steps'], log['rewards'])) for key in [ 'policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss' ]: print('{:s}: {:.2g}'.format(key, log[key])) print('\n') ep_steps.append(log['steps']) ep_rewards.append(log['rewards']) ep_entropy.append(log['entropy']) # write to log.csv if self.write_header: fieldnames = [x for x in log.keys()] self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames) self.writer.writeheader() self.write_header = False self.writer.writerow(log) # we want the csv file to preserve information even if the program terminates earlier than scheduled. self.log_file.flush() # save model weights if stopped manually if self.killer.kill_now: if input('Terminate training (y/[n])? 
') == 'y': break self.killer.kill_now = False # if (i+1)%20 == 0: # print('episode: ', i+1) # print('average steps', np.average(steps)) # print('average rewards', np.average(rewards)) self.policy.save(OUTPATH) self.value_func.save(OUTPATH) self.scaler.save(OUTPATH) plt.figure(figsize=(12, 9)) if self.env_name.startswith('Fetch'): ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('policy entropy') plt.plot(ep_entropy) scale_x = self.episodes ticks_x = ticker.FuncFormatter( lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) else: ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('steps') plt.plot(ep_steps) scale_x = self.episodes ticks_x = ticker.FuncFormatter( lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) ax2 = plt.subplot(122) plt.xlabel('episodes') plt.ylabel('episodic rewards') plt.plot(ep_rewards) scale_x = self.episodes ticks_x = ticker.FuncFormatter( lambda x, pos: '{0:g}'.format(x * scale_x)) ax2.xaxis.set_major_formatter(ticks_x) plt.savefig(OUTPATH + 'train.png') def load_model(self, load_from): from tensorflow.python.tools import inspect_checkpoint as chkp # # print all tensors in checkpoint file # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True) self.policy.load(load_from + 'policy/policy.pl') self.value_func.load(load_from + 'value_func/value_func.pl') def demonstrate_agent(self, load_from): self.load_model(load_from) with open(load_from + "scaler.pkl", 'rb') as file: self.scaler = pickle.load(file) self.animate = True for i in range(10): observes, actons, rewards = self.run_one_episode() ep_rewards = np.sum(rewards) ep_steps = len(rewards) print("Total steps: {0}, total rewards: {1}\n".format( ep_steps, ep_rewards))
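# The Experiment class above builds GAE by forming TD residuals and then smoothing them with
# discounted_sum(..., gamma * lambda). A compact NumPy sketch of that two-step computation on a
# toy trajectory; the value estimates are made up, and 0 is appended as the bootstrap value of
# the terminal state, as in the code above.
import numpy as np

def discounted_sum(x, factor):
    out, acc = [], 0.0
    for v in reversed(x):
        acc = factor * acc + v
        out.append(acc)
    return np.array(out[::-1])

gamma, lam = 0.995, 0.98
rewards = np.array([1.0, 1.0, 1.0, 0.0])
values = np.array([3.0, 2.5, 1.8, 0.9])          # toy critic predictions

td_residual = rewards + gamma * np.append(values[1:], 0) - values
gae = discounted_sum(td_residual, gamma * lam)
print(td_residual)
print(gae)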
class Agent: #Warning! policy.py and critic.py are still work in progress and contain many global variables that should be converted to #class member variables. Before that is done, all instances of Agent must use the same values for the following: #PPOepsilon,nHidden,nUnitsPerLayer,activation,H,entropyLossWeight,sdLowLimit def __init__(self, stateDim: int, actionDim: int, actionMin: np.array, actionMax: np.array, learningRate=0.0005, gamma=0.99, GAElambda=0.95, PPOepsilon=0.2, PPOentropyLossWeight=0, nHidden: int = 2, nUnitsPerLayer: int = 128, mode="PPO-CMA-m", activation="lrelu", H: int = 9, entropyLossWeight: float = 0, sdLowLimit=0.01, useScaler: bool = True, criticTimestepScale=0.001): #Create policy network print("Creating policy") self.actionMin = actionMin.copy() self.actionMax = actionMax.copy() self.actionDim = actionDim self.stateDim = stateDim self.useScaler = useScaler if useScaler: self.scaler = Scaler(stateDim) self.scalerInitialized = False self.normalizeAdvantages = True self.gamma = gamma self.GAElambda = GAElambda self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale #with gamma==0, no need for this piEpsilon = None nHistory = 1 negativeAdvantageAvoidanceSigma = 0 if mode == "PPO-CMA" or mode == "PPO-CMA-m": usePPOLoss = False #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i)) separateVarAdapt = True self.reluAdvantages = True if mode == "PPO-CMA" else False nHistory = H #policy mean adapts immediately, policy covariance as an aggreagate of this many past iterations useSigmaSoftClip = True negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0 elif mode == "PPO": usePPOLoss = True #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i)) separateVarAdapt = False # separateSigmaAdapt=False self.reluAdvantages = False useSigmaSoftClip = True piEpsilon = 0 else: raise ("Unknown mode {}".format(mode)) self.policy = Policy( stateDim, actionDim, actionMin, actionMax, entropyLossWeight=PPOentropyLossWeight, networkActivation=activation, networkDepth=nHidden, networkUnits=nUnitsPerLayer, networkSkips=False, learningRate=learningRate, minSigma=sdLowLimit, PPOepsilon=PPOepsilon, usePPOLoss=usePPOLoss, separateVarAdapt=separateVarAdapt, nHistory=nHistory, useSigmaSoftClip=useSigmaSoftClip, piEpsilon=piEpsilon, negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma) #Create critic network, +1 stateDim because at least in OpenAI gym, episodes are time-limited and the value estimates thus depend on simulation time. #Thus, we use time step as an additional feature for the critic. 
#Note that this does not mess up generalization, as the feature is not used for the policy during training or at runtime print("Creating critic network") self.critic = Critic(stateDim=stateDim + 1, learningRate=learningRate, nHidden=nHidden, networkUnits=nUnitsPerLayer, networkActivation=activation, useSkips=False, lossType="L1") #Experience trajectory buffers for the memorize() and updateWithMemorized() methods self.experienceTrajectories = [] self.currentTrajectory = [] #call this after tensorflow's global variables initializer def init(self, sess: tf.Session, verbose=False): #Pretrain the policy to output the initial Gaussian for all states self.policy.init( sess, 0, 1, 0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim), 0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim), 256, 2000, verbose) #stateObs is an n-by-m tensor, where n = number of observations, m = number of observation variables def act(self, sess: tf.Session, stateObs: np.array, deterministic=False, clipActionToLimits=True): #Expand a single 1d-observation into a batch of 1 vectors if len(stateObs.shape) == 1: stateObs = np.reshape(stateObs, [1, stateObs.shape[0]]) #Query the policy for the action, except for the first iteration where we sample directly from the initial exploration Gaussian #that covers the whole action space. #This is done because we don't know the scale of state observations a priori; thus, we can only init the state scaler in update(), #after we have collected some experience. if self.useScaler and (not self.scalerInitialized): actions = np.random.normal( 0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim), 0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim), size=[stateObs.shape[0], self.actionDim]) if clipActionToLimits: actions = np.clip( actions, np.reshape(self.actionMin, [1, self.actionDim]), np.reshape(self.actionMax, [1, self.actionDim])) return actions else: if self.useScaler: scaledObs = self.scaler.process(stateObs) else: scaledObs = stateObs if deterministic: actions = self.policy.getExpectation(sess, scaledObs) else: actions = self.policy.sample(sess, scaledObs) if clipActionToLimits: actions = np.clip(actions, self.actionMin, self.actionMax) return actions def memorize(self, observation: np.array, action: np.array, reward: float, nextObservation: np.array, done: bool): e = Experience(observation, action, reward, nextObservation, done) self.currentTrajectory.append(e) if done: self.experienceTrajectories.append(self.currentTrajectory) self.currentTrajectory = [] def getAverageActionStdev(self): if self.useScaler and (not self.scalerInitialized): return np.mean(0.5 * (self.actionMax - self.actionMin)) else: return self.policy.usedSigmaSum / (1e-20 + self.policy.usedSigmaSumCounter) #If you call memorize() after each action, you can update the agent with this method. #If you handle the experience buffers yourself, e.g., due to a multithreaded implementation, use the update() method instead. 
def updateWithMemorized(self, sess: tf.Session, batchSize: int = 512, nBatches: int = 100, verbose=True, valuesValid=False, timestepsValid=False): self.update(sess, experienceTrajectories=self.experienceTrajectories, batchSize=batchSize, nBatches=nBatches, verbose=verbose, valuesValid=valuesValid, timestepsValid=timestepsValid) averageEpisodeReturn = 0 for t in self.experienceTrajectories: episodeReturn = 0 for e in t: episodeReturn += e.r averageEpisodeReturn += episodeReturn averageEpisodeReturn /= len(self.experienceTrajectories) self.experienceTrajectories = [] self.currentTrajectory = [] return averageEpisodeReturn #experienceTrajectories is a list of lists of Experience instances such that each of the contained lists corresponds to an episode simulation trajectory def update(self, sess: tf.Session, experienceTrajectories, batchSize: int = 512, nBatches: int = 100, verbose=True, valuesValid=False, timestepsValid=False): trajectories = experienceTrajectories #shorthand #Collect all data into linear arrays for training. nTrajectories = len(trajectories) nData = 0 for trajectory in trajectories: nData += len(trajectory) #propagate values backwards along trajectory if not already done if not valuesValid: for i in reversed(range(len(trajectory) - 1)): #value estimates, used for training the critic and estimating advantages trajectory[i].V = trajectory[ i].r + self.gamma * trajectory[i + 1].V #update time steps if not updated if not timestepsValid: for i in range(len(trajectory)): trajectory[i].timeStep = i allStates = np.zeros([nData, self.stateDim]) allActions = np.zeros([nData, self.actionDim]) allValues = np.zeros([nData]) allTimes = np.zeros([nData, 1]) k = 0 for trajectory in trajectories: for e in trajectory: allStates[k, :] = e.s allValues[k] = e.V allActions[k, :] = e.a allTimes[k, 0] = e.timeStep * self.criticTimestepScale k += 1 #Update scalers if self.useScaler: self.scaler.update(allStates) scale, offset = self.scaler.get() self.scalerInitialized = True else: offset = 0 scale = 1 #Scale the observations for training the critic scaledStates = self.scaler.process(allStates) #Train critic def augmentCriticObs(obs: np.array, timeSteps: np.array): return np.concatenate([obs, timeSteps], axis=1) self.critic.train(sess, augmentCriticObs(scaledStates, allTimes), allValues, batchSize, nEpochs=0, nBatches=nBatches, verbose=verbose) #Policy training needs advantages, which depend on the critic we just trained. #We use Generalized Advantage Estimation by Schulman et al. if verbose: print("Estimating advantages...".format(len(trajectories))) for t in trajectories: #query the critic values of all states of this trajectory in one big batch nSteps = len(t) states = np.zeros([nSteps + 1, self.stateDim]) timeSteps = np.zeros([nSteps + 1, 1]) for i in range(nSteps): states[i, :] = t[i].s timeSteps[i, 0] = t[i].timeStep * self.criticTimestepScale states[nSteps, :] = t[nSteps - 1].s_next states = (states - offset) * scale values = self.critic.predict(sess, augmentCriticObs(states, timeSteps)) #GAE loop, i.e., take the instantaneous advantage (how much value a single action brings, assuming that the #values given by the critic are unbiased), and smooth those along the trajectory using 1st-order IIR filter. 
            for step in reversed(range(nSteps - 1)):
                delta_t = t[step].r + self.gamma * values[step + 1] - values[step]
                t[step].advantage = delta_t + self.GAElambda * self.gamma * t[step + 1].advantage

        #Gather the advantages to linear array and apply ReLU and normalization if needed
        allAdvantages = np.zeros([nData])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allAdvantages[k] = e.advantage
                k += 1
        if self.reluAdvantages:
            allAdvantages = np.clip(allAdvantages, 0, np.inf)
        if self.normalizeAdvantages:
            aMean = np.mean(allAdvantages)
            aSd = np.std(allAdvantages)
            if verbose:
                print("Advantage mean {}, sd{}".format(aMean, aSd))
            allAdvantages /= 1e-10 + aSd

        #Train policy. Note that this uses original unscaled states, because the PPO-CMA variance training
        #needs a history of states in the same scale
        self.policy.train(sess, allStates, allActions, allAdvantages, batchSize,
                          nEpochs=0, nBatches=nBatches,
                          stateOffset=offset, stateScale=scale, verbose=verbose)
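# In PPO-CMA mode the code above clips advantages at zero (reluAdvantages) and rescales them by
# their standard deviation before policy training. A small sketch of that post-processing on toy
# advantages.
import numpy as np

advantages = np.random.randn(256)

relu_advantages = True
normalize_advantages = True

if relu_advantages:
    advantages = np.clip(advantages, 0, np.inf)            # keep only positive advantages
if normalize_advantages:
    advantages = advantages / (1e-10 + np.std(advantages)) # scale by std only

print(advantages.min(), advantages.std())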
def main(env_name, num_episodes, gamma, lamda, kl_targ, clipping_range, pol_loss_type, batch_size, init_pol_logvar, animate,\ save_video, save_rate, num_episodes_sim, task_params, task_name, dims_core_hid, dims_head_hid, act_func_name,\ episode_to_load, now_to_load): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lamda: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) clipping_range: max value to clip the policy gradient ratio pol_loss_type: string determining which type of loss to use for the Policy Network batch_size: number of episodes per policy training batch init_pol_logvar: natural log of initial policy variance save_video: Boolean determining if videos of the agent will be saved save_rate: Int determining how often to save videos for num_episodes_sim: Number of episodes to simulate/save videos for task_params: list of parameters to modify each environment for a different task task_name: name user assigns to the task being used to modify the environment """ # **************** Environment Initialization and Paths *************** task_params_str = ''.join(str(e) +', ' for e in task_params) num_tasks = len(task_params) envs = [None]*num_tasks scalers = [None]*num_tasks loggers = [None]*num_tasks print ("\n\n------ PATHS: ------") start_time = datetime.now() if episode_to_load == None: now = start_time.strftime("%b-%d_%H:%M:%S") # If NOT loading from Checkpoint -> used to create unique directories else: assert now_to_load != None,\ "\n\nWARNING: Date time to load ({}) was not provided. Please provide a valid date time of an experiment".format(now_to_load) now = now_to_load logs_path = os.path.join('log-files', env_name, task_name, task_params_str, now) for task in range(num_tasks): # Create task specific environment envs[task], obs_dim, act_dim = init_gym(env_name, task_param = task_params[task]) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) # Create task specific Paths and logger object loggers[task] = Logger(logname= [env_name, task_name, task_params_str], now=now, \ logname_file= "_{}_{}".format(task_name, task_params[task])) if episode_to_load == None: # If NOT loading from Checkpoint scalers[task] = Scaler(obs_dim) # Auxiliary saver (becase logger sometimes fails or takes to much time) with open(logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'w') as f: f.write("_Episode" + " " + "_MeanReward") aigym_path= os.path.join('./videos', env_name, task_name, task_params_str, now) # videos folders agent_path = os.path.join('agents', env_name , task_name, task_params_str, now) # agent / policy folders if episode_to_load == None: # If NOT loading from Checkpoint os.makedirs(agent_path) with open(agent_path + '/commandline_args.txt', 'w') as f: f.write(' '.join(sys.argv[1:])) # save commandline command with open(logs_path + '/commandline_args.txt', 'w') as f: f.write(' '.join(sys.argv[1:])) # save commandline command print("\nPath for Saved Videos : {}".format(aigym_path)) print("Path for Saved Agents: {}\n".format(agent_path)) # **************** Initialize Policy, Value Networks and Scaler *************** print ("\n\n------ NEURAL NETWORKS: ------") dims_core_hid.insert(0, obs_dim) # Modify dims list to have the size of the layer 'n-1' at position '0' dims_head_hid.insert(0, dims_head_hid[-1]) val_func = NNValueFunction(obs_dim, dims_core_hid, dims_head_hid, 
num_tasks)#, act_func_name) policy = Policy(obs_dim, act_dim, dims_core_hid, dims_head_hid, num_tasks, pol_loss_type = pol_loss_type) # Load from Checkpoint: # Validate intented episode to load OR get last episode number if no target load episode was provided if episode_to_load != None: load_agent_path = agent_path # agent / policy folders saved_ep_list = [file.split(".")[0].split("_")[-1] for file in os.listdir(load_agent_path) if "policy" in file] if episode_to_load == -1: # Get last saved episode episode_to_load = sorted([int(ep_string) for ep_string in saved_ep_list])[-1] else: # Validate if episode_to_load was indeed saved assert str(episode_to_load) in saved_ep_list,\ "\n\nWARNING: Episode you want to load ({}) was not stored during trainning".format(episode_to_load) # Load Policy Network's Ops and Variables & Load Scaler Object policy.tf_saver.restore(policy.sess, "{}/policy_ep_{}".format(load_agent_path, episode_to_load)) val_func.tf_saver.restore(val_func.sess, "{}/val_func_ep_{}".format(load_agent_path, episode_to_load)) scalers = pickle.load(open("{}/scalers_ep_{}.p".format(load_agent_path, episode_to_load), 'rb')) print("\n\n ---- CHECKPOINT LOAD: Episoded Loaded **{}**".format(episode_to_load)) # Delete extra epochs that where logged to the auxiliary logs for task in range(num_tasks): aux_log_path = logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]) aux_log = pd.read_table(aux_log_path, delim_whitespace=True) idx_to_cut = aux_log.index[aux_log["_Episode"] == episode_to_load ].tolist()[0] aux_log[0:idx_to_cut+1].to_csv(aux_log_path, header=True, index=False, sep=' ', mode='w') # overwrite trimmed aux_log # If NOT loading from Checkpoint: run some episodes to initialize scalers and create Tensor board dirs elif episode_to_load == None: for task in range(num_tasks): run_policy(envs[task], policy, scalers[task], loggers[task], episodes=5, task=task) # Tensor Board writer os.makedirs(agent_path + '/tensor_board/policy') os.makedirs(agent_path + '/tensor_board/valFunc') tb_pol_writer = tf.summary.FileWriter(agent_path + '/tensor_board/policy', graph=policy.g) tb_val_writer = tf.summary.FileWriter(agent_path + '/tensor_board/valFunc', graph=val_func.g) # **************** Start Training *************** print ("\n\n------ TRAINNING: ------") animate = True if animate == "True" else False save_video = True if save_video == "True" else False saver_offset = save_rate killer = GracefulKiller() if episode_to_load == None: episode = 0 else: episode = episode_to_load # Episode is counted across all tasks i.e. 
N episodes indicates each tasks has been runned for N times while episode < num_episodes and not killer.kill_now: # **************** Obtain data (train set) *************** observes_all = [None]*num_tasks actions_all = [None]*num_tasks advantages_all = [None]*num_tasks disc_sum_rew_all = [None]*num_tasks episode += batch_size for task in range(num_tasks): # Obtain 'batch_size' trajectories and add additional intermediate calculations trajectories = run_policy(envs[task],policy, scalers[task], loggers[task],episodes=batch_size,task=task,animate=animate) add_value(trajectories, val_func, task) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lamda) # calculate advantage # Concatenate all episodes into single NumPy arrays observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task] = build_train_set(trajectories) # Logging Stats log_batch_stats(observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task], \ loggers[task], episode) # **************** Update Policy and Value Networks *************** print ("*************************************") for task in range(num_tasks): pol_summary = policy.update(task, observes_all[task], actions_all[task], advantages_all[task], loggers[task]) # update policy val_summary = val_func.fit(task, observes_all[task], disc_sum_rew_all[task], loggers[task]) # update value function # Auxiliary saver (because logger sometimes fails or takes to much time) with open(logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'a') as f: f.write("\n" + str(loggers[task].log_entry['_Episode']) + " " + str(loggers[task].log_entry['_MeanReward'])) loggers[task].write(display=True) # write logger results to file and stdout tb_pol_writer.add_summary(pol_summary, global_step=episode) tb_val_writer.add_summary(val_summary, global_step=episode) # **************** Storing NN and Videos *************** # Store Policy, Value Network and Scaler: every 'save_rate' of total episodes or in first/last episode if episode >= saver_offset or episode >=num_episodes or episode <=batch_size or killer.kill_now: # TODO: Make saving agent/video a method so that it can be called in killer.kill_now saver_offset += save_rate policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(agent_path, episode)) # Save Policy Network val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(agent_path, episode)) # Save Value Network pickle.dump(scalers, open("{}/scalers_ep_{}.p".format(agent_path, episode), 'wb')) print ("---- Saved Agent at Episode {} ----".format(episode)) # Save video of current agent/policy if save_video: print ("---- Saving Video at Episode {} ----".format(episode)) for task in range(num_tasks): print("Environment Wind: {}".format(envs[task].env.world.gravity)) _ = sim_agent(envs[task], policy, task, scalers[task], num_episodes_sim, save_video=True, out_dir=aigym_path + "/vid_ep_{}/{}_{}".format(episode, task_name, task_params[task])) envs[task].close() # closes window open by monitor wrapper envs[task], _, _ = init_gym(env_name,task_param=task_params[task]) # Recreate env as it was killed print("\n\n") # If Ctrl + C is Pressed, ask user if Trainning shall be terminated if killer.kill_now: if input('Terminate training (y/[n])? 
') == 'y':
                break
            killer.kill_now = False

    # **************** Terminate Variables ****************
    for task in range(num_tasks):
        envs[task].close()
        loggers[task].close()
    policy.close_sess()
    val_func.close_sess()

    # Save elapsed time
    end_time = datetime.now()
    elapsed_time = end_time - start_time
    delta_time = divmod(elapsed_time.days * 86400 + elapsed_time.seconds, 60)
    delta_str = "Elapsed Time: {} min {} seconds".format(delta_time[0], delta_time[1])

    # save elapsed time, 'a' to append not overwrite
    with open(agent_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
    with open(logs_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
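# Resuming above relies on recovering the last saved episode number from the checkpoint file
# names ("policy_ep_<N>..."). A sketch of that lookup under the same naming scheme; the directory
# and the fake checkpoint files created here are only for illustration.
import os

agent_path = 'agents/demo_run'                 # hypothetical checkpoint directory
os.makedirs(agent_path, exist_ok=True)
for ep in (20, 40, 60):                        # fake a few saved checkpoints
    open(os.path.join(agent_path, 'policy_ep_{}.index'.format(ep)), 'w').close()

saved_eps = [int(f.split('.')[0].split('_')[-1])
             for f in os.listdir(agent_path) if 'policy' in f]
episode_to_load = sorted(saved_eps)[-1]        # -1 convention: load the latest episode
print('would restore from episode', episode_to_load)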
class Recognizer(object): def __init__(self): self.resize_shape = (100, 100) shape_predictor_path = 'data/shape_predictor_68_face_landmarks.dat' self.shape_predictor = dlib.shape_predictor(shape_predictor_path) self.eye_and_mouth_indices = [39, 42, 57] self.template_landmarks = get_template_landmarks( self.eye_and_mouth_indices, self.resize_shape[0]) npload = np.load('data/mean_std2.npz') mean, std = npload['mean'], npload['std'] self.scaler = Scaler(mean=mean, std=std) # model_emb_path = 'data/epoch_17_test_eer0.191621.hdf5' model_path = 'data/cnn_model/epoch_66_val_loss1.206078.hdf5' model_emb_path = 'data/emb_model/model_10_epoch_10_test_eer0.169731_test2_err0.204908.hdf5' # model_path = 'data/cnn_model/epoch_16_val_loss1.231896.hdf5' # model_emb_path = 'data/emb_model/model_8_epoch_15_test_eer0.127431_test2_err0.218662.hdf5' # model_emb_path = 'data/emb_model/model_8_epoch_1_test_eer0.133520_test2_err0.216839.hdf5' # model_emb_path = 'data/emb_model/model_9_epoch_5_test_eer0.127574_test2_err0.229637.hdf5' # model_path = 'data/cnn_model/epoch_232_val_loss1.351451.hdf5' # model_emb_path = 'data/emb_model/model_1_epoch_0_test_eer0.114874.hdf5' # # model_path = 'data/cnn_model/epoch_57_val_loss1.699622.hdf5' # model_emb_path = 'data/emb_model/model_2_epoch_25_test_eer0.106689.hdf5' # model_path = 'data/cnn_model/epoch_29_val_loss1.441430.hdf5' # model_emb_path = 'data/emb_model/model_5_epoch_2_test_eer0.143211.hdf5' # model_emb_path = 'data/emb_model/model_6_epoch_6_test_eer_0.135497_test2_err0.254601.hdf5' # model_emb_path = '../data/Modeltpe2/epoch_0_test_eer0.139840.hdf5' # model_emb_path = '../data/Modeltpe3/epoch_12_test_eer0.107399.hdf5' # model_emb_path = 'data/emb_model/model_4_epoch_1_test_eer0.108006.hdf5' model = keras.models.load_model(model_path) self.model_emb = keras.models.load_model(model_emb_path) self.bottleneck = Bottleneck(model) npload = np.load('data/face_base.npz') self.x, self.y = npload['x'], npload['y'] print(self.x.shape, self.y.shape) with open('data/labels_dict.pkl', 'rb') as file: self.labels_dict = pickle.load(file) self.knn = KNeighborsClassifier(n_neighbors=1, metric=metric, n_jobs=1) self.knn.fit(self.x, self.y) def iterate_similarities(self, emb): for i, person_emb in enumerate(self.x): sim = person_emb @ emb.T yield sim, i def predict(self, img, img_gray, rect): img = align_img(img, img_gray, rect, self.shape_predictor, self.template_landmarks, self.eye_and_mouth_indices, self.resize_shape) batch_x = [img] import matplotlib.pyplot as plt # plt.imshow(img) # plt.show() batch_x = self.scaler.transform(batch_x) batch_x = self.bottleneck.predict(transpose_matrix(batch_x)) batch_x = self.model_emb.predict(batch_x) # batch_x = self.model_emb.predict(transpose_matrix(batch_x)) pred_labels = self.knn.predict(batch_x) neighbors = self.knn.kneighbors(batch_x) label_neighbors = [ self.labels_dict[self.y[ind]] for ind in neighbors[1][0] ] print(label_neighbors, neighbors[0]) # label_ind = max(self.iterate_similarities(batch_x[0]), key=lambda x: x[0])[1] # label = self.y[label_ind] label = pred_labels[0] return self.labels_dict[label], label_neighbors
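# The Recognizer above classifies a face by embedding it and taking the nearest neighbour in a
# gallery of known embeddings. A toy sklearn sketch of that last step; random vectors stand in
# for the CNN embeddings, and plain Euclidean distance replaces the custom `metric` used above.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
gallery = rng.normal(size=(50, 128))          # stand-in embeddings of known faces
labels = rng.integers(0, 10, size=50)         # person ids

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(gallery, labels)

query = gallery[7] + 0.01 * rng.normal(size=128)   # new embedding of the person at index 7
pred = knn.predict(query.reshape(1, -1))
dist, idx = knn.kneighbors(query.reshape(1, -1))
print(pred[0], labels[7], dist[0][0])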
def __init__(self, stateDim: int, actionDim: int, actionMin: np.array, actionMax: np.array,
             learningRate=0.0005, gamma=0.99, GAElambda=0.95, PPOepsilon=0.2,
             PPOentropyLossWeight=0, nHidden: int = 2, nUnitsPerLayer: int = 128,
             mode="PPO-CMA-m", activation="lrelu", H: int = 9,
             entropyLossWeight: float = 0, sdLowLimit=0.01, useScaler: bool = True,
             criticTimestepScale=0.001):
    # Create policy network
    print("Creating policy")
    self.actionMin = actionMin.copy()
    self.actionMax = actionMax.copy()
    self.actionDim = actionDim
    self.stateDim = stateDim
    self.useScaler = useScaler
    if useScaler:
        self.scaler = Scaler(stateDim)
    self.scalerInitialized = False
    self.normalizeAdvantages = True
    self.gamma = gamma
    self.GAElambda = GAElambda
    self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale  # with gamma==0, no need for this

    piEpsilon = None
    nHistory = 1
    negativeAdvantageAvoidanceSigma = 0
    if mode == "PPO-CMA" or mode == "PPO-CMA-m":
        usePPOLoss = False  # if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
        separateVarAdapt = True
        self.reluAdvantages = True if mode == "PPO-CMA" else False
        nHistory = H  # policy mean adapts immediately, policy covariance as an aggregate of this many past iterations
        useSigmaSoftClip = True
        negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0
    elif mode == "PPO":
        usePPOLoss = True  # use PPO's clipped surrogate loss function
        separateVarAdapt = False
        # separateSigmaAdapt = False
        self.reluAdvantages = False
        useSigmaSoftClip = True
        piEpsilon = 0
    else:
        raise ValueError("Unknown mode {}".format(mode))

    self.policy = Policy(stateDim, actionDim, actionMin, actionMax,
                         entropyLossWeight=PPOentropyLossWeight,
                         networkActivation=activation,
                         networkDepth=nHidden,
                         networkUnits=nUnitsPerLayer,
                         networkSkips=False,
                         learningRate=learningRate,
                         minSigma=sdLowLimit,
                         PPOepsilon=PPOepsilon,
                         usePPOLoss=usePPOLoss,
                         separateVarAdapt=separateVarAdapt,
                         nHistory=nHistory,
                         useSigmaSoftClip=useSigmaSoftClip,
                         piEpsilon=piEpsilon,
                         negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma)

    # Create critic network. stateDim + 1 because, at least in OpenAI Gym, episodes are
    # time-limited and the value estimates thus depend on simulation time, so we use the
    # time step as an additional feature for the critic. Note that this does not hurt
    # generalization, as the feature is not used by the policy during training or at runtime.
    print("Creating critic network")
    self.critic = Critic(stateDim=stateDim + 1,
                         learningRate=learningRate,
                         nHidden=nHidden,
                         networkUnits=nUnitsPerLayer,
                         networkActivation=activation,
                         useSkips=False,
                         lossType="L1")

    # Experience trajectory buffers for the memorize() and updateWithMemorized() methods
    self.experienceTrajectories = []
    self.currentTrajectory = []
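# Sketch (ours, not the repo's implementation): the advantage estimator implied by
# gamma and GAElambda above is standard GAE(lambda). With TD residuals
# delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), the advantage is
# A_t = sum_k (gamma * lambda)^k * delta_{t+k}. The function name is hypothetical.
import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    """Compute GAE(lambda) advantages for one finished episode."""
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.append(np.asarray(values, dtype=np.float64), 0.0)  # bootstrap V=0 at terminal
    deltas = rewards + gamma * values[1:] - values[:-1]            # TD residuals
    advantages = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages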
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, eval):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    if eval:
        print("Evaluating: ")
        evaluate(env_name, num_episodes)
        exit()
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # policy.restore_weights()
    # val_func.restore_weights()
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    print("Scaler vars,means: ")
    print(scaler.vars, scaler.means)
    for i in range(3):
        run_episode(env, policy, scaler, animate=True)
    # policy.save_weights()
    # val_func.save_weights()
    # WARNING: scaler is disabled
    logger.close()
    policy.close_sess()
    val_func.close_sess()
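# Sketch (ours): the Scaler used above normalizes observations with running
# statistics accumulated across batches. This is NOT the repo's Scaler, just a
# minimal illustration of the idea using the parallel mean/variance update
# (Chan et al.); the class and method names are hypothetical.
import numpy as np

class RunningScaler:
    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.n = 0

    def update(self, x):
        """x: (batch, obs_dim) array of raw observations."""
        batch_mean, batch_var, m = x.mean(axis=0), x.var(axis=0), x.shape[0]
        if self.n == 0:
            self.means, self.vars, self.n = batch_mean, batch_var, m
            return
        total = self.n + m
        delta = batch_mean - self.means
        self.means = self.means + delta * m / total
        self.vars = (self.n * self.vars + m * batch_var
                     + delta ** 2 * self.n * m / total) / total
        self.n = total

    def get(self):
        # (scale, offset) pair: obs_scaled = (obs - offset) * scale
        return 1.0 / (np.sqrt(self.vars) + 0.1), self.means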
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef, use_lr_adjust,
                ada_kl_penalty, seed, epochs, phi_epochs, max_timesteps, reg_scale,
                phi_lr, phi_hs, policy_size, phi_obj, load_model):
    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ, epochs, phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    val_func.load_val_model(load_dir)

    # extra rollout pass (return value unused; presumably to warm up the scaler)
    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)

    episode = 0
    trajectories, traj_len_list = run_policy(env, policy, scaler, num_episodes,
                                             max_timesteps=max_timesteps)
    num_traj = len(trajectories)

    logger.log("Avg Length %d total Length %d" % (
        np.mean(traj_len_list),
        np.sum(traj_len_list)))

    episode += len(trajectories)
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma)
    add_gae(trajectories, gamma, lam)

    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

    sub_folder = "eval_data/%s_%s_data_seed=%d_max-steps=%d" % (
        env_name, phi_obj, seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.mkdir(sub_folder)

    # save original (Monte Carlo) gradient
    mc_grad_info = policy.get_batch_gradient(observes, actions, advantages, c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes), 'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    policy.update(load_model, observes, actions, advantages,
                  use_lr_adjust, ada_kl_penalty, c=1)  # update policy

    stein_grad_info = policy.get_batch_gradient(observes, actions, advantages, c=1.)
    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes), 'wb') as fp:
        pickle.dump(stein_grad_info, fp)
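# Sketch (ours): loading the two pickles written above for offline comparison of
# the MC and Stein gradient estimates. The key 'traj_lens' comes from the code
# above; any other keys depend on policy.get_batch_gradient() and are not assumed
# here. Paths are placeholders for the sub_folder produced by eval_models().
import pickle
import numpy as np

def load_grad_info(path):
    with open(path, 'rb') as fp:
        return pickle.load(fp)

if __name__ == '__main__':
    mc = load_grad_info('eval_data/mc_num_episode=10.pkl')        # placeholder path
    stein = load_grad_info('eval_data/stein_num_episode=10.pkl')  # placeholder path
    print('episodes:', len(mc['traj_lens']),
          'mean trajectory length:', np.mean(mc['traj_lens']))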
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, restore_path, out_path,
         thread_count, animation_mode, gait_name, gait_length, gaits_config_path, reward_mask,
         log_rewards, gait_reward_weight, g_colab, progress_reward_weight, phase_time_limit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # restore_path = os.path.abspath(restore_path)
    env, obs_dim, act_dim = init_gym(env_name)
    log_rewards = log_rewards or (num_episodes == 0)
    env_list = []
    if thread_count > 1:
        env_list, obs_dim, act_dim = init_gyms(env_name, batch_size)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    start_time = datetime.now()  # create unique directories
    start_time_str = start_time.strftime("%b-%d/%H.%M.%S")
    logger = Logger(logname=env_name, now=start_time_str, out_path=out_path)
    env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name,
                       gait_cycle_len=gait_length, out_path=logger.path,
                       log_rewards=log_rewards, render_mode=animation_mode,
                       reward_mask=reward_mask, contact_reward=gait_reward_weight,
                       g_colab=g_colab, progress_weight=progress_reward_weight,
                       phase_time_limit=phase_time_limit)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, logger, restore_path)
    policy = Policy(obs_dim, act_dim, kl_targ, logger, restore_path)
    log_train_info(logger, num_episodes, start_time_str, gait_name, gait_length, batch_size,
                   restore_path, reward_mask, gait_reward_weight, progress_reward_weight,
                   phase_time_limit)
    # run a few episodes of untrained policy to initialize scaler:
    episode = 0
    try:
        if restore_path is None:
            print("\nInitializing scaler (may take some time)... ")
            run_policy(env, policy, scaler, logger, episodes=5)
            print("Done\n")
        else:
            scaler.load(restore_path, obs_dim)
        while episode < num_episodes:
            sim_time = datetime.now()
            if thread_count > 1:
                trajectories = run_policy_parallel(env_list, policy, scaler, logger,
                                                   episodes=batch_size,
                                                   thread_num=thread_count)
            else:
                trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
            sim_time = datetime.now() - sim_time
            episode += len(trajectories)
            add_value(trajectories, val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            # add various stats to training log:
            train_time = datetime.now() - start_time
            policy_time = datetime.now()
            policy.update(observes, actions, advantages, logger)  # update policy
            policy_time = datetime.now() - policy_time
            val_time = datetime.now()
            val_func.fit(observes, disc_sum_rew, logger)  # update value function
            val_time = datetime.now() - val_time
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode,
                            train_time, sim_time, policy_time, val_time)
            logger.write(display=True)  # write logger results to file and stdout
            print("Estimated time left: {}\n".format(
                estimate_time_left(episode, num_episodes, train_time)))
            if episode % 1000 == 0:
                policy.save()
                val_func.save()
                scaler.save(logger.path)
                print("Data saved at {}\n".format(logger.path))
                update_train_info(logger, episode)
                if animation_mode > 0:
                    run_policy(env, policy, scaler, logger, episodes=1, animate=True,
                               anim_name='epizode_{}'.format(episode))
            if episode % 5000 == 0:
                os.rename(os.path.join(logger.path, 'value_dump'),
                          os.path.join(logger.path, 'value_dump_' + str(episode)))
                os.rename(os.path.join(logger.path, 'policy_dump'),
                          os.path.join(logger.path, 'policy_dump_' + str(episode)))
            # if episode == 20000:
            #     reward_mask = 63
            #     env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name,
            #                        gait_cycle_len=gait_length, out_path=logger.path,
            #                        log_rewards=log_rewards, render_mode=animation_mode,
            #                        reward_mask=reward_mask, contact_reward=gait_reward_weight,
            #                        g_colab=g_colab)
            print("Progress Enabled")
            if killer.kill_now:
                # if input('Terminate training (y/[n])? ') == 'y':
                #     break
                # killer.kill_now = False
                break
    finally:
        if animation_mode > 0 or num_episodes == 0:
            print("Rendering result video")
            try:
                trajectories = run_policy(env, policy, scaler, logger, episodes=1, animate=True,
                                          anim_name='final_epizode_{}'.format(episode))
                # for walk analysis
                for t in trajectories:
                    logger.log_trajectory(t)
            except Exception as e:
                print("Failed to animate results, error: {}".format(e))
                raise e
        scaler.save(logger.path)
        policy.close_sess()
        val_func.close_sess()
        update_train_info(logger, episode)
        logger.close()