def main(): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ env_name = 'HumanoidasimoMRD4_2-v1' #env_name='Humanoid-v1' num_episodes = 5000000 gamma = 0.995 lam = 0.98 kl_targ = 0.003 batch_size = 32 hid1_mult = 10 policy_logvar = -1.0 killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join( '/home/initial/eclipse-workspace4/test/trpo-master/src/result', env_name, now) env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult, filename2) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, filename=filename1) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if ((episode % (batch_size * 3) == 0)): # & (name == "local_thread3")): #print(['stop']) policy.save(episode, filename1) val_func.save(episode, filename2) #loger.flush() if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
    """ Main training loop

    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)

    episode = 0
    # initialize reward list (to keep track of improvements)
    avg_rew_list = []
    while episode < num_episodes:
        print(episode)
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)      # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)      # calculate advantages
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)          # update value function
        avg_rew_list.append(avg_rewards(trajectories))

        # every 20000 episodes, save the models (value_func, policy, scaler) and average rewards
        if not episode % 20000:
            print("Saving models")
            policy.save(episode)
            val_func.save(episode)
            with open("models/scaler-" + str(episode) + ".pkl", 'wb') as f:
                pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
            with open("models/rewards-" + str(episode) + ".pkl", 'wb') as f2:
                pickle.dump(deepcopy(avg_rew_list), f2, pickle.HIGHEST_PROTOCOL)

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    # show animation at the end of training
    while True:
        obs = env.reset()
        step = 0.0
        scale, offset = scaler.get()
        scale[-1] = 1.0   # don't scale the time step feature
        offset[-1] = 0.0  # don't offset the time step feature
        done = False
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            obs = (obs - offset) * scale
            action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            env.render1()
            env.render2()
            step += 1e-3

    policy.close_sess()
    val_func.close_sess()
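# A minimal, hypothetical entry point for the parameterized main() above. The flag
# names mirror the function signature; the default values are illustrative
# assumptions, not settings taken from this repository.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Train a policy with TRPO-style updates')
    parser.add_argument('-n', '--num_episodes', type=int, default=1000,
                        help='maximum number of episodes to run')
    parser.add_argument('-g', '--gamma', type=float, default=0.995,
                        help='reward discount factor')
    parser.add_argument('-l', '--lam', type=float, default=0.98,
                        help='lambda for Generalized Advantage Estimation')
    parser.add_argument('-k', '--kl_targ', type=float, default=0.003,
                        help='D_KL target for the policy update')
    parser.add_argument('-b', '--batch_size', type=int, default=20,
                        help='episodes per policy training batch')
    parser.add_argument('-m', '--hid1_mult', type=int, default=10,
                        help='size of first hidden layer as a multiple of obs_dim')
    parser.add_argument('-v', '--policy_logvar', type=float, default=-1.0,
                        help='natural log of the initial policy variance')
    args = parser.parse_args()
    main(**vars(args))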
class Central_agent:
    # Shared policy/value networks updated from trajectories collected by worker
    # threads. Relies on module-level globals: obs_dim, act_dim, hid1_mult, kl_targ,
    # policy_logvar, batch_size, logger, filename1, filename2.
    def __init__(self):
        with tf.name_scope("central_agent"):
            self.val_func = NNValueFunction(obs_dim, hid1_mult)
            self.policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
        self.num_tuple = 0

    def update_parameter_server(self, episode, trajectories, name):
        self.num_tuple += len(trajectories)
        if len(trajectories) < batch_size:
            return
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = self.build_train_set(trajectories)
        # add various stats to training log:
        self.log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        self.policy.update(observes, actions, advantages, logger)  # update policy
        self.val_func.fit(observes, disc_sum_rew, logger)          # update value function
        logger.write(display=True)  # write logger results to file and stdout
        print(['thread_name: ' + name + ', episode: ' + str(episode) +
               ', tuples: ' + str(self.num_tuple)])
        if episode % (batch_size * 3) == 0:  # & (name == "local_thread3")
            self.policy.save(episode, filename1)
            self.val_func.save(episode, filename2)

    def build_train_set(self, trajectories):
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        # normalize advantages before the policy update
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

        return observes, actions, advantages, disc_sum_rew

    def log_batch_stats(self, observes, actions, advantages, disc_sum_rew, logger, episode):
        logger.log({
            '_mean_obs': np.mean(observes),
            '_min_obs': np.min(observes),
            '_max_obs': np.max(observes),
            '_std_obs': np.mean(np.var(observes, axis=0)),
            '_mean_act': np.mean(actions),
            '_min_act': np.min(actions),
            '_max_act': np.max(actions),
            '_std_act': np.mean(np.var(actions, axis=0)),
            '_mean_adv': np.mean(advantages),
            '_min_adv': np.min(advantages),
            '_max_adv': np.max(advantages),
            '_std_adv': np.var(advantages),
            '_mean_discrew': np.mean(disc_sum_rew),
            '_min_discrew': np.min(disc_sum_rew),
            '_max_discrew': np.max(disc_sum_rew),
            '_std_discrew': np.var(disc_sum_rew),
            '_Episode': episode
        })
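# Central_agent acts as a shared parameter server: worker threads collect trajectories
# and hand them over once a full batch has accumulated (update_parameter_server()
# ignores anything smaller than batch_size). The function below is a rough worker-side
# sketch of that hand-off; it assumes the same module-level globals the class itself
# relies on (gamma, lam, num_episodes, batch_size, logger) and the run_policy/add_*
# helpers used by the main() variants in this file. The worker_loop name and its
# buffering logic are illustrative, not part of the original code.
def worker_loop(name, env, central_agent, scaler):
    episode = 0
    buffered = []
    while episode < num_episodes:
        trajectories = run_policy(env, central_agent.policy, scaler, logger,
                                  episodes=batch_size)
        add_value(trajectories, central_agent.val_func)  # estimated state values
        add_disc_sum_rew(trajectories, gamma)            # discounted returns
        add_gae(trajectories, gamma, lam)                # GAE advantages
        buffered.extend(trajectories)
        episode += len(trajectories)
        # only push once enough episodes have accumulated
        if len(buffered) >= batch_size:
            central_agent.update_parameter_server(episode, buffered, name)
            buffered = []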
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, restore_path,
         out_path, thread_count, animation_mode, gait_name, gait_length,
         gaits_config_path, reward_mask, log_rewards, gait_reward_weight,
         g_colab, progress_reward_weight, phase_time_limit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # restore_path = os.path.abspath(restore_path)
    env, obs_dim, act_dim = init_gym(env_name)
    log_rewards = log_rewards or (num_episodes == 0)
    env_list = []
    if thread_count > 1:
        env_list, obs_dim, act_dim = init_gyms(env_name, batch_size)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    start_time = datetime.now()  # create unique directories
    start_time_str = start_time.strftime("%b-%d/%H.%M.%S")
    logger = Logger(logname=env_name, now=start_time_str, out_path=out_path)
    env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name,
                       gait_cycle_len=gait_length, out_path=logger.path,
                       log_rewards=log_rewards, render_mode=animation_mode,
                       reward_mask=reward_mask, contact_reward=gait_reward_weight,
                       g_colab=g_colab, progress_weight=progress_reward_weight,
                       phase_time_limit=phase_time_limit)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, logger, restore_path)
    policy = Policy(obs_dim, act_dim, kl_targ, logger, restore_path)
    log_train_info(logger, num_episodes, start_time_str, gait_name, gait_length,
                   batch_size, restore_path, reward_mask, gait_reward_weight,
                   progress_reward_weight, phase_time_limit)

    # run a few episodes of untrained policy to initialize scaler:
    episode = 0
    try:
        if restore_path is None:
            print("\nInitializing scaler (may take some time)...")
            run_policy(env, policy, scaler, logger, episodes=5)
            print("Done\n")
        else:
            scaler.load(restore_path, obs_dim)

        while episode < num_episodes:
            sim_time = datetime.now()
            if thread_count > 1:
                trajectories = run_policy_parallel(env_list, policy, scaler, logger,
                                                   episodes=batch_size,
                                                   thread_num=thread_count)
            else:
                trajectories = run_policy(env, policy, scaler, logger,
                                          episodes=batch_size)
            sim_time = datetime.now() - sim_time
            episode += len(trajectories)
            add_value(trajectories, val_func)      # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of rewards
            add_gae(trajectories, gamma, lam)      # calculate advantages
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

            # add various stats to training log:
            train_time = datetime.now() - start_time
            policy_time = datetime.now()
            policy.update(observes, actions, advantages, logger)  # update policy
            policy_time = datetime.now() - policy_time
            val_time = datetime.now()
            val_func.fit(observes, disc_sum_rew, logger)  # update value function
            val_time = datetime.now() - val_time
            log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                            episode, train_time, sim_time, policy_time, val_time)
            logger.write(display=True)  # write logger results to file and stdout
            print("Estimated time left: {}\n".format(
                estimate_time_left(episode, num_episodes, train_time)))

            if episode % 1000 == 0:
                policy.save()
                val_func.save()
                scaler.save(logger.path)
                print("Data saved at {}\n".format(logger.path))
                update_train_info(logger, episode)
                if animation_mode > 0:
                    run_policy(env, policy, scaler, logger, episodes=1, animate=True,
                               anim_name='epizode_{}'.format(episode))
            if episode % 5000 == 0:
                os.rename(os.path.join(logger.path, 'value_dump'),
                          os.path.join(logger.path, 'value_dump_' + str(episode)))
                os.rename(os.path.join(logger.path, 'policy_dump'),
                          os.path.join(logger.path, 'policy_dump_' + str(episode)))

            # if episode == 20000:
            #     reward_mask = 63
            #     env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name,
            #                        gait_cycle_len=gait_length, out_path=logger.path,
            #                        log_rewards=log_rewards, render_mode=animation_mode,
            #                        reward_mask=reward_mask, contact_reward=gait_reward_weight,
            #                        g_colab=g_colab)
            print("Progress Enabled")

            if killer.kill_now:
                # if input('Terminate training (y/[n])? ') == 'y':
                #     break
                # killer.kill_now = False
                break
    finally:
        if animation_mode > 0 or num_episodes == 0:
            print("Rendering result video")
            try:
                trajectories = run_policy(env, policy, scaler, logger, episodes=1,
                                          animate=True,
                                          anim_name='final_epizode_{}'.format(episode))
                # for walk analysis
                for t in trajectories:
                    logger.log_trajectory(t)
            except Exception as e:
                print("Failed to animate results, error: {}".format(e))
                raise e
        scaler.save(logger.path)
        policy.close_sess()
        val_func.close_sess()
        update_train_info(logger, episode)
        logger.close()