Example #1
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    #TODO Replace init_gym with one of my functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":","_")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    #Replace wrappers.Monitor with a class of mine that controls the simulation
    #I think the wrapper is useless for my example
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)

    #This is to feed the policy with the optimum
    trajectories = initiatePolicyWithOptimum(env, policy, scaler, logger)
    add_value(trajectories, val_func)  # add estimated values to episodes
    add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
    add_gae(trajectories, gamma, lam)  # calculate advantage
    # concatenate all episodes into single NumPy arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    print(actions.shape)
    policy.update(observes, actions, advantages, logger)  # update policy
    val_func.fit(observes, disc_sum_rew, logger)  # update value function

    # Not sure whether this is still necessary
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
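
The trajectory post-processing helpers called in every example (add_value, add_disc_sum_rew, add_gae, build_train_set) are not shown at module level; below is a minimal sketch, assuming they mirror the private methods of the Policy class in Example #16.

import numpy as np
import scipy.signal


def discount(x, gamma):
    """Discounted forward cumulative sum of a sequence at each point."""
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]


def add_value(trajectories, val_func):
    """Add predicted state values to each trajectory (mutates in place)."""
    for t in trajectories:
        t['values'] = val_func.predict(t['observes'])


def add_disc_sum_rew(trajectories, gamma):
    """Add discounted sum of rewards ('disc_sum_rew') to each trajectory."""
    for t in trajectories:
        rewards = t['rewards'] * (1 - gamma) if gamma < 0.999 else t['rewards']
        t['disc_sum_rew'] = discount(rewards, gamma)


def add_gae(trajectories, gamma, lam):
    """Add generalized advantage estimates ('advantages') to each trajectory."""
    for t in trajectories:
        rewards = t['rewards'] * (1 - gamma) if gamma < 0.999 else t['rewards']
        values = t['values']
        tds = rewards - values + np.append(values[1:] * gamma, 0)  # TD residuals
        t['advantages'] = discount(tds, gamma * lam)


def build_train_set(trajectories):
    """Concatenate all episodes into single NumPy arrays and normalize advantages."""
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
    return observes, actions, advantages, disc_sum_rew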
Example #2
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                 use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                 max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                 phi_obj, load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)

    episode = 0
    for _ in range(200):
        trajectories, traj_len_list = run_policy(env,
                                                 policy,
                                                 scaler,
                                                 num_episodes,
                                                 max_timesteps=max_timesteps)

        num_traj = len(trajectories)

        episode += len(trajectories)
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)

        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        policy.update(load_model,
                      observes,
                      actions,
                      advantages,
                      use_lr_adjust,
                      ada_kl_penalty,
                      c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    logger.log("saved model")
Example #3
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)

    episode = 0

    # to create new file at beginning of trial
    #f= open("coor_state.txt","w")
    #f.close

    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  arg,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
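
GracefulKiller is referenced but never defined in these listings; a plausible minimal implementation, assumed from how kill_now is checked and reset in the loops above, simply traps SIGINT/SIGTERM.

import signal


class GracefulKiller:
    """Set kill_now when SIGINT or SIGTERM arrives so the training loop can stop cleanly."""

    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self._exit_gracefully)
        signal.signal(signal.SIGTERM, self._exit_gracefully)

    def _exit_gracefully(self, signum, frame):
        self.kill_now = True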
Example #4
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b_%d_%H_%M_%S")  # create unique directories
    #logger = Logger(logname=env_name, now=now)
    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)

    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('videos', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
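
Each example widens obs_dim by one for a time-step feature handled inside run_episode(), which is not listed here. Below is a sketch of that rollout, assuming it works like the rendering loop at the end of Example #11 (unscaled time-step feature, incremented by 1e-3 per step).

import numpy as np


def run_episode(env, policy, scaler):
    """Roll out one episode; append the time-step feature and scale observations."""
    obs = env.reset()
    observes, actions, rewards, unscaled_obs = [], [], [], []
    done = False
    step = 0.0
    scale, offset = scaler.get()
    scale[-1] = 1.0   # don't scale the time-step feature
    offset[-1] = 0.0  # don't offset the time-step feature
    while not done:
        obs = obs.astype(np.float32).reshape((1, -1))
        obs = np.append(obs, [[step]], axis=1)  # add time-step feature
        unscaled_obs.append(obs)
        obs = (obs - offset) * scale
        observes.append(obs)
        action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
        actions.append(action)
        obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
        rewards.append(reward)
        step += 1e-3  # time-step feature grows slowly to stay well scaled
    return (np.concatenate(observes), np.concatenate(actions),
            np.array(rewards, dtype=np.float64), np.concatenate(unscaled_obs))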
Example #5
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar,
        scenario,  num_agents, action_dim, timesteps):
    """ Main training loop

    Args:
        scenario: multi-agent particle environment scenario name (passed to make_env)
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    # env, obs_dim, act_dim = init_gym(env_name)
    env = make_env(scenario)
    obs_dims = env.observation_space
    act_dims = [env.action_space[0].n for i in range(env.n)]
   
    obs_dims = [obs_dim.shape[0] + 1 for obs_dim in obs_dims]  # add 1 to obs dimension for time step feature (see run_episode())
  
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=scenario, now=now)
    aigym_path = os.path.join('/tmp', scenario, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dims)
    val_func = NNValueFunction(obs_dims[0]+act_dims[0], hid1_mult)
    policys = []

    for i in range(num_agents):
        policys.append(Policy(i, obs_dims[i], act_dims[0], kl_targ, hid1_mult, policy_logvar, num_agents-1, timesteps))
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policys, scaler, logger,  act_dims[0], timesteps, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policys, scaler, logger, act_dims[0],timesteps, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, intents, act_trajs,  advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        # log_batch_stats(observes, actions,intents, act_trajs,  advantages, disc_sum_rew, logger, episode)
        for i, policy in enumerate(policys):

            policy.update(observes[i], actions[i], intents[i], act_trajs[i], advantages[i], logger)  # update policy
            val_func.fit(observes[i]+intents[i], disc_sum_rew[i], logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    for policy in policys:
        policy.close_sess()
    val_func.close_sess()
Example #6
def main(arglist):
    """ Main training loop

    Args (read from arglist):
        scenario: multi-agent environment scenario name
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        b_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    # env, obs_dim, act_dim = init_gym(aenv_name)
    env = make_env(arglist.scenario, arglist)
    obs_dim = env.observation_space[0].shape[0]
    act_dim = env.action_space[0].n
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('/tmp', arglist.scenario, now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, arglist.hid1_mult)
    trainers, loggers = get_trainers(env, arglist.num_adversaries, obs_dim, act_dim, arglist)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, trainers, scaler, loggers, arglist.max_episode_len , episodes=5)
    episode = 0
    while episode < arglist.num_episodes:
        trajectories = run_policy(env, trainers, scaler, loggers, arglist.max_episode_len ,  episodes=arglist.b_size)
        episode += len(trajectories[0])
        print("episode: {}".format(episode))
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, arglist.gamma)
        add_gae(trajectories, arglist.gamma, arglist.lam)
        observations, actions, advantages, disc_sum_rews = build_train_set(trajectories)
        log_batch_stats(observations, actions, advantages, disc_sum_rews, loggers, episode)
        for i in range(len(trainers)):
            trainers[i].update(observations[i], actions[i], advantages[i], loggers[i])
            val_func.fit(observations[i], disc_sum_rews[i], loggers[i])  
            loggers[i].write(display=True)  

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if episode % arglist.save_rate == 0:
            print("Episode {} complete".format(episode))
            
        # score = play(env, policy1, policy2)   
    for i in range(len(loggers)):
        loggers[i].close()
        trainers[i].close_sess()
    val_func.close_sess()
Example #7
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    #capture = False
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        """if episode > 600 and not capture:
               env.ScreenCapture(5)
               capture = True"""
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #8
def main(num_episodes, gamma, lam, kl_targ, batch_size, env_name="Hopper-v2"):
    """ Main training loop
    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = (datetime.datetime.utcnow() - datetime.timedelta(hours=4)).strftime("%b-%d_%H:%M:%S")  # create unique directories based on local (UTC-4) time
    logger = Logger(logname=env_name, now=now)
    plotter = Plot(plotname=env_name+"-Fig", now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)  # recording, dir??
    scaler = Scaler(obs_dim)        # obs_dim=377
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)  # kl target=0.003 by default
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, plotter, episodes=5, plot=False)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, plotter, episodes=batch_size)
        episode += len(trajectories)    # length of trajectories equals batch size which by default is 20
        plotter.updateEpisodes(episode)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger, plotter)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    plotter.plot()
    # plt.show()

    policy.close_sess()
    val_func.close_sess()
Example #9
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/home/vatsal', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #10
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
	'''
	Main training loop 

	Args:
		env_name: Robot model name
		num_episodes: maximum number of episodes to run (int)
		gamma: reward discount factor (float)
		lam: lambda for Generalized Advantage Estimate
		kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
		batch_size: number of episodes per policy training batch
	'''
	env, obs_dim, act_dim = init_env(env_name)
	obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode())
	now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":","_")  # create unique directories
	logger = Logger(logname=env_name, now=now)
	pathFolder = logger.pathFolder
	scaler = Scaler(obs_dim)
	val_func = NNValueFunction(obs_dim)
	policy = Policy(obs_dim, act_dim, kl_targ)
	acumulator = BestAcumulator()
	#TODO add the sampling part once everything works

	# run a few episodes of untrained policy to initialize scaler:
	run_policy(env, policy, scaler, logger, 5, acumulator)
	episode = 0
	while episode < num_episodes:
		trajectories = run_policy(env, policy, scaler, logger, batch_size, acumulator)
		episode += len(trajectories)
		add_value(trajectories, val_func) # add estimated values to episodes
		add_disc_sum_rew(trajectories, gamma) # calculate discounted sum of Rs
		add_gae(trajectories, gamma, lam) # calculate advantage
		# concatenate all episodes into single NumPy arrays
		observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
		# add various stats to train log:
		log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
		policy.update(observes, actions, advantages, logger)  # update policy
		val_func.fit(observes, disc_sum_rew, logger)  # update value function
		logger.write(display=True)  # write logger results to file and stdout
	acumulator.save(pathFolder)
	logger.close()
	policy.close_sess(pathFolder)
	val_func.close_sess(pathFolder)
Example #11
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """ Main training loop
    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)
    episode = 0
    # Initialize reward list (to keep track of improvements)
    avg_rew_list = []
    while episode < num_episodes:
        print(episode)
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function
        avg_rew_list.append(avg_rewards(trajectories))
        # Save models (value_func, policy, scaler) and average rewards every 20000 episodes
        if not episode % 20000:
            print("Saving models")
            policy.save(episode)
            val_func.save(episode)
            f = open("models/scaler-" + str(episode) + ".pkl", 'wb')
            pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
            f.close()
            f2 = open("models/rewards-" + str(episode) + ".pkl", 'wb')
            pickle.dump(deepcopy(avg_rew_list), f2, pickle.HIGHEST_PROTOCOL)
            f2.close()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # Show animation at the end of training (this loop runs until the process is interrupted)
    while True:
        obs = env.reset()
        step = 0.0
        scale, offset = scaler.get()
        scale[-1] = 1.0
        offset[-1] = 0.0
        done = False
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)
            obs = (obs - offset) * scale
            action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            env.render1()
            env.render2()
            step += 1e-3
    policy.close_sess()
    val_func.close_sess()
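
The Scaler used throughout keeps running observation statistics: update() folds in a batch of unscaled observations and get() returns (scale, offset) so observations are normalized as (obs - offset) * scale. A sketch under that assumption; the exact scale constant of the original class is not shown in these listings.

import numpy as np


class Scaler(object):
    """Running mean/variance estimator for observation normalization (sketch)."""

    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.m = 0  # number of observations folded in so far

    def update(self, x):
        """Update running statistics with a batch of unscaled observations, shape (N, obs_dim)."""
        n = x.shape[0]
        if self.m == 0:
            self.means = x.mean(axis=0)
            self.vars = x.var(axis=0)
        else:
            new_means = x.mean(axis=0)
            delta = new_means - self.means
            tot = self.m + n
            self.vars = (self.m * self.vars + n * x.var(axis=0) +
                         delta ** 2 * self.m * n / tot) / tot
            self.means = self.means + delta * n / tot
        self.m += n

    def get(self):
        """Return (scale, offset); observations are normalized as (obs - offset) * scale."""
        return 1.0 / (np.sqrt(self.vars) + 0.1), self.means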
Example #12
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, weights_path, init_episode, experiment_name, resume):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    logger = Logger(logname=env_name, sub_dir=experiment_name)
    aigym_path = os.path.join('results', env_name, experiment_name)

    if resume:
        weights_path = aigym_path
        ckpt = tf.train.get_checkpoint_state(weights_path)
        init_episode = int(
            os.path.basename(ckpt.model_checkpoint_path).split('-')[1])

    env, obs_dim, act_dim = init_gym(env_name)
    # obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    weights_path)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = init_episode
    while episode <= num_episodes:
        if episode % 1000 == 0:
            # record one episode
            record(env_name, aigym_path, policy, scaler)
            policy.save(aigym_path, episode)
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    #record one last episode
    record(env_name, aigym_path, policy, scaler)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #13
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size,
         net_size_factor, noise_bias, weight, use_ppoclip):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    now = datetime.now().strftime("%b-%d_%H:%M:%S") + "_single"
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    if weight == "None":
        val_func = NNValueFunction(obs_dim, net_size_factor=net_size_factor)
        policy = None
        if use_ppoclip == "False":
            policy = Policy(obs_dim,
                            act_dim,
                            kl_targ,
                            net_size_factor=net_size_factor,
                            noise_bias=noise_bias)
        elif use_ppoclip == "True":
            policy = PolicyClip(obs_dim,
                                act_dim,
                                kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
            #assert False, "Not tested"
        else:
            assert False, "Unreachable"
    else:
        token = weight.split(".")
        token[-3] = token[-3][:-5] + "value"
        weight_2 = ".".join(token)
        val_func = NNValueFunctionContinue(weight_2,
                                           obs_dim,
                                           net_size_factor=net_size_factor)
        policy = PolicyContinue(weight,
                                obs_dim,
                                act_dim,
                                kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger,
                      scaler)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    # with open("test_dump", 'w') as f:
    #     pickle.dump(policy, f)
    policy.close_sess()
    val_func.close_sess()
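
Example #13 switches between Policy (an adaptive-KL penalty update) and PolicyClip (a clipped-surrogate PPO update) via use_ppoclip. Neither objective is shown in these listings; below is an illustrative NumPy sketch of what each typically minimizes. The hinge term and coefficient names are assumptions, not the repositories' API.

import numpy as np


def adaptive_kl_loss(logp_new, logp_old, advantages, kl, beta, eta, kl_targ):
    """Surrogate loss with an adaptive KL penalty plus a hinge on large KL (assumed form)."""
    ratio = np.exp(logp_new - logp_old)
    return (-np.mean(ratio * advantages)
            + beta * kl
            + eta * max(0.0, kl - 2.0 * kl_targ) ** 2)


def clipped_surrogate_loss(logp_new, logp_old, advantages, clip_eps=0.2):
    """Standard PPO clipped-surrogate loss."""
    ratio = np.exp(logp_new - logp_old)
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    return -np.mean(np.minimum(ratio * advantages, clipped * advantages))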
Example #14
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj,
                load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    # scaler = Scaler(obs_dim)
    logger.log("loading scaler")
    with open('models/scaler/scaler.pkl', 'rb') as input:
        scaler = pickle.load(input)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    load_v = False  # whether to load the value function baseline or train it from scratch; no big impact on Stein
    if load_v == True:
        val_func.load_val_model(load_dir)

    episode = 0

    trajectories, traj_len_list = run_policy(env,
                                             policy,
                                             scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps,
                                             mode=load_model)

    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d"%( \
            np.mean(traj_len_list), \
            np.sum(traj_len_list)))

    episode += len(trajectories)

    #Split data into validation and training data
    random.shuffle(trajectories)
    t_trajectories = trajectories[:int(len(trajectories) / 2)]
    v_trajectories = trajectories[int(len(trajectories) / 2):]

    refit_v = True  # whether to refit the value function baseline once more before evaluating; no big impact on Stein
    if refit_v == True:
        tt_trajectories = copy.deepcopy(t_trajectories)
        add_value(tt_trajectories, val_func)
        add_disc_sum_rew(tt_trajectories, gamma)
        add_gae(tt_trajectories, gamma, lam)
        tt_observes, tt_actions, tt_advantages, tt_disc_sum_rew = build_train_set(
            tt_trajectories)
        logger.log("refit value function baseline")
        val_func.fit(tt_observes, tt_disc_sum_rew)  # update value function
        logger.log("done")

    # build training data after refit v
    add_value(t_trajectories, val_func)
    add_disc_sum_rew(t_trajectories, gamma)
    add_gae(t_trajectories, gamma, lam)
    t_observes, t_actions, t_advantages, t_disc_sum_rew = build_train_set(
        t_trajectories)

    # build validation data after refit v
    add_value(v_trajectories, val_func)
    add_disc_sum_rew(v_trajectories, gamma)
    add_gae(v_trajectories, gamma, lam)
    v_observes, v_actions, v_advantages, v_disc_sum_rew = build_train_set(
        v_trajectories)

    sub_folder = "max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\
                        max_timesteps, env_name, phi_obj,
                        seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.mkdir(sub_folder)

    # save original gradient
    mc_grad_info = policy.get_batch_gradient(v_observes,
                                             v_actions,
                                             v_advantages,
                                             c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    d = Dataset(dict(ob=t_observes,
                     ac=t_actions,
                     atarg=t_advantages,
                     vtarg=t_disc_sum_rew),
                shuffle=True)
    for _ in range(phi_epochs):  # optim_epochs
        for batch in d.iterate_once(128):  # optim_batchsize
            policy.update(load_model,
                          batch['ob'],
                          batch['ac'],
                          batch['atarg'],
                          use_lr_adjust,
                          ada_kl_penalty,
                          c=1)  # update policy

    stein_grad_info = policy.get_batch_gradient(v_observes, \
                    v_actions, v_advantages, c=1.)

    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(stein_grad_info, fp)
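
eval_models iterates minibatches with a Dataset helper whose implementation is not listed (it resembles the Dataset class in OpenAI Baselines' common.dataset). A minimal stand-in with the same iterate_once behavior, for reference:

import numpy as np


class MiniDataset(object):
    """Minimal stand-in for the Dataset helper used above (assumed behavior)."""

    def __init__(self, data_map, shuffle=True):
        self.data_map = data_map
        self.shuffle = shuffle
        self.n = next(iter(data_map.values())).shape[0]

    def iterate_once(self, batch_size):
        """Yield dict minibatches covering the data exactly once."""
        idx = np.arange(self.n)
        if self.shuffle:
            np.random.shuffle(idx)
        for start in range(0, self.n, batch_size):
            sel = idx[start:start + batch_size]
            yield {k: v[sel] for k, v in self.data_map.items()}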
Example #15
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    # killer = GracefulKiller()

    env, obs_dim, act_dim = init_gym(env_name)
    env.seed(111 + mpi_util.rank)
    mpi_util.set_global_seeds(111 + mpi_util.rank)

    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    if mpi_util.rank == 0:
        now = datetime.utcnow().strftime(
            "%b-%d_%H:%M:%S")  # create unique directories
        aigym_path = os.path.join('/tmp', env_name, now)
        env = wrappers.Monitor(env, aigym_path, force=True)
        logger = Logger(logname=env_name, now=now)

    policy = Policy(obs_dim, act_dim, kl_targ)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)

    if mpi_util.rank == 0:
        # run a few episodes (on node 0) of untrained policy to initialize scaler:
        trajectories = run_policy(env, policy, scaler, episodes=5)

        unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
        scaler.update(
            unscaled)  # update running statistics for scaling observations

    # broadcast policy weights, scaler, val_func
    (policy, scaler,
     val_func) = mpi_util.broadcast_policy_scaler_val(policy, scaler, val_func)

    worker_batch_size = int(batch_size / mpi_util.nworkers)  # HACK
    if (worker_batch_size * mpi_util.nworkers != batch_size):
        print("batch_size:", batch_size, " is not divisible by nworkers:",
              mpi_util.nworkers)
        exit(1)

    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  episodes=worker_batch_size)
        trajectories = mpi_util.gather_trajectories(trajectories)

        if mpi_util.rank == 0:
            # concatenate trajectories into one list
            trajectories = list(itertools.chain.from_iterable(trajectories))
            print("did a batch of ", len(trajectories), " trajectories")
            print([t['rewards'].sum() for t in trajectories])

            episode += len(trajectories)
            add_value(trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage

            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)

            # add various stats to training log:
            logger.log({
                '_MeanReward':
                np.mean([t['rewards'].sum() for t in trajectories]),
                'Steps':
                np.sum([t['observes'].shape[0] for t in trajectories])
            })
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode)

            policy.update(observes, actions, advantages,
                          logger)  # update policy
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function

            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in trajectories])
            scaler.update(
                unscaled)  # update running statistics for scaling observations

            logger.write(
                display=True)  # write logger results to file and stdout

        # if mpi_util.rank == 0 and killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)

    if mpi_util.rank == 0: logger.close()
    policy.close_sess()
    if mpi_util.rank == 0: val_func.close_sess()
Example #16
class Policy():
    def __init__(self,
                 name,
                 obs_dim,
                 act_dim,
                 n_ways,
                 batch_size,
                 log_path,
                 gamma=0.995,
                 lam=0.98,
                 kl_targ=0.003,
                 hid1_mult=10,
                 policy_logvar=1.0):
        self.name = name
        self.obs_dim, self.act_dim = obs_dim, act_dim
        self.n_ways = n_ways
        self.batch_size = batch_size
        self.gamma = gamma
        self.lam = lam
        self.kl_targ = kl_targ
        self.hid1_mult = hid1_mult
        self.policy_logvar = policy_logvar
        self.logger = Logger(logname=os.path.join(log_path, name),
                             now=datetime.utcnow().strftime("%b_%d_%H_%M_%S"))

        self.scaler = Scaler(self.obs_dim)
        self.val_func = NNValueFunction(self.obs_dim, hid1_mult=hid1_mult)
        self.trpo_net = TrpoNet(name,
                                self.obs_dim,
                                self.act_dim,
                                n_ways=n_ways,
                                kl_targ=kl_targ,
                                hid1_mult=hid1_mult,
                                policy_logvar=policy_logvar)

        self.trajectories = []
        self.episode = 0

    def update_scaler(self, unscaled):
        self.scaler.update(
            unscaled)  # update running statistics for scaling observations

    def update(self,
               unscaled_obs,
               actions,
               rewards,
               env_idx=-1,
               trainWeight=False):
        scale, offset = self.scaler.get()
        scale[-1] = 1.0
        offset[-1] = 0.0
        observes = (unscaled_obs - offset) * scale
        trajectory = {
            'observes': observes,
            'actions': actions,
            'rewards': rewards,
            'unscaled_obs': unscaled_obs
        }
        self.trajectories.append(trajectory)
        if len(self.trajectories) > self.batch_size:
            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in self.trajectories])
            self.scaler.update(
                unscaled)  # update running statistics for scaling observations
            self.logger.log({
                '_{}_MeanReward'.format(self.name):
                np.mean([t['rewards'].sum() for t in self.trajectories]),
                '_{}_steps'.format(self.name):
                unscaled.shape[0] / self.batch_size
            })
            trajs = copy.deepcopy(self.trajectories)
            self.trajectories = []

            self.episode += len(trajs)
            self._add_value(trajs,
                            self.val_func)  # add estimated values to episodes
            self._add_disc_sum_rew(
                trajs, self.gamma)  # calculate discounted sum of Rs
            self._add_gae(trajs, self.gamma, self.lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = self._build_train_set(
                trajs)
            self._log_batch_stats(observes, actions, advantages, disc_sum_rew,
                                  self.logger, self.episode)
            self.trpo_net.update(observes,
                                 actions,
                                 advantages,
                                 env_idx,
                                 self.logger,
                                 trainWeight=trainWeight)  # update policy
            self.val_func.fit(observes, disc_sum_rew,
                              self.logger)  # update value function

            self.logger.write(display=False)

    def act(self, unscaled_obs):
        scale, offset = self.scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature
        #print(self.name,unscaled_obs.shape,len(offset))
        obs = (unscaled_obs - offset) * scale
        action = self.trpo_net.sample(obs).reshape((1, -1)).astype(np.float32)
        return action

    def addway(self):
        self.n_ways += 1

        var_dict = self.trpo_net.get_vars()
        new_pi = TrpoNet(self.name, self.obs_dim, self.act_dim, self.n_ways,
                         self.kl_targ, self.hid1_mult, self.policy_logvar)
        new_pi.set_vars(var_dict)
        self.trpo_net.close_sess()
        self.trpo_net = new_pi
        gc.collect()

    def close_session(self):
        self.val_func.close_sess()
        self.trpo_net.close_sess()

    def _discount(self, x, gamma):
        """ Calculate discounted forward sum of a sequence at each point """
        return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

    def _add_value(self, trajectories, val_func):
        """ Adds estimated value to all time steps of all trajectories

        Args:
            trajectories: as returned by run_policy()
            val_func: object with predict() method, takes observations
                and returns predicted state value

        Returns:
            None (mutates trajectories dictionary to add 'values')
        """
        for trajectory in trajectories:
            observes = trajectory['observes']
            values = val_func.predict(observes)
            trajectory['values'] = values

    def _add_disc_sum_rew(self, trajectories, gamma):
        """ Adds discounted sum of rewards to all time steps of all trajectories

        Args:
            trajectories: as returned by run_policy()
            gamma: discount

        Returns:
            None (mutates trajectories dictionary to add 'disc_sum_rew')
        """
        for trajectory in trajectories:
            if gamma < 0.999:  # don't scale for gamma ~= 1
                rewards = trajectory['rewards'] * (1 - gamma)
            else:
                rewards = trajectory['rewards']
            disc_sum_rew = self._discount(rewards, gamma)
            trajectory['disc_sum_rew'] = disc_sum_rew

    def _add_gae(self, trajectories, gamma, lam):
        """ Add generalized advantage estimator.
        https://arxiv.org/pdf/1506.02438.pdf

        Args:
            trajectories: as returned by run_policy(), must include 'values'
                key from add_value().
            gamma: reward discount
            lam: lambda (see paper).
                lam=0 : use TD residuals
                lam=1 : A =  Sum Discounted Rewards - V_hat(s)

        Returns:
            None (mutates trajectories dictionary to add 'advantages')
        """
        for trajectory in trajectories:
            if gamma < 0.999:  # don't scale for gamma ~= 1
                rewards = trajectory['rewards'] * (1 - gamma)
            else:
                rewards = trajectory['rewards']
            values = trajectory['values']
            # temporal differences
            tds = rewards - values + np.append(values[1:] * gamma, 0)
            advantages = self._discount(tds, gamma * lam)
            trajectory['advantages'] = advantages

    def _build_train_set(self, trajectories):
        """

        Args:
            trajectories: trajectories after processing by add_disc_sum_rew(),
                add_value(), and add_gae()

        Returns: 4-tuple of NumPy arrays
            observes: shape = (N, obs_dim)
            actions: shape = (N, act_dim)
            advantages: shape = (N,)
            disc_sum_rew: shape = (N,)
        """
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate(
            [t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        # normalize advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-6)

        return observes, actions, advantages, disc_sum_rew

    def _log_batch_stats(self, observes, actions, advantages, disc_sum_rew,
                         logger, episode):
        """ Log various batch statistics """
        logger.log({
            '_mean_obs': np.mean(observes),
            '_min_obs': np.min(observes),
            '_max_obs': np.max(observes),
            '_std_obs': np.mean(np.var(observes, axis=0)),
            '_mean_act': np.mean(actions),
            '_min_act': np.min(actions),
            '_max_act': np.max(actions),
            '_std_act': np.mean(np.var(actions, axis=0)),
            '_mean_adv': np.mean(advantages),
            '_min_adv': np.min(advantages),
            '_max_adv': np.max(advantages),
            '_std_adv': np.var(advantages),
            '_mean_discrew': np.mean(disc_sum_rew),
            '_min_discrew': np.min(disc_sum_rew),
            '_max_discrew': np.max(disc_sum_rew),
            '_std_discrew': np.var(disc_sum_rew),
            '_Episode': episode
        })
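
A hypothetical driver loop for the Policy wrapper above, to show how act(), update(), and the internal batching fit together; the environment name, episode budget, and constructor arguments here are illustrative assumptions, not from the original repository.

import gym
import numpy as np

env = gym.make('Hopper-v1')                    # illustrative environment
obs_dim = env.observation_space.shape[0] + 1   # +1 for the time-step feature
act_dim = env.action_space.shape[0]

agent = Policy(name='agent0', obs_dim=obs_dim, act_dim=act_dim,
               n_ways=1, batch_size=20, log_path='log-files')

for _ in range(100):                           # illustrative episode budget
    obs = env.reset()
    done, step = False, 0.0
    unscaled, acts, rews = [], [], []
    while not done:
        o = np.append(obs.astype(np.float32).reshape((1, -1)), [[step]], axis=1)
        unscaled.append(o)
        a = agent.act(o)
        acts.append(a)
        obs, r, done, _ = env.step(np.squeeze(a, axis=0))
        rews.append(r)
        step += 1e-3
    # update() buffers the episode and trains once batch_size episodes have accumulated
    agent.update(np.concatenate(unscaled), np.concatenate(acts),
                 np.array(rews, dtype=np.float64))

agent.close_session()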
Example #17
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, animate, evaluate, load_ckpt):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)

    # Observations here are the previously applied torques + current angles of joints
    obs_dim = 8 + 8 + 5

    # Actions are 1 torque value per oscillator
    act_dim = 8

    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)


    if evaluate:
        print("Evaluating: ")
        eval_agent(env, policy, logger, obs_dim, act_dim, 15)
        exit()

    if load_ckpt:
        print("Loading last ckpt: ")
        policy.restore_weights()

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, 5, animate)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, batch_size, animate)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.save_weights()
    policy.close_sess()
    val_func.save_weights()
    print("Saved policy and VF weights.")
    val_func.close_sess()
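
A note on the scaler: several of these loops first run a few episodes of the untrained policy purely to initialize the scaler, i.e. to gather enough observations for a running mean/variance estimate before normalization is applied. The Scaler class itself is not shown in these listings; a minimal sketch of that kind of running tracker (the class name, method names and the 0.1 variance floor are illustrative assumptions, not the original API) could be:

import numpy as np

class RunningScaler(object):
    ''' Hedged sketch of a running mean/variance tracker used to normalize observations. '''

    def __init__(self, obs_dim):
        self.mean = np.zeros(obs_dim)
        self.var = np.zeros(obs_dim)
        self.count = 0

    def update(self, x):
        ''' x: (n, obs_dim) batch of observations; merge batch stats into running stats. '''
        n = x.shape[0]
        batch_mean = x.mean(axis=0)
        batch_var = x.var(axis=0)
        new_count = self.count + n
        delta = batch_mean - self.mean
        # parallel (Chan et al.) combination of running and batch variance
        self.var = (self.count * self.var + n * batch_var +
                    delta ** 2 * self.count * n / new_count) / new_count
        self.mean = self.mean + delta * n / new_count
        self.count = new_count

    def get(self):
        ''' Return (scale, offset) so observations are normalized as (obs - offset) * scale. '''
        return 1.0 / (np.sqrt(self.var) + 0.1), self.mean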
Ejemplo n.º 18
0
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]
    # sess = tf.Session()
    policy = Policy(obs_dim, act_dim)
    val_func = NNValueFunction(obs_dim)
    # sess.run(tf.compat.v1.initializers.global_variables())

    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim)

    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories, val_func, gamma, lam)
        policy.update(observes, actions, advantages, logger)
        val_func.fit(observes, disc_sum_rew, logger)
        logger.log({
            '_Episode': episode,
        })
        logger.write(display=True)
Ejemplo n.º 19
0
def train_models(env_name, num_episodes, 
        gamma, lam, kl_targ, 
        coef, use_lr_adjust, 
        ada_kl_penalty, seed, 
        epochs, phi_epochs,
        max_timesteps, reg_scale,
        phi_lr, phi_hs,
        policy_size, 
        phi_obj, load_model, type): 

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed) 
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, 
            kl_targ,epochs, 
            phi_epochs, 
            policy_size=policy_size,
            phi_hidden_sizes=phi_hs,
            reg_scale=reg_scale,
            lr_phi=phi_lr,
            phi_obj=phi_obj,
            type=type)

    
    run_policy(env, policy, 
            scaler, num_episodes, 
            max_timesteps=max_timesteps, mode=load_model) # run a few to init scaler 
    
    episode = 0
    for i in range(2000):
        print("sampling and training at %s iteration\n"%(i))
        trajectories, traj_len_list = run_policy(env, policy, scaler, 
                            num_episodes, max_timesteps=max_timesteps, mode=load_model)
    
        num_traj = len(trajectories)
    
        episode += len(trajectories)
        add_value(trajectories, val_func)  
        add_disc_sum_rew(trajectories, gamma)  
        add_gae(trajectories, gamma, lam) 
    
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        
        policy.update(load_model, observes, actions, advantages,
                use_lr_adjust, ada_kl_penalty, c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew) 

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    refine_scaler = False
    if refine_scaler:
        run_policy(env, policy, 
                scaler, num_episodes, 
                max_timesteps=max_timesteps, mode=load_model) # run a few to refine scaler 
    with open('models/scaler/scaler.pkl', 'wb') as output:
        pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
    logger.log("saved model")
Ejemplo n.º 20
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, act_dim, obs_dim, final_pol_test,
         **kwargs):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env = init_env(env_name, **kwargs)
    # add 1 to obs dimension for time step feature (see run_episode())
    obs_dim += 1
    tz = timezone('America/Montreal')  # Montreal Timezone
    dt = datetime.now(tz)  # Create unique directories
    now = dt.strftime('%Y-%m-%d %H_%M_%S')
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
        dir = './log-files/' + env_name + '/' + now + '/'
    while episode < num_episodes:
        trajectories, tot_stuck = run_policy(env,
                                             policy,
                                             scaler,
                                             logger,
                                             episodes=batch_size)
        episode += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculated discounted sum of Rs
        add_disc_sum_rew(trajectories, gamma, scaler.mean_rew,
                         np.sqrt(scaler.var_rew))
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew, unscaled_observes = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            rewards = plt.plot(x1, rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("log-learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            mean_rewards = plt.plot(x2, mean_rew_graph)
            plt.title('Standard PPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        print('running simulations')
        tr, tot_stuck = run_policy(env,
                                   policy,
                                   scaler,
                                   logger,
                                   episodes=final_pol_test)
        print('done')
        sum_rewww = [t['rewards'].sum() for t in tr]
        sum_rewww += [tot_stuck]
        print('total stucks', sum_rewww[-1])
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('Standard PPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("standard_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
        logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Ejemplo n.º 21
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    # killer = GracefulKiller()
    if mpi_util.nworkers > 1:
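        # e.g. batch_size=20 with 8 workers -> 20 // 8 + 1 = 3 episodes per worker (24 episodes in total)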
        batch_size = batch_size // mpi_util.nworkers if batch_size % mpi_util.nworkers == 0 else batch_size // mpi_util.nworkers + 1  # spread the desired batch_size across processes
    env, obs_dim, act_dim = init_gym(env_name)
    mpi_util.set_global_seeds(111 + mpi_util.rank)
    env.seed(111 + mpi_util.rank)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    if mpi_util.rank == 0:
        env = wrappers.Monitor(env,
                               aigym_path,
                               force=True,
                               write_upon_reset=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, valfunc_hid_list)
    policy = Policy(obs_dim, act_dim, kl_targ, policy_hid_list)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        mpi_util.timeit(
            '--------------------------'
        )  # let's time everything so we can see where the work is being done
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        mpi_util.timeit('run_policy')
        # episode += len(trajectories)
        episode += mpi_util.all_sum(len(trajectories))
        mpi_util.timeit('mpi_util.all_sum')
        add_value(trajectories, val_func)  # add estimated values to episodes
        mpi_util.timeit('add_value')
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        mpi_util.timeit('add_disc_sum_rew')
        add_gae(trajectories, gamma, lam)  # calculate advantage
        mpi_util.timeit('add_gae')
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        mpi_util.timeit('build_train_set')
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        mpi_util.timeit('log_batch_stats')
        if mpi_util.rank == 0:
            policy.update(observes, actions, advantages,
                          logger)  # update policy
            mpi_util.timeit('policy.update')
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function
            mpi_util.timeit('val_func.fit')
        mpi_util.rank0_bcast_wts(
            val_func.sess, val_func.g, 'val'
        )  # doubt if value network is used during rollouts but it only takes a few milliseconds anyhow
        mpi_util.timeit('mpi_util.rank0_bcast_wts(val_func')
        mpi_util.rank0_bcast_wts(policy.sess, policy.g, 'policy')
        mpi_util.timeit('mpi_util.rank0_bcast_wts(policy')
        if mpi_util.rank == 0:
            logger.write(
                display=True)  # write logger results to file and stdout
        # if killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Ejemplo n.º 22
0
class Central_agent:
    def __init__(self):
        with tf.name_scope("central_agent"):
            self.val_func = NNValueFunction(obs_dim, hid1_mult)
            self.policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                                 policy_logvar)
            self.num_tuple = 0

    def update_parameter_server(self, episode, trajectories, name):
        self.num_tuple += len(trajectories)
        if len(trajectories) < batch_size:
            return

        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = self.build_train_set(
            trajectories)
        # add various stats to training log:
        self.log_batch_stats(observes, actions, advantages, disc_sum_rew,
                             logger, episode)
        self.policy.update(observes, actions, advantages,
                           logger)  # update policy
        self.val_func.fit(observes, disc_sum_rew,
                          logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        print([
            'thread_name: ' + name + ', episode: ' + str(episode) +
            ', tuples: ' + str(self.num_tuple)
        ])
        if ((episode %
             (batch_size * 3) == 0)):  # & (name == "local_thread3")):
            #print(['stop'])
            self.policy.save(episode, filename1)
            self.val_func.save(episode, filename2)

    def build_train_set(self, trajectories):
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate(
            [t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-6)
        return observes, actions, advantages, disc_sum_rew

    def log_batch_stats(self, observes, actions, advantages, disc_sum_rew,
                        logger, episode):
        logger.log({
            '_mean_obs': np.mean(observes),
            '_min_obs': np.min(observes),
            '_max_obs': np.max(observes),
            '_std_obs': np.mean(np.var(observes, axis=0)),
            '_mean_act': np.mean(actions),
            '_min_act': np.min(actions),
            '_max_act': np.max(actions),
            '_std_act': np.mean(np.var(actions, axis=0)),
            '_mean_adv': np.mean(advantages),
            '_min_adv': np.min(advantages),
            '_max_adv': np.max(advantages),
            '_std_adv': np.var(advantages),
            '_mean_discrew': np.mean(disc_sum_rew),
            '_min_discrew': np.min(disc_sum_rew),
            '_max_discrew': np.max(disc_sum_rew),
            '_std_discrew': np.var(disc_sum_rew),
            '_Episode': episode
        })
Ejemplo n.º 23
0
def main(env_name, num_iterations, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, coef, use_lr_adjust, ada_kl_penalty, seed, epochs,
         phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
         phi_obj):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_iterations: maximum number of iterations to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        coef: coefficient of Stein control variate
        use_lr_adjust: whether adjust lr based on kl
        ada_kl_penalty: whether adjust kl penalty
        max_timesteps: maximum time steps per trajectory
        reg_scale: regularization coefficient 
        policy_size: policy network size
        phi_obj: FitQ or MinVar
    """

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
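    # override the episode length cap set by gym's TimeLimit wrapper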
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)

    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    hid1_mult,
                    policy_logvar,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    c_ph=coef,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env,
               policy,
               scaler,
               batch_size=1000,
               max_timesteps=max_timesteps)

    for _ in range(num_iterations):
        logger.log("\n#Training Iter %d" % (_))
        logger.log("Draw Samples..")

        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  batch_size=batch_size,
                                  max_timesteps=max_timesteps)

        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage

        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew)

        logger.log("Starting Training...")
        policy.update(observes, actions, advantages, \
                use_lr_adjust, ada_kl_penalty)  # update policy

        val_func.fit(observes, disc_sum_rew)  # update value function

        logger.log('--------------------------------\n')

    policy.close_sess()
    val_func.close_sess()
Ejemplo n.º 24
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, risk_targ):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now_utc = datetime.utcnow()  # create unique directories
    now = str(now_utc.day) + '-' + now_utc.strftime('%b') + '-' + str(
        now_utc.year) + '_' + str(
            ((now_utc.hour - 4) % 24)) + '.' + str(now_utc.minute) + '.' + str(
                now_utc.second)  # adjust for Montreal Time Zone
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    risk_targ, 'CVaR', batch_size, 1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
    #big_li_rew_nodisc0 = np.array([])
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        #predicted_values_0 = [t['values'][0] for t in trajectories]
        add_disc_sum_rew(
            trajectories, gamma, scaler.mean_rew,
            np.sqrt(scaler.var_rew))  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        nodisc0 = -0.0001 * np.array(
            [t['rewards'].sum() for t in trajectories])  # scaled for gradients
        print(nodisc0)
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        print('scaled sum rewards', nodisc0)
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        lamb = policy.update(observes, actions, advantages,
                             logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        kl_terms = np.append(kl_terms, policy.check_kl)
        x1 = list(range(1, (len(kl_terms) + 1)))
        rewards = plt.plot(x1, kl_terms)
        plt.title('RAPPO')
        plt.xlabel("Episode")
        plt.ylabel("KL Divergence")
        plt.savefig("KL_curve.png")
        plt.close()
        beta_terms = np.append(beta_terms, policy.beta)
        x2 = list(range(1, (len(beta_terms) + 1)))
        mean_rewards = plt.plot(x2, beta_terms)
        plt.title('RAPPO')
        plt.xlabel("Batch")
        plt.ylabel("Beta Lagrange Multiplier")
        plt.savefig("lagrange_beta_curve.png")
        plt.close()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            rewards = plt.plot(x1, rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            mean_rewards = plt.plot(x2, mean_rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        tr = run_policy(env, policy, scaler, logger, episodes=1000)
        sum_rewww = [t['rewards'].sum() for t in tr]
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('RAPPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("RA_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
        logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Ejemplo n.º 25
0
def add_value(trajectories, val_func):
    '''
    Adds estimated value to all time steps of all trajectories

    Args:
        trajectories: as returned by run_policy()
        val_func: object with predict() method, takes observations and returns predicted state value

    Returns:
        None (mutates trajectories dictionary to add 'values')
    '''
    for trajectory in trajectories:
        observes = trajectory['observes']
        values = val_func.predict(observes)
        trajectory['values'] = values

def add_gae(trajectories, gamma, lam):
    '''
    Add generalized advantage estimator.
    https://arxiv.org/pdf/1506.02438.pdf

    Args:
        trajectories: as returned by run_policy(), must include 'values' key from add_value().
        gamma: reward discount
        lam: lambda (see paper).
            lam=0 : use TD residuals
            lam=1 : A = Sum Discounted Rewards - V_hat(s)

    Returns:
        None (mutates trajectories dictionary to add 'advantages')
    '''
    for trajectory in trajectories:
        if gamma < 0.999:  # don't scale for gamma ~= 1
            rewards = trajectory['rewards'] * (1 - gamma)
        else:
            rewards = trajectory['rewards']
        values = trajectory['values']
        # temporal differences
        tds = rewards - values + np.append(values[1:] * gamma, 0)
        advantages = discount(tds, gamma * lam)
        trajectory['advantages'] = advantages
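
The add_gae() above relies on a discount() helper that is not included in this listing. A minimal sketch, assuming it computes the reverse discounted cumulative sum y[t] = x[t] + gamma * y[t+1] (an assumption, not the original implementation), could be:

import numpy as np

def discount(x, gamma):
    ''' Reverse discounted cumulative sum of a 1-D array (hedged sketch of the missing helper). '''
    y = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y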

def build_train_set(trajectories):
    '''
    Args:
        trajectories: trajectories after processing by add_disc_sum_rew(), add_value() and add_gae()

    Returns: 4-tuple of NumPy arrays
        observes: shape = (N, obs_dim)
        actions: shape = (N, act_dim)
        advantages: shape = (N,)
        disc_sum_rew: shape = (N,)
    '''
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    # normalize advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

    return observes, actions, advantages, disc_sum_rew

def log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode):
    """ Log various batch statistics """
    logger.log({'_mean_obs': np.mean(observes),
                '_min_obs': np.min(observes),
                '_max_obs': np.max(observes),
                '_std_obs': np.mean(np.var(observes, axis=0)),
                '_mean_act': np.mean(actions),
                '_min_act': np.min(actions),
                '_max_act': np.max(actions),
                '_std_act': np.mean(np.var(actions, axis=0)),
                '_mean_adv': np.mean(advantages),
                '_min_adv': np.min(advantages),
                '_max_adv': np.max(advantages),
                '_std_adv': np.var(advantages),
                '_mean_discrew': np.mean(disc_sum_rew),
                '_min_discrew': np.min(disc_sum_rew),
                '_max_discrew': np.max(disc_sum_rew),
                '_std_discrew': np.var(disc_sum_rew),
                '_Episode': episode
                })

def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    '''
    Main training loop

    Args:
        env_name: Robot model name
        num_episodes: maximum number of episodes to run (int)
        gamma: reward discount factor (float)
        lam: lambda for Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    '''
    env, obs_dim, act_dim = init_env(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(":", "_")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)

    # TODO: add the sampling part once everything works

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, numEpisodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
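
add_disc_sum_rew() is called by every training loop in these examples but never shown. A minimal sketch, assuming it stores each trajectory's discounted return under a 'disc_sum_rew' key and reuses the hypothetical discount() helper sketched earlier, might be:

def add_disc_sum_rew(trajectories, gamma):
    ''' Hedged sketch: add the discounted sum of rewards to each trajectory (mutates in place). '''
    for trajectory in trajectories:
        if gamma < 0.999:  # same reward-scaling convention as add_gae() above
            rewards = trajectory['rewards'] * (1 - gamma)
        else:
            rewards = trajectory['rewards']
        trajectory['disc_sum_rew'] = discount(rewards, gamma)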
Ejemplo n.º 26
0
def main():
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """

    env_name = 'HumanoidasimoMRD4_2-v1'
    #env_name='Humanoid-v1'
    num_episodes = 5000000
    gamma = 0.995
    lam = 0.98
    kl_targ = 0.003
    batch_size = 32
    hid1_mult = 10
    policy_logvar = -1.0

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join(
        '/home/initial/eclipse-workspace4/test/trpo-master/src/result',
        env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult, filename2)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    hid1_mult,
                    policy_logvar,
                    filename=filename1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if ((episode %
             (batch_size * 3) == 0)):  # & (name == "local_thread3")):
            #print(['stop'])
            policy.save(episode, filename1)
            val_func.save(episode, filename2)
            #loger.flush()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Ejemplo n.º 27
0
def main(env_name, max_time_steps, time_steps_batch, time_steps_mini_batch, gamma, lamda, kl_targ, clipping_range, pol_loss_type, init_pol_logvar, animate,\
        save_video, save_rate, num_episodes_sim, task_params, task_name, dims_core_hid, dims_head_hid, act_func_name,\
        time_step_to_load, now_to_load):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        max_time_steps: maximum number of time steps to run
        gamma: reward discount factor (float)
        lamda: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        clipping_range: max value to clip the policy gradient ratio
        pol_loss_type: string determining which type of loss to use for the Policy Network
        time_steps_batch: number of time steps per policy training batch
        init_pol_logvar: natural log of initial policy variance
        save_video: Boolean determining if videos of the agent will be saved
        save_rate: Int determining how often to save videos for
        num_episodes_sim: Number of episodes to simulate/save videos for
        task_params: list of parameters to modify each environment for a different task
        task_name: name user assigns to the task being used to modify the environment
    """

    # ****************  Environment Initialization and Paths  ***************
    task_params_str = ''.join(str(e) + ', ' for e in task_params)
    num_tasks = len(task_params)
    envs = [None] * num_tasks
    scalers = [None] * num_tasks
    loggers = [None] * num_tasks

    print("\n\n------ PATHS: ------")
    start_time = datetime.now()
    if time_step_to_load == None:
        now = start_time.strftime(
            "%b-%d_%H:%M:%S"
        )  # If NOT loading from Checkpoint -> used to  create unique directories
    else:
        assert now_to_load != None,\
            "\n\nWARNING: Date time to load ({}) was not provided. Please provide a valid date time of an experiment".format(now_to_load)
        now = now_to_load
    logs_path = os.path.join('log-files', env_name, task_name, task_params_str,
                             now)

    for task in range(num_tasks):
        # Create task specific environment
        envs[task], obs_dim, act_dim = init_gym(env_name,
                                                task_param=task_params[task])
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

        # Create task specific Paths and logger object
        loggers[task] = Logger(logname= [env_name, task_name, task_params_str], now=now, \
                               logname_file= "_{}_{}".format(task_name, task_params[task]))

        if time_step_to_load == None:  # If NOT loading from Checkpoint
            scalers[task] = Scaler(obs_dim)

            # Auxiliary saver (because the logger sometimes fails or takes too much time)
            with open(
                    logs_path +
                    '/aux_{}_{}.txt'.format(task_name, task_params[task]),
                    'w') as f:
                f.write("_TimeStep" + "  " + "_MeanReward")

    aigym_path = os.path.join('./videos', env_name, task_name, task_params_str,
                              now)  # videos folders
    agent_path = os.path.join('agents', env_name, task_name, task_params_str,
                              now)  # agent / policy folders
    if time_step_to_load == None:  # If NOT loading from Checkpoint
        os.makedirs(agent_path)
        with open(agent_path + '/commandline_args.txt', 'w') as f:
            f.write(' '.join(sys.argv[1:]))  # save commandline command
        with open(logs_path + '/commandline_args.txt', 'w') as f:
            f.write(' '.join(sys.argv[1:]))  # save commandline command

    print("\nPath for Saved Videos : {}".format(aigym_path))
    print("Path for Saved Agents: {}\n".format(agent_path))

    # ****************  Initialize Policy, Value Networks and Scaler  ***************
    print("\n\n------ NEURAL NETWORKS: ------")
    dims_core_hid.insert(
        0, obs_dim
    )  # Modify dims list to have the size of the layer 'n-1' at position '0'
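    # e.g. dims_core_hid=[64, 64] with obs_dim=27 becomes [27, 64, 64] (illustrative sizes)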
    dims_head_hid.insert(0, dims_head_hid[-1])

    val_func = NNValueFunction(obs_dim, dims_core_hid, dims_head_hid,
                               num_tasks, time_steps_mini_batch)
    policy = Policy(obs_dim,
                    act_dim,
                    dims_core_hid,
                    dims_head_hid,
                    num_tasks,
                    time_steps_mini_batch,
                    pol_loss_type=pol_loss_type)

    # Load from Checkpoint:
    # Validate intended time step to load OR get last time step number if no target time step was provided
    if time_step_to_load != None:
        load_agent_path = agent_path  # agent / policy folders
        saved_ep_list = [
            file.split(".")[0].split("_")[-1]
            for file in os.listdir(load_agent_path) if "policy" in file
        ]

        if time_step_to_load == -1:  # Get last saved time step
            time_step_to_load = sorted(
                [int(ep_string) for ep_string in saved_ep_list])[-1]

        else:  # Validate if time_step_to_load was indeed saved
            assert str(time_step_to_load) in saved_ep_list,\
            "\n\nWARNING: Time Step you want to load ({}) was not stored during trainning".format(time_step_to_load)

        # Load Policy Network's Ops and Variables & Load Scaler Object
        policy.tf_saver.restore(
            policy.sess, "{}/policy_ep_{}".format(load_agent_path,
                                                  time_step_to_load))
        val_func.tf_saver.restore(
            val_func.sess, "{}/val_func_ep_{}".format(load_agent_path,
                                                      time_step_to_load))
        scalers = pickle.load(
            open(
                "{}/scalers_ep_{}.p".format(load_agent_path,
                                            time_step_to_load), 'rb'))
        print("\n\n ---- CHECKPOINT LOAD:  Time Step Loaded **{}**".format(
            time_step_to_load))

        # Delete extra epochs that where logged to the auxiliary logs
        for task in range(num_tasks):
            aux_log_path = logs_path + '/aux_{}_{}.txt'.format(
                task_name, task_params[task])
            aux_log = pd.read_table(aux_log_path, delim_whitespace=True)
            idx_to_cut = aux_log.index[aux_log["_TimeStep"] ==
                                       time_step_to_load].tolist()[0]
            aux_log[0:idx_to_cut +
                    1].to_csv(aux_log_path,
                              header=True,
                              index=False,
                              sep=' ',
                              mode='w')  # overwrite trimmed aux_log

    # If NOT loading from Checkpoint: run some time steps to initialize scalers and create Tensor board dirs
    elif time_step_to_load == None:
        for task in range(num_tasks):
            run_policy(envs[task],
                       policy,
                       scalers[task],
                       loggers[task],
                       time_steps_batch=int(time_steps_batch / 3),
                       task=task)

        # Tensor Board writer
        os.makedirs(agent_path + '/tensor_board/policy')
        os.makedirs(agent_path + '/tensor_board/valFunc')

    tb_pol_writer = tf.summary.FileWriter(agent_path + '/tensor_board/policy',
                                          graph=policy.g)
    tb_val_writer = tf.summary.FileWriter(agent_path + '/tensor_board/valFunc',
                                          graph=val_func.g)

    # ****************  Start Training  ***************
    print("\n\n------ TRAINNING: ------")
    animate = True if animate == "True" else False
    save_video = True if save_video == "True" else False
    saver_offset = save_rate
    killer = GracefulKiller()

    if time_step_to_load == None: time_step = 0
    else: time_step = time_step_to_load

    # Time steps are counted across all tasks, i.e. N time steps means each task has been run N times
    while time_step < max_time_steps and not killer.kill_now:

        # ****************  Obtain data (train set)  ***************
        observes_all = [None] * num_tasks
        actions_all = [None] * num_tasks
        advantages_all = [None] * num_tasks
        disc_sum_rew_all = [None] * num_tasks

        time_step += time_steps_batch
        for task in range(num_tasks):

            # Obtain 'time_steps_batch' trajectories and add additional intermediate calculations
            trajectories = run_policy(envs[task],
                                      policy,
                                      scalers[task],
                                      loggers[task],
                                      time_steps_batch=time_steps_batch,
                                      task=task,
                                      animate=animate)

            add_value(trajectories, val_func,
                      task)  # add estimated values to trajectories
            add_disc_sum_rew(trajectories,
                             gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lamda)  # calculate advantage

            # Concatenate all time steps into single NumPy arrays
            observes_all[task], actions_all[task], advantages_all[
                task], disc_sum_rew_all[task] = build_train_set(trajectories)

            # print("Observes Shape: {}".format(observes_all[task].shape))
            # print("Actions Shape: {}\n\n".format(actions_all[task].shape))
            # print("Advantage Shape: {}\n\n".format(advantages_all[task].shape))

            # Logging Stats
            log_batch_stats(observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task], \
                            loggers[task], time_step)

        # ****************  Update Policy and Value Networks  ***************
        # print ("*************************************")
        for task in range(num_tasks):
            pol_summary = policy.update(task, observes_all[task],
                                        actions_all[task],
                                        advantages_all[task],
                                        loggers[task])  # update policy
            val_summary = val_func.fit(task, observes_all[task],
                                       disc_sum_rew_all[task],
                                       loggers[task])  # update value function
            # Auxiliary saver (because the logger sometimes fails or takes too much time)
            with open(
                    logs_path +
                    '/aux_{}_{}.txt'.format(task_name, task_params[task]),
                    'a') as f:
                f.write("\n" + str(loggers[task].log_entry['_TimeStep']) +
                        "  " + str(loggers[task].log_entry['_MeanReward']))
            loggers[task].write(
                display=False)  # write logger results to file and stdout

            tb_pol_writer.add_summary(pol_summary, global_step=time_step)
            tb_val_writer.add_summary(val_summary, global_step=time_step)

        # ****************  Storing NN and Videos  ***************
        # Store Policy, Value Network and Scaler: every 'save_rate'  or in first/last time steps
        if time_step >= saver_offset or time_step >= max_time_steps or time_step <= time_steps_batch * 1.5 or killer.kill_now:
            # TODO: Make saving agent/video a method so that it can be called in killer.kill_now
            saver_offset += save_rate
            policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(
                agent_path, time_step))  # Save Policy Network
            val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(
                agent_path, time_step))  # Save Value Network
            pickle.dump(
                scalers,
                open("{}/scalers_ep_{}.p".format(agent_path, time_step), 'wb'))
            print("---- Saved Agent at Time Step {} ----".format(time_step))

            # Save video of current agent/policy
            if save_video:
                print(
                    "---- Saving Video at Time Step {} ----".format(time_step))
                for task in range(num_tasks):
                    _ = sim_agent(envs[task],
                                  policy,
                                  task,
                                  scalers[task],
                                  num_episodes_sim,
                                  save_video=True,
                                  out_dir=aigym_path +
                                  "/vid_ts_{}/{}_{}".format(
                                      time_step, task_name, task_params[task]))
                    envs[task].close()  # closes window opened by monitor wrapper
                    envs[task], _, _ = init_gym(
                        env_name, task_param=task_params[task]
                    )  # Recreate env as it was killed
            print("\n\n")

            # If Ctrl + C is pressed, ask user if training shall be terminated
            if killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                killer.kill_now = False

    # ****************  Terminate Variables  **************
    for task in range(num_tasks):
        envs[task].close()
        loggers[task].close()
    policy.close_sess()
    val_func.close_sess()

    # Save elapsed time
    end_time = datetime.now()
    elapsed_time = end_time - start_time
    delta_time = divmod(elapsed_time.days * 86400 + elapsed_time.seconds, 60)
    delta_str = "Elapsed Time: {} min {} seconds".format(
        delta_time[0], delta_time[1])
    # save elapsed time, 'a' to append not overwrite
    with open(agent_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
    with open(logs_path + '/commandline_args.txt', 'a') as f:
        f.write('\n\n' + delta_str)
Ejemplo n.º 28
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, save):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    env_id = env_name + id_generator()
    logger = Logger(logname=env_id, now=now)
    aigym_path = os.path.join('/tmp', env_id)
    env = wrappers.Monitor(env,
                           aigym_path,
                           force=True,
                           video_callable=lambda episode_id: False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0

    if env_name == 'Swimmer-v1':
        score_window = 100
        solution_score = 360
    elif env_name == 'HalfCheetah-v1':
        score_window = 100
        solution_score = 4800
    else:
        assert False

    # assert score_window % batch_size == 0
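    # the deque holds one mean reward per batch, spanning roughly the last `score_window` episodes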
    rewards = collections.deque(maxlen=int(np.rint(score_window / batch_size)))
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        mean_reward = logger.log_entry['_MeanReward']
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

        rewards.append(mean_reward)
        '''
        if np.mean(rewards) >= solution_score:
            episode = episode - score_window
            break
        '''

    logger.close()
    policy.close_sess()
    val_func.close_sess()

    # return episode
    return -np.mean(rewards)
Ejemplo n.º 29
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, **kwargs):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    memory = deque([])
    memory_size = kwargs['memory_size']
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    target_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                           policy_logvar)  # kl_targ = 0?
    explore_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                            policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, target_policy, scaler, logger, episodes=5, fix_drct_dist=0)
    run_policy(env,
               explore_policy,
               scaler,
               logger,
               episodes=5,
               fix_drct_dist=0)
    episode = 0
    fix_drct_dist_range = (0.3, 0)

    while episode < num_episodes:
        # save model
        if episode % 200 == 0:
            save_path = target_policy.saver.save(
                target_policy.sess,
                "/home/csc63182/testspace/models/halfcheetah-trpo/model-%d.ckpt"
                % (episode))

        # run a few episodes
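        # linearly anneal fix_drct_dist from fix_drct_dist_range[0] (0.3) down to fix_drct_dist_range[1] (0) over training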
        fix_drct_dist = (
            (episode * fix_drct_dist_range[1]) +
            (num_episodes - episode) * fix_drct_dist_range[0]) / num_episodes
        target_trajectories = run_policy(env,
                                         target_policy,
                                         scaler,
                                         logger,
                                         episodes=batch_size,
                                         fix_drct_dist=0)
        explore_trajectories = run_policy(env,
                                          explore_policy,
                                          scaler,
                                          logger,
                                          episodes=batch_size,
                                          fix_drct_dist=fix_drct_dist)

        # Add to memory
        n_explore = max(0, int(batch_size * (1 - episode / num_episodes)) - 1)
        trajectories = target_trajectories + explore_trajectories[:n_explore]
        episode += batch_size
        memory += trajectories
        while len(memory) > memory_size:
            memory.popleft()

        # train explore network
        add_value(explore_trajectories,
                  val_func)  # add estimated values to episodes
        add_disc_sum_rew(explore_trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(explore_trajectories, gamma, lam)  # calculate advantage
        observes, actions, advantages, disc_sum_rew = build_train_set(
            explore_trajectories)
        explore_policy.update(observes, actions, advantages,
                              logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function

        # train target network
        # re-sample trajectories
        trajectories = sample(memory, batch_size)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        target_policy.update(observes, actions, advantages,
                             logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
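    # NOTE: rewards_record below is not defined in this snippet; it is presumably a list of
    # per-batch mean rewards collected elsewhere in the original module.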
    with open('rewards_%s.txt' % kwargs['log_postfix'], 'w') as f:
        for reward in rewards_record:
            f.write('%f\n' % reward)
    plt.plot((np.arange(len(rewards_record)) + 1) * batch_size, rewards_record)
    plt.savefig('learning_curve_%s.png' % kwargs['log_postfix'])
    logger.close()
    explore_policy.close_sess()
    target_policy.close_sess()
    val_func.close_sess()
Ejemplo n.º 30
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    print('Start time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))


    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.now().strftime("%b-%d_%H:%M:%S")  # create unique directories (note: utcnow gives Greenwich time, so utcnow was changed to now)
    testname = now+'-'+TestNote
    logger = Logger(logname=env_name, now=testname)
    monitor_path = os.path.join('log-files', env_name, testname, 'monitor')
    env = wrappers.Monitor(env, monitor_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0

    print('Start time:\n')  # printed again after the scaler warm-up episodes
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function

        # save models
        if not episode % max(1, num_episodes // 10):  # save a checkpoint roughly every 10% of the episode budget
            policy_save_path = os.path.join('log-files', env_name, testname, 'checkpoint')
            policy.save_model(env_name + "-" + str(episode), policy_save_path)


        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False


    logger.close()
    policy.close_sess()
    val_func.close_sess()

    print('End time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
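As a reference for how gamma and lam enter the updates, here is a simplified stand-in for what add_disc_sum_rew and add_gae compute per episode: discounted returns as value-function targets and Generalized Advantage Estimates for the policy update. This is a generic sketch of the standard formulas, not necessarily the repository's exact implementation:

import numpy as np

def discounted_sum(x, discount):
    # Running discounted sum computed backwards: out[t] = x[t] + discount * out[t+1]
    out = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def gae(rewards, values, gamma, lam):
    # Generalized Advantage Estimation: discounted sum of TD residuals
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), decayed with gamma * lam.
    values_next = np.append(values[1:], 0.0)   # treat the post-terminal value as 0
    deltas = rewards + gamma * values_next - values
    return discounted_sum(deltas, gamma * lam)

rewards = np.array([1.0, 0.0, 2.0])
values = np.array([0.5, 0.4, 0.3])
print(discounted_sum(rewards, 0.99))     # disc_sum_rew: targets for the value function
print(gae(rewards, values, 0.99, 0.97))  # advantages: weights for the policy update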
Ejemplo n.º 31
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct, restore_path, animate,
         submit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        restore_path: saved checkpoint to restore policy, value function and scaler from
        animate: render a single episode with the current policy and exit
        submit: run the policy against the crowdAI grader and submit the result
    """
    # killer = GracefulKiller()

    env, obs_dim, act_dim = init_osim(animate)
    env.seed(111 + mpi_util.rank)
    mpi_util.set_global_seeds(111 + mpi_util.rank)

    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    if mpi_util.rank == 0:
        #aigym_path = os.path.join('/tmp', env_name, now)
        #env = wrappers.Monitor(env, aigym_path, force=True)
        logger = Logger(logname=env_name, now=now)

    episode = 0

    checkpoint = Checkpoint("saves", now)
    # restore from checkpoint?
    if restore_path:
        (policy, val_func, scaler, episode, obs_dim, act_dim,
         kl_targ) = checkpoint.restore(restore_path)
    else:
        policy = Policy(obs_dim, act_dim, kl_targ)
        val_func = NNValueFunction(obs_dim)
        scaler = Scaler(obs_dim)

        if mpi_util.rank == 0:
            # run a few episodes (on node 0) of untrained policy to initialize scaler:
            trajectories = run_policy(env, policy, scaler, episodes=5)

            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in trajectories])
            scaler.update(
                unscaled)  # update running statistics for scaling observations

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)

        if mpi_util.rank == 0:
            checkpoint.save(policy, val_func, scaler, episode)

    if animate:
        observes, actions, rewards, unscaled_obs = run_episode(env,
                                                               policy,
                                                               scaler,
                                                               animate=animate)
        exit(0)

    if submit:
        # Settings
        #remote_base = 'http://grader.crowdai.org:1729'
        remote_base = 'http://grader.crowdai.org:1730'
        token = 'a83412a94593cae3a491f3ee28ff44e1'

        client = Client(remote_base)

        # Create environment
        observation = client.env_create(token)
        step = 0.0
        observes, actions, rewards, unscaled_obs = [], [], [], []
        scale, offset = scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature

        # Run a single step
        #
        # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one
        while True:
            obs = np.array(observation).astype(np.float32).reshape((1, -1))
            print("OBSERVATION TYPE:", type(obs), obs.shape)
            print(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            unscaled_obs.append(obs)
            obs = (obs - offset) * scale  # center and scale observations
            observes.append(obs)

            action = policy.sample(obs).astype(np.float32).reshape((-1, 1))
            print("ACTION TYPE:", type(action), action.shape)
            print(action)
            actions.append(action)

            [observation, reward, done,
             info] = client.env_step(action.tolist())
            print("step:", step, "reward:", reward)

            if not isinstance(reward, float):
                reward = float(reward)  # np.asscalar() is removed in recent NumPy; float() converts NumPy scalars
            rewards.append(reward)
            step += 1e-3  # increment time step feature

            if done:
                print(
                    "================================== RESTARTING ================================="
                )
                observation = client.env_reset()
                step = 0.0
                observes, actions, rewards, unscaled_obs = [], [], [], []
                scale, offset = scaler.get()
                scale[-1] = 1.0  # don't scale time step feature
                offset[-1] = 0.0  # don't offset time step feature
                if not observation:
                    break

        client.submit()
        exit(0)

    ######

    worker_batch_size = int(batch_size / mpi_util.nworkers)  # HACK
    if (worker_batch_size * mpi_util.nworkers != batch_size):
        print("batch_size:", batch_size, " is not divisible by nworkers:",
              mpi_util.nworkers)
        exit(1)

    batch = 0
    while episode < num_episodes:
        if mpi_util.rank == 0 and batch > 0 and batch % 10 == 0:
            checkpoint.save(policy, val_func, scaler, episode)
        batch = batch + 1

        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  episodes=worker_batch_size)
        trajectories = mpi_util.gather_trajectories(trajectories)

        if mpi_util.rank == 0:
            # concatenate trajectories into one list
            trajectories = list(itertools.chain.from_iterable(trajectories))
            print("did a batch of ", len(trajectories), " trajectories")
            print([t['rewards'].sum() for t in trajectories])

            episode += len(trajectories)
            add_value(trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage

            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)

            # add various stats to training log:
            logger.log({
                '_MeanReward':
                np.mean([t['rewards'].sum() for t in trajectories]),
                'Steps':
                np.sum([t['observes'].shape[0] for t in trajectories])
            })
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode)

            policy.update(observes, actions, advantages,
                          logger)  # update policy
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function

            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in trajectories])
            scaler.update(
                unscaled)  # update running statistics for scaling observations

            logger.write(
                display=True)  # write logger results to file and stdout

        # if mpi_util.rank == 0 and killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)

    if mpi_util.rank == 0: logger.close()
    policy.close_sess()
    if mpi_util.rank == 0: val_func.close_sess()
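The submit loop above also documents the preprocessing the trained policy expects at inference time: append the time-step feature, then center and scale the observation with the running statistics while leaving the time feature untouched. A self-contained sketch of that step, with placeholder scale/offset arrays standing in for a fitted Scaler:

import numpy as np

def preprocess(raw_obs, step, scale, offset):
    # Append the time-step feature, then center and scale the observation row.
    obs = np.asarray(raw_obs, dtype=np.float32).reshape((1, -1))
    obs = np.append(obs, [[step]], axis=1)   # add time step feature
    return (obs - offset) * scale            # center and scale observations

obs_dim = 4                                  # raw observation size (illustrative)
scale = np.ones(obs_dim + 1)                 # placeholder for scaler.get()[0]
offset = np.zeros(obs_dim + 1)               # placeholder for scaler.get()[1]
scale[-1] = 1.0                              # don't scale the time step feature
offset[-1] = 0.0                             # don't offset the time step feature

step = 0.0
for raw in ([0.1, 0.2, 0.3, 0.4], [0.2, 0.1, 0.0, -0.1]):
    print(preprocess(raw, step, scale, offset))
    step += 1e-3                             # increment time step feature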