Example #1
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        #Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)
Example #2
def main(args):
    env = gym.make(args.env_name)
    device = torch.device(args.device)

    # 1. Set the seeds needed for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # 2. Create the actor, critic, EnvSampler and TRPO.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor = PolicyNetwork(state_size,
                          action_size,
                          hidden_sizes=args.hidden_sizes,
                          init_std=args.init_std)
    critic = ValueNetwork(state_size, hidden_sizes=args.hidden_sizes)
    env_sampler = EnvSampler(env, args.max_episode_step)
    trpo = TRPO(actor, critic, args.value_lr, args.value_steps_per_update,
                args.cg_steps, args.linesearch_steps, args.gamma, args.tau,
                args.damping, args.max_kl, device)

    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = actor.select_action(state)
        return action.detach().cpu().numpy()[0]

    total_step = 0
    for episode in range(1, args.episodes + 1):
        episode_reward, samples = env_sampler(get_action, args.batch_size)
        actor_loss, value_loss = trpo.update(*samples)
        yield episode * args.batch_size, episode_reward, actor_loss, value_loss
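The main() in Example #2 is written as a generator that yields per-update statistics rather than printing them. A minimal sketch of a driver loop that consumes it is shown below; the parse_args helper and the printed field names are illustrative assumptions, not part of the original example.

# Hypothetical driver for the generator-style main() above; `parse_args` is an
# assumed helper that builds the argparse namespace main() expects.
if __name__ == '__main__':
    args = parse_args()
    for total_samples, episode_reward, actor_loss, value_loss in main(args):
        print('samples={} reward={} actor_loss={} value_loss={}'.format(
            total_samples, episode_reward, actor_loss, value_loss))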
Example #3
    def __init__(self, args, sess):
        self.args = args
        self.sess = sess

        self.env = gym.make(self.args.env_name)
        self.args.max_path_length = self.env.spec.timestep_limit
        self.agent = TRPO(self.args, self.env, self.sess)
        self.saver = tf.train.Saver()
Example #4
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        # Construct the simulation environment (this replaces the simulator passed in above)
        self.simulator = gym.make('Pendulum-v0')

        #Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)
Example #5
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.local_brain = TRPO(**kwargs)

        '''
        Running statistics:
        normalize observations using a running mean and std maintained over the course of the entire experiment,
        holding the running statistics fixed within each batch.
        See p.12 in https://arxiv.org/pdf/1707.02286.pdf
        '''
        self.running_stats = RunningStats(self.local_brain.env.get_state_shape()[0])
        self.rew_scale = 0.0025
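The RunningStats class used here (and in Example #18 below) is not shown in these snippets. A minimal stand-in with the interface the trainer appears to rely on (multiple_push, mean(), standard_deviation()), based on Welford's online algorithm, might look like the sketch below; the class name and the epsilon floor are assumptions, not the original implementation.

import numpy as np

class RunningStatsSketch:
    ''' Illustrative online mean/std tracker (Welford's algorithm); not the original RunningStats. '''

    def __init__(self, dim):
        self.n = 0
        self.running_mean = np.zeros(dim)
        self.m2 = np.zeros(dim)

    def push(self, x):
        # Welford update of the running mean and sum of squared deviations.
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.running_mean
        self.running_mean += delta / self.n
        self.m2 += delta * (x - self.running_mean)

    def multiple_push(self, xs):
        for x in xs:
            self.push(x)

    def mean(self):
        return self.running_mean

    def standard_deviation(self):
        # Epsilon floor keeps the observation normalization well-defined early in training.
        var = self.m2 / max(self.n - 1, 1)
        return np.sqrt(np.maximum(var, 1e-8))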
Example #6
class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        #Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()
        while True:
            #Train the TRPO agent
            train_index += 1
            train_log = self.agent.train(train_index)
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]

            all_logs.append(train_log)

            print(train_log['Episode_Avg_Reward'])
            print(train_index)

            if total_steps > self.args.total_train_step:
                savemat(
                    'data1_' +
                    datetime.datetime.now().strftime("%y-%m-%d-%H-%M") +
                    '.mat', dict(data=all_logs, args=self.args))
                break
Example #7
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        # Construct the simulation environment (this replaces the simulator passed in above)
        self.simulator = gym.make('Pendulum-v0')
        self.simulator.unwrapped.max_torque = 15.
        self.simulator.unwrapped.max_speed = 60.
        self.simulator.unwrapped.action_space = spaces.Box(low=-self.simulator.unwrapped.max_torque, high=self.simulator.unwrapped.max_torque, shape=(1,))
        high = np.array([1., 1., self.simulator.unwrapped.max_speed])
        self.simulator.unwrapped.observation_space = spaces.Box(low=-high, high=high)

        
        #Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)
Example #8
class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        # Construct the simulation environment (this replaces the simulator passed in above)
        self.simulator = gym.make('Pendulum-v0')

        #Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()
        while True:
            #Train the TRPO agent
            train_index += 1
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]

            all_logs.append(train_log)

            #Simulate system w/ new parameters
            if train_index % 20 == 0:
                self.agent.sim()

            if total_steps > self.args.total_train_step:
                nn_weights = {
                    'policy_network': self.agent.get_value(),
                    'advantage_network': self.agent.gae.get_value()
                }
                savemat(
                    'data_' +
                    datetime.datetime.now().strftime("%y-%m-%d-%H-%M") +
                    '.mat', dict(data=all_logs, args=self.args))
                savemat(
                    'weights_' +
                    datetime.datetime.now().strftime("%y-%m-%d-%H-%M") +
                    '.mat', dict(policy_weights=nn_weights, args=self.args))
                break
Example #9
class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        # Construct the simulation environment (this replaces the simulator passed in above)
        self.simulator = gym.make('Pendulum-v0')
        self.simulator.unwrapped.max_torque = 15.
        self.simulator.unwrapped.max_speed = 60.
        self.simulator.unwrapped.action_space = spaces.Box(low=-self.simulator.unwrapped.max_torque, high=self.simulator.unwrapped.max_torque, shape=(1,))
        high = np.array([1., 1., self.simulator.unwrapped.max_speed])
        self.simulator.unwrapped.observation_space = spaces.Box(low=-high, high=high)

        
        #Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()
        while True:
            #Train the TRPO agent
            train_index += 1
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]

            all_logs.append(train_log)
            
            #Simulate system w/ new parameters
            if train_index%5 == 0:
                self.agent.sim()
                print(train_index)

            if total_steps > self.args.total_train_step:
                savemat('data4_' + datetime.datetime.now().strftime("%y-%m-%d-%H-%M") + '.mat',dict(data=all_logs, args=self.args))
                break
Example #10
def test(t_agent, trial_dir, visual_flag, token):
    assert trial_dir is not None and os.path.exists(trial_dir)

    # prepare trial environment
    pid = os.getpid()
    logger, _ = prepare_for_logging(str(pid), create_folder=False)

    # load config
    config_file = os.path.join(trial_dir, "config.yaml")
    if not os.path.exists(config_file):
        convert_legacy_config(trial_dir, t_agent)
    config = util.load_config(config_file)

    if "max_obstacles" not in config:
        config["max_obstacles"] = 3
    env = NIPS(visualize=visual_flag,
               max_obstacles=config["max_obstacles"],
               token=token)
    util.print_settings(logger, config, env)

    # instantiate an agent
    config["logger"] = logger
    config["log_dir"] = trial_dir
    config["model_dir"] = trial_dir
    if t_agent == "DDPG":
        from ddpg import DDPG
        agent = DDPG(env, config)
    elif t_agent == "TRPO":
        from trpo import TRPO
        agent = TRPO(env, config)
    else:
        raise ValueError("Unsupported agent type: {}".format(t_agent))
    agent.set_state(config)

    # test
    util.print_sec_header(logger, "Testing")
    rewards = agent.test(logging=env.remote_env)
    logger.info("avg_reward={}".format(np.mean(rewards)))
    env.close()
Example #11
env_args['fixed_length'] = True
env_args['trajectory_length'] = 5

if simulator_type == 'single-path':
    simulator = SinglePathSimulator(env_name,
                                    policy,
                                    n_trajectories,
                                    max_timesteps,
                                    state_filter=state_filter,
                                    **env_args)
elif simulator_type == 'vine':
    raise NotImplementedError

try:
    trpo_args = config['trpo_args']
except KeyError:
    trpo_args = {}

trpo = TRPO(policy,
            value_fun,
            simulator,
            model_name=model_name,
            continue_from_file=continue_from_file,
            **trpo_args)

print(f'Training policy {model_name} on {env_name} environment...\n')

trpo.train(config['n_episodes'])

print('\nTraining complete.\n')
Example #12
 def __init__(self,
              task,
              num_episodes=500,
              discount_factor=0.995,
              gae_lambda = 1.0,
              trpo_step_size=0.01,
              policy_network_hidden_sizes=(64, 64),
              policy_learn_std=True,
              policy_adaptive_std=False,
              cg_iters=10,
              cg_damping=1e-5,
              cg_backtrack_ratio=0.8,
              cg_max_backtracks=10,
              sampler_thread_num=8,
              sampler_max_samples=50000,
              sampler_max_path_length=1000,
              sampler_center_advantage=True):
     
     self.task = task
     self.discount = discount_factor
     self.gae_lambda = gae_lambda
     self.sampler_max_samples = sampler_max_samples
     self.sampler_max_path_length = sampler_max_path_length
     self.sampler_center_advantage = sampler_center_advantage
     self.subsample_rate = 0.8
     self.fitting_mode = 'linear'
     self.use_trpo = True
     
     self.num_episodes = num_episodes
     self.directory = 'log/{}/'.format(task)
     
     self.simulator = Simulator(task=task)
     input_shape = (None, self.simulator.obsevation_dim)
     output_size = self.simulator.action_dim
     
     if self.fitting_mode == 'linear':
         self.value_network = LinearFitting()
     elif self.fitting_mode == 'mlp':
         self.value_network = MLPFitting(input_shape, hidden_sizes=(32, 32))
     else:
         raise NotImplementedError
     
     if self.simulator.action_type == 'continuous':
         self.policy_network = GaussianMLPPolicy(input_shape=input_shape,
                                                 output_size=output_size,
                                                 hidden_sizes=policy_network_hidden_sizes,
                                                 learn_std=policy_learn_std,
                                                 adaptive_std=policy_adaptive_std,
                                                 std_hidden_sizes=policy_network_hidden_sizes)
     elif self.simulator.action_type == 'discrete':
         self.policy_network = CategoricalMLPPolicy(input_shape=input_shape,
                                                    output_size=output_size,
                                                    hidden_sizes=policy_network_hidden_sizes)
     
     self.optimizer = ConjugateOptimizer(cg_iters=cg_iters,
                                         reg_coeff=cg_damping,
                                         backtrack_ratio=cg_backtrack_ratio,
                                         max_backtracks=cg_max_backtracks)
     
     self.sampler = Sampler(self.simulator, self.policy_network)
     self.parallel_sampler = ParallelSampler(self.sampler, 
                                             thread_num=sampler_thread_num, 
                                             max_path_length=self.sampler_max_path_length,
                                             render=False)
     
     if self.use_trpo:
         self.trpo = TRPO(self.policy_network, self.optimizer, trpo_step_size)
     else:
         self.trpo = PPO(self.policy_network)
     
     # Additional summaries
     self.average_reward = tf.placeholder(dtype=tf.float32, shape=[])
     tf.summary.scalar("reward", self.average_reward, collections=['trainer'])
     self.summary_op = tf.summary.merge_all('trainer')
Example #13
        entropy = args.entropy_weight * entropy_calc(policies[step])

        q_value = q_values[step].gather(1, actions[step])
        critic_loss = ((retrace - q_value)**2 / 2).mean(0)

        truncated_rho = imp_wt.gather(1, actions[step]).clamp(max=1)

        # print(truncated_rho, critic_loss)

        retrace = truncated_rho * (retrace -
                                   q_value.detach()) + values[step].detach()

        loss += actor_loss + critic_loss - entropy

        if args.type == 'trpo':
            loss = TRPO(model, policies, average_policies, 1, loss,
                        policies[step] / average_policies[step])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if args.batch_size < len(replay_buffer) + 1:
        for _ in range(np.random.poisson(args.replay_ratio)):
            trajecs = replay_buffer.sample(args.batch_size)
            s_x, a_x, r_x, old_pol, m_x = map(
                torch.stack,
                zip(*(map(torch.cat, zip(*trajec)) for trajec in trajecs)))

            q_vals = []
            vals = []
            pols = []
Example #14
def train(config, trial_dir=None, visualize=False):
    pid = os.getpid()
    logger, log_dir = prepare_for_logging("pid_{}".format(pid))

    # create environment
    env = NIPS(visualize)
    logger.info("pid={}, env={}".format(pid, id(env)))
    if trial_dir is not None and os.path.exists(
            trial_dir) and config['agent'] == 'DDPG':
        logger.info("Loading config from {} ...".format(trial_dir))
        with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
            config = pickle.load(f)
    # config["scale_action"] = scale_action
    config["title_prefix"] = "RunEnv"

    # observation processor
    if "ob_processor" not in config or config["ob_processor"] == "dummy":
        ob_processor = ObservationProcessor()
    elif config["ob_processor"] == "2ndorder":
        ob_processor = SecondOrderAugmentor()
    else:
        ob_processor = BodySpeedAugmentor()
    config["ob_aug_dim"] = ob_processor.get_aug_dim()

    # snapshot info
    if "save_snapshot_every" not in config:
        config["save_snapshot_every"] = 500
    save_snapshot_every = config["save_snapshot_every"]

    # save config
    with open(os.path.join(log_dir, "config.pk"), "wb") as f:
        pickle.dump(config, f)
    util.print_settings(logger, config, env)

    # DDPG
    if config['agent'] == 'DDPG':
        # create random process
        oup = create_rand_process(env, config)

        # create replay buffer
        memory = create_memory(env, config)

        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])

        # print networks
        agent.actor.summary()
        agent.target_actor.summary()
        agent.critic.summary()

        # add callbacks
        def p_info(episode_info):
            util.print_episode_info(logger, episode_info, pid)

        def save_nets(episode_info):
            paths = {}
            paths["actor"] = os.path.join(log_dir, "actor.h5")
            paths["critic"] = os.path.join(log_dir, "critic.h5")
            paths["target"] = os.path.join(log_dir, "target.h5")
            agent = episode_info["agent"]
            agent.save_models(paths)

        def save_snapshots(episode_info):
            agent = episode_info["agent"]
            episode = episode_info["episode"]
            if episode % save_snapshot_every == 0:
                paths = {}
                paths["actor"] = os.path.join(log_dir,
                                              "actor_{}.h5".format(episode))
                paths["critic"] = os.path.join(log_dir,
                                               "critic_{}.h5".format(episode))
                paths["target"] = os.path.join(log_dir,
                                               "target_{}.h5".format(episode))
                agent.save_models(paths)
                memory_path = os.path.join(log_dir, "replaybuffer.npz")
                agent.save_memory(memory_path)
                logger.info("Snapshots saved. (pid={})".format(pid))

        agent.on_episode_end.append(p_info)
        agent.on_episode_end.append(save_nets)
        agent.on_episode_end.append(save_snapshots)

        # load existing model
        if trial_dir is not None and os.path.exists(trial_dir):
            logger.info("Loading networks from {} ...".format(trial_dir))
            paths = {}
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
            paths = {
                k: os.path.join(trial_dir, v)
                for k, v in paths.iteritems()
            }
            logger.info("Paths to models: {}".format(paths))
            agent.load_models(paths)
            memory_path = os.path.join(trial_dir, "replaybuffer.npz")
            if os.path.exists(memory_path):
                agent.load_memory(memory_path)
                logger.info("Replay buffer loaded.")

        # learn
        util.print_sec_header(logger, "Training")
        reward_hist, steps_hist = agent.learn(
            total_episodes=config["total_episodes"],
            max_steps=config["max_steps"])
        env.close()

        # send result
        img_file = os.path.join(log_dir, "train_stats.png")
        util.plot_stats(reward_hist, steps_hist, img_file)
        log_file = os.path.join(log_dir, "train.log")
        title = log_dir + "_" + config["title_prefix"]
        util.send_email(title, [img_file], [log_file], SMTP_SERVER)

    # TRPO
    elif config['agent'] == 'TRPO':

        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        def env_maker(visualize=False):
            env = NIPS(visualize=visualize)
            monitor_dir = os.path.join(log_dir, "gym_monitor")
            env = gym.wrappers.Monitor(env,
                                       directory=monitor_dir,
                                       video_callable=False,
                                       force=False,
                                       resume=True,
                                       write_upon_reset=True)
            return env

        del env
        env = env_maker()

        agent = TRPO(
            env,
            env_maker,
            logger,
            log_dir,
            ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )

        if trial_dir is not None and os.path.exists(trial_dir):
            agent.load_models(trial_dir)
        agent.learn()

    logger.info("Finished (pid={}).".format(pid))
Example #15
def train(config, trial_dir=None, visualize=False, overwrite_config=False):
    t_agent = config["agent"]
    assert t_agent in SUPPORTED_AGENTS, "Agent type {} not supported".format(
        t_agent)

    # prepare trial environment
    pid = os.getpid()
    trial_name = "{}_pid{}".format(t_agent, pid)
    logger, log_dir = prepare_for_logging(trial_name)

    # create agent
    if "max_obstacles" not in config:
        config["max_obstacles"] = 3
    env = NIPS(visualize, max_obstacles=config["max_obstacles"])
    logger.info("pid={}, env={}".format(pid, id(env)))

    # to train from scratch or fine tune
    fine_tuning = False
    if trial_dir is not None:
        config_file = os.path.join(trial_dir, "config.yaml")
        if not os.path.exists(config_file):
            convert_legacy_config(trial_dir, t_agent)
        existing_config = util.load_config(config_file)
        fine_tuning = True
        if overwrite_config:
            logger.info("Overwrite config from file {}".format(trial_dir))
            for k, v in config.iteritems():
                existing_config[k] = v
        config = existing_config
        config["model_dir"] = trial_dir

    # save config to the trial folder
    util.print_settings(logger, config, env)
    config_file = os.path.join(log_dir, "config.yaml")
    util.save_config(config_file, config)

    # instantiate an agent
    config["logger"] = logger
    config["log_dir"] = log_dir
    if t_agent == "DDPG":
        from ddpg import DDPG
        agent = DDPG(env, config)
    elif t_agent == "TRPO":
        from trpo import TRPO
        agent = TRPO(env, config)
    else:
        # because of the assertion above, this should never happen
        raise ValueError("Unsupported agent type: {}".format(t_agent))

    # learn
    if fine_tuning:
        util.print_sec_header(logger, "Continual training")
        agent.set_state(config)
    else:
        util.print_sec_header(logger, "Training from scratch")
    reward_hist, steps_hist = agent.learn(
        total_episodes=config["total_episodes"])
    env.close()

    # send result
    img_file = os.path.join(log_dir, "train_stats.png")
    util.plot_stats(reward_hist, steps_hist, img_file)
    log_file = os.path.join(log_dir, "train.log")
    util.send_email(log_dir, [img_file], [log_file], config)

    logger.info("Finished (pid={}).".format(pid))
Example #16
def test(agent, trial_dir, test_episode, visual_flag, submit_flag):
    pid = os.getpid()
    logger, _ = prepare_for_logging("pid_{}".format(pid), False)

    logger.info("trial_dir={}".format(trial_dir))
    if not os.path.exists(trial_dir):
        logger.info("trial_dir does not exist")
        return

    # create environment
    env = NIPS(visualize=visual_flag)

    # load config
    with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
        config = pickle.load(f)

    if agent == 'DDPG':
        config["scale_action"] = scale_action

        # observation processor
        if "ob_processor" not in config or config["ob_processor"] == "dummy":
            ob_processor = ObservationProcessor()
        elif config["ob_processor"] == "2ndorder":
            ob_processor = SecondOrderAugmentor()
        else:
            ob_processor = BodySpeedAugmentor()
        config["ob_aug_dim"] = ob_processor.get_aug_dim()
        util.print_settings(logger, config, env)

        # create random process
        oup = create_rand_process(env, config)

        # create replay buffer
        memory = create_memory(env, config)

        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])

        # load weights
        paths = {}
        if test_episode > 0:
            paths["actor"] = "actor_{}.h5".format(test_episode)
            paths["critic"] = "critic_{}.h5".format(test_episode)
            paths["target"] = "target_{}.h5".format(test_episode)
        else:
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
        paths = {k: os.path.join(trial_dir, v) for k, v in paths.iteritems()}
        logger.info("Paths to models: {}".format(paths))
        agent.load_models(paths)

    elif agent == 'TRPO':

        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        config = {
            "agent": 'TRPO',
            "batch_size": 5000,
            "n_envs": 16,
            "n_iters": 5000,
            "ob_processor": "bodyspeed",
            # "hidden_nonlinearity": "relu",
            # "action_nonlinearity": "tanh",
            # "policy_hiddens": [128, 128, 64, 64],
            # "baseline_hiddens": [128, 128, 64, 64],
            "policy_hiddens": [256, 128, 64],
            "baseline_hiddens": [256, 128, 64],
            "hidden_nonlinearity": "tanh",
            "action_nonlinearity": None,
        }

        agent = TRPO(
            env,
            env_maker=None,
            logger=logger,
            log_dir=None,
            ob_processor_maker=ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            hidden_nonlinearity=config['hidden_nonlinearity'],
            action_nonlinearity=config['action_nonlinearity'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )
        agent.load_models(trial_dir)
    else:
        raise ValueError('invalid agent type')

    if submit_flag:
        submit(agent, logger)
    else:
        rewards = []
        for i in xrange(10):
            steps, reward = agent.test(max_steps=1000)
            logger.info("episode={}, steps={}, reward={}".format(
                i, steps, reward))
            rewards.append(reward)
        logger.info("avg_reward={}".format(np.mean(rewards)))
#     env=env,
#     learning_rate=0.02,
#     gamma=0.995,
#     output_graph=False,
#     seed=1,
#     ep_max=3000,
#     ep_steps_max=8000,
#     hidden_sizes=(30,)
# )

RL = TRPO(env=env,
          lr_pi=0.01,
          lr_v=0.01,
          gamma=0.99,
          lam=0.97,
          delta=0.01,
          output_graph=False,
          seed=1,
          ep_max=100,
          ep_steps_max=4000,
          hidden_sizes=(64, 64),
          train_v_iters=80,
          damping_coeff=0.1,
          cg_iters=10,
          backtrack_iters=10,
          backtrack_coeff=0.8,
          algo='npg')

# RL.train(env, render_threshold_reward=-500, render=False)
RL.train(env, render_threshold_reward=-1000, render=False)
Example #18
class TRPOTrainer(GeneralTrainer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.local_brain = TRPO(**kwargs)

        '''
        Running statistics:
        normalize observations using a running mean and std maintained over the course of the entire experiment,
        holding the running statistics fixed within each batch.
        See p.12 in https://arxiv.org/pdf/1707.02286.pdf
        '''
        self.running_stats = RunningStats(self.local_brain.env.get_state_shape()[0])
        self.rew_scale = 0.0025

    '''
    Core training routine:
        updates the value function using the previous batch of trajectories,
        updates the policy using the current batch of trajectories.
        For details, see https://arxiv.org/pdf/1703.02660.pdf
    '''
    def train(self, session):
        self._print_instance_info()

        with session.as_default(), session.graph.as_default():
            self.initialize_params(session=session, n_episodes=3)

            raw_t = self.gen_trajectories(session, self.local_brain.traj_batch_size)
            t_processed = self.process_trajectories(session, raw_t)
            self.update_policy(session, t_processed)
            t_processed_prev = t_processed

            while self.episode_count < self.max_episode_count:
                raw_t = self.gen_trajectories(session, self.local_brain.traj_batch_size)
                t_processed = self.process_trajectories(session, raw_t)

                self.update_policy(session, t_processed)
                self.update_value(t_processed_prev)

                self.auditor.log()
                t_processed_prev = t_processed


    ''' Log and print run instance info and hyper-parameters '''
    def _print_instance_info(self):
        self.auditor.update({'task': self.environ_string,
                          'seed': self.seed,
                          'max_episode_count': self.max_episode_count,
                          'policy_type': self.local_brain.policy_type,
                          'reward_discount': self.local_brain.reward_discount,
                          'gae_discount': self.local_brain.gae_discount,
                          'traj_batch_size': self.local_brain.traj_batch_size,
                          'n_policy_epochs': self.local_brain.n_policy_epochs,
                          'policy_learning_rate': float("%.5f" % self.local_brain.policy_learning_rate),
                          'value_learning_rate': float("%.5f" % self.local_brain.value_learning_rate),
                          'n_value_epochs': self.local_brain.n_value_epochs,
                          'value_batch_size': self.local_brain.value_batch_size,
                          'kl_target': self.local_brain.kl_target,
                          'beta': self.local_brain.beta,
                          'beta_min': self.local_brain.beta_min,
                          'beta_max': self.local_brain.beta_max,
                          'ksi': self.local_brain.ksi
                          })
        self.auditor.logmeta()

        return self


    ''' Initialize environment-dependent parameters, such as the running mean and std '''
    def initialize_params(self, session, n_episodes):
        self.gen_trajectories(session, n_episodes)
        return self


    ''' generate a single episodic trajectory '''
    def _gen_trajectory(self, session):
        state = self.local_brain.env.reset_environment()
        actions, rewards, states, norm_states = [], [], [], []
        terminal = False

        while not terminal:

            states.append(state)

            state_normalized = (state - self.running_stats.mean()) / self.running_stats.standard_deviation()
            norm_states.append(state_normalized)

            action = self.local_brain.sample_action(session, state_normalized)
            new_state, reward, terminal, info = self.local_brain.env.perform_action(action)
            actions.append(action)
            rewards.append(reward * self.rew_scale)

            state = new_state  # move to the next state and repeat until the episode terminates
        return actions, rewards, states, norm_states


    def _discount(self, x, gamma):
        return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]


    ''' Generate trajectories by rolling out the stochastic policy pi_theta_k of iteration k,
    with no truncation of the rollout horizon unless needed '''
    def gen_trajectories(self, session, traj_batch_size):

        raw_t = {'states':[], 'actions':[], 'rewards':[], 'disc_rewards':[], 'values':[], 'advantages':[]}
        raw_states = []
        for episode in range(traj_batch_size):

            actions, rewards, states, norm_states = self._gen_trajectory(session)

            raw_t['states'].append(norm_states)
            raw_t['actions'].append(actions)
            raw_t['rewards'].append(rewards)
            ''' discounted sum of rewards until the end of episode for value update'''
            raw_t['disc_rewards'].append(self._discount(rewards, gamma = self.local_brain.reward_discount))

            raw_states += states
            self.episode_count += 1

        self.running_stats.multiple_push(raw_states) # per batch update running statistics

        self.auditor.update({'episode_number': self.episode_count,
                          'per_episode_mean': int(np.sum(np.concatenate(raw_t['rewards'])) /
                                                  (traj_batch_size * self.rew_scale))
                          })

        return raw_t


    ''' estimate value and advantages: gae'''
    def process_trajectories(self, session, t):
        for i in range(self.local_brain.traj_batch_size):
            feed_dict = {self.local_brain.input_ph: t['states'][i]}
            values = session.run(self.local_brain.value, feed_dict=feed_dict)
            t['values'].append(values)

            ''' generalized advantage estimation from https://arxiv.org/pdf/1506.02438.pdf for policy gradient update'''
            temporal_differences = t['rewards'][i] + np.append(self.local_brain.reward_discount * values[1:], 0.0) - list(map(float, values))
            gae = self._discount(temporal_differences, self.local_brain.gae_discount * self.local_brain.reward_discount)

            t['advantages'].append(gae)

        t['states'] = np.concatenate(t['states'])
        t['actions'] = np.concatenate(t['actions'])
        t['rewards'] = np.concatenate(t['rewards'])
        t['disc_rewards'] = np.concatenate(t['disc_rewards'])
        t['values'] = np.concatenate(t['values'])

        ''' per-batch normalization of the GAE. See p.13 in https://arxiv.org/pdf/1707.02286.pdf '''
        concatenated_gae = np.concatenate(t['advantages'])
        normalized_gae = (concatenated_gae - concatenated_gae.mean()) / (concatenated_gae.std() + 1e-6)
        t['advantages'] = normalized_gae

        t['actions'] = np.reshape(t['actions'], (-1, self.local_brain.env_action_number))
        for entity in ['rewards', 'disc_rewards', 'values', 'advantages']:
            t[entity] = np.reshape(t[entity], (-1, 1))

        return t

    ''' updates policy '''
    def update_policy(self, session, t):
        self.local_brain._update_policy(session, t, self.auditor)
        return self

    ''' updates value '''
    def update_value(self, t):
        self.local_brain._update_value(t, self.auditor)
        return self
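The _discount helper in Example #18 computes discounted suffix sums with a reversed lfilter call. As a sanity check, the same quantity can be written as an explicit backward recursion; the function below is an illustrative equivalent, not part of the original code.

import numpy as np

def discount_explicit(x, gamma):
    # Equivalent to scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]:
    # out[t] = x[t] + gamma * out[t + 1], accumulated backwards from the end of the episode.
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

# e.g. discount_explicit([1.0, 1.0, 1.0], 0.99) -> array([2.9701, 1.99, 1.0])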
Example #19
class LEARNER():
    def __init__(self, args, sess):
        self.args = args
        self.sess = sess

        self.env = gym.make(self.args.env_name)
        self.args.max_path_length = self.env.spec.timestep_limit
        self.agent = TRPO(self.args, self.env, self.sess)
        self.saver = tf.train.Saver()

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        while True:
            train_index += 1
            start_time = time.time()
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]
            self.write_logs(train_index, total_episode, total_steps,
                            start_time, train_log)
            if np.mod(train_index, self.args.save_interval) == 0:
                self.save(train_index)

            if total_steps > self.args.total_train_step:
                break

    def write_logs(self, train_index, total_episode, total_steps, start_time,
                   log_info):
        log_path = os.path.join(self.args.log_dir, self.model_dir + '.csv')
        if not os.path.exists(log_path):
            log_file = open(log_path, 'w')
            log_file.write("Train step\t," + "Surrogate\t," +
                           "KL divergence\t," + "Number of steps trained\t," +
                           "Number of episodes trained\t," +
                           "Episode.Avg.reward\t," + "Elapsed time\n")
        else:
            log_file = open(log_path, 'a')
        print(
            "Train step %d => Surrogate loss : %3.3f, KL div : %3.8f, Number of Episode/steps trained : %d/%d, Episode.Avg.reward : %3.3f, Time : %3.3f"
            % (train_index, log_info["Surrogate loss"], log_info["KL_DIV"],
               total_episode, total_steps, log_info["Episode Avg.reward"],
               time.time() - start_time))
        log_file.write(
            str(train_index) + '\t,' + str(log_info["Surrogate loss"]) +
            '\t,' + str(log_info["KL_DIV"]) + '\t,' + str(total_steps) +
            '\t,' + str(total_episode) + '\t,' +
            str(log_info["Episode Avg.reward"]) + '\t,' +
            str(time.time() - start_time) + '\n')
        log_file.flush()

    def save(self, steps):
        model_name = 'TRPO_GAE'
        checkpoint_dir = os.path.join(self.args.checkpoint_dir, self.model_dir)
        if not os.path.exists(checkpoint_dir):
            os.mkdir(checkpoint_dir)
        self.saver.save(self.sess,
                        os.path.join(checkpoint_dir, model_name),
                        global_step=steps)
        print('Checkpoint saved at %d train step' % steps)

    @property
    def model_dir(self):
        return '{}_{}lambda'.format(self.args.env_name, self.args.lamda)