Example #1
    def get(self):
        """
        Call this at the end of an epoch to get all of the data from
        the buffer, with advantages appropriately normalized (shifted to have
        mean zero and std one). Also, resets some pointers in the buffer.
        """
        assert self.ptr == self.max_size  # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # the next two lines implement the advantage normalization trick
        adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std
        data = dict(obs=self.obs_buf,
                    act=self.act_buf,
                    ret=self.ret_buf,
                    adv=self.adv_buf,
                    logp=self.logp_buf)
        return {
            k: torch.as_tensor(v, dtype=torch.float32)
            for k, v in data.items()
        }
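
The two steps above (the normalization trick and the tensor conversion) can be illustrated in isolation. The sketch below is a minimal stand-in, assuming a hypothetical adv array and plain numpy statistics in place of mpi_statistics_scalar; it is not the Spinning Up implementation itself.

import numpy as np
import torch

# hypothetical raw advantages for one epoch (stand-in for self.adv_buf)
adv = np.random.randn(4000).astype(np.float32) * 3.0 + 1.5

# the advantage normalization trick: shift to mean zero, scale to std one
# (mpi_statistics_scalar would compute the same statistics across MPI processes)
adv_mean, adv_std = adv.mean(), adv.std()
adv = (adv - adv_mean) / adv_std

# as in get(), each buffer is then handed back as a float32 torch tensor
data = {'adv': torch.as_tensor(adv, dtype=torch.float32)}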
Example #2
    def get(self):
        """
        Call this at the end of an epoch to get all of the data from
        the buffer, with advantages appropriately normalized (shifted to have
        mean zero and std one). Also, resets some pointers in the buffer.
        """
        assert self.ptr == self.max_size  # buffer has to be full before you can get

        # reset the path trajectory counter and start index
        self.ptr, self.path_start_idx = 0, 0

        # the next two lines implement the advantage normalization trick
        adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
        self.adv_buf = (self.adv_buf - adv_mean) / adv_std

        # return obs_buf, act_buf, adv_buf, ret_buf, logp_buf, and the info_bufs values (sorted by key) as a list
        return [
            self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf,
            self.logp_buf
        ] + core.values_as_sorted_list(self.info_bufs)
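
core.values_as_sorted_list is not shown here; a plausible minimal stand-in, assuming it simply returns the dict's values ordered by key (as the Spinning Up TF1 helpers do), would be:

# hypothetical stand-in for core.values_as_sorted_list, assuming it returns
# the dict's values ordered by their sorted keys
def values_as_sorted_list(d):
    return [d[k] for k in sorted(d.keys())]

info_bufs = {'mu': [0.1, 0.2], 'log_std': [-0.5, -0.4]}
print(values_as_sorted_list(info_bufs))  # [[-0.5, -0.4], [0.1, 0.2]]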
Example #3
    def log_tabular(self,
                    key,
                    val=None,
                    with_min_and_max=False,
                    average_only=False):
        """
        Log a value or possibly the mean/std/min/max values of a diagnostic.

        Args:
            key (string): The name of the diagnostic. If you are logging a
                diagnostic whose state has previously been saved with 
                ``store``, the key here has to match the key you used there.

            val: A value for the diagnostic. If you have previously saved
                values for this key via ``store``, do *not* provide a ``val``
                here.

            with_min_and_max (bool): If true, log min and max values of the 
                diagnostic over the epoch.

            average_only (bool): If true, do not log the standard deviation
                of the diagnostic over the epoch.
        """
        if val is not None:
            super().log_tabular(key, val)
        else:
            v = self.epoch_dict[key]

            vals = np.concatenate(v) if len(v) != 0 and isinstance(
                v[0], np.ndarray) and len(v[0].shape) > 0 else v  # v is a list

            stats = mpi_statistics_scalar(vals,
                                          with_min_and_max=with_min_and_max)
            super().log_tabular(key if average_only else 'Average' + key,
                                stats[0])
            if not (average_only):
                super().log_tabular('Std' + key, stats[1])
            if with_min_and_max:
                super().log_tabular('Max' + key, stats[3])
                super().log_tabular('Min' + key, stats[2])
        self.epoch_dict[key] = []
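
The intended calling pattern appears in Example #6 further down: values are accumulated with store() during the epoch and summarized once per epoch with log_tabular(). A minimal sketch of that flow, assuming the Spinning Up package layout (spinup.utils.logx):

from spinup.utils.logx import EpochLogger

logger = EpochLogger()
for ep_ret in [10.0, 12.5, 9.8]:          # accumulate per-episode diagnostics under a key
    logger.store(EpRet=ep_ret)

logger.log_tabular('Epoch', 0)                        # direct value: logged as-is
logger.log_tabular('EpRet', with_min_and_max=True)    # mean/std/min/max of the stored values
logger.dump_tabular()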
Example #4
    def get(self):
        """
		call this at the end of epoch to get all data from buffer, normalized advantage,
		and reset pointers
		"""
        assert self.ptr == self.max_size, "Buffer not full"
        self.ptr, self.episode_start_idx = 0, 0

        # advantage normalization
        adv_mean, adv_std = mpi_statistics_scalar(self.advantage_buffer)
        self.advantage_buffer = (self.advantage_buffer - adv_mean) / adv_std

        data = dict(obs=self.obs_buffer,
                    act=self.act_buffer,
                    ret=self.return_buffer,
                    adv=self.advantage_buffer,
                    logprob=self.logprob_buffer)
        return {
            k: torch.as_tensor(v, dtype=torch.float32)
            for k, v in data.items()
        }
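
The assert and pointer reset make the buffer's contract explicit: get() may only be called once the buffer is full, and it rewinds the write pointer so the next epoch starts from scratch. A toy sketch of that pointer protocol (a hypothetical minimal buffer, not the class above):

class TinyBuffer:
    def __init__(self, max_size):
        self.max_size, self.ptr, self.episode_start_idx = max_size, 0, 0
        self.data = [None] * max_size

    def store(self, x):
        self.data[self.ptr] = x        # write at the current pointer
        self.ptr += 1

    def get(self):
        assert self.ptr == self.max_size, "Buffer not full"
        self.ptr, self.episode_start_idx = 0, 0   # reset pointers for the next epoch
        return list(self.data)

buf = TinyBuffer(max_size=3)
for x in (1, 2, 3):
    buf.store(x)
print(buf.get())   # [1, 2, 3]; calling get() before the buffer is full trips the assert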
Example #5
  def get(self):
    """
    "Call this at the end of an epoch to get all of the data from
    the buffer, with advantages appropriately normalized (shifted to have
    mean zero and std one).  Also, resets some pointers in the buffer."
    -OpenAI

    Returns the list of vectors [obs_buf, act_buf, adv_buf, ret_buf, logp_buf].

    NOTE:
    I am concerned that something is lost by normalizing.  For example,
    it is possible that during this epoch, we had an advantage with a very high
    mean, because the actions taken during the trajectories happened to be quite
    a bit better.  If we normalize, we lose the information that this
    epoch represented a solid improvement; instead we just learn which
    particular actions tried out during this epoch gave the biggest
    improvement.  We don't just care how the actions in this trajectory
    compare to one another, though; we care about how they compare to all
    possible actions, including ones we didn't try.

    That said, since these samples were just drawn from trajectories generated by
    the current policy, we expect the mean advantage to already be close to 0,
    because we expect the value function to already track the mean performance of
    trajectories under this policy fairly well.  Thus renormalizing to a mean of
    zero usually shouldn't represent a big change, and the mathematical convenience
    may be worth it.  It may also be that there is a more fundamentally important
    reason to normalize which I'm not currently understanding.

    TODO: consider whether we should change this so it DOES NOT normalize advantages
    (or just don't normalize the mean, even if you do normalize the variance)
    """
    # "buffer has to be full before you can get" -OpenAI
    assert self.ptr == self.max_size
    self.ptr, self.path_start_idx = 0, 0

    # "the next two lines implement the advantage normalization trick" -OpenAI
    adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf) # TODO: import openAI mpi statistics stuff
    self.adv_buf = (self.adv_buf - adv_mean) / adv_std

    return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf]
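
The TODO above is easy to prototype: the two variants differ only in whether the mean is subtracted before scaling. A small numpy sketch with hypothetical advantage values (no MPI aggregation):

import numpy as np

adv = np.array([0.8, 1.1, 0.9, 1.3])        # hypothetical advantages with a positive mean
mean, std = adv.mean(), adv.std()

full_norm = (adv - mean) / std               # what get() does: mean 0, std 1
scale_only = adv / std                       # the TODO's alternative: keep the mean signal

print(full_norm.mean(), scale_only.mean())   # ~0.0 versus a clearly positive value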
Example #6
def impala(gym_or_pyco, env_fn, ac_kwargs=dict(), n=4, logger_kwargs=dict(), actor_critic=core.mlp_actor_critic, num_cpu=1, epochs=200, max_ep_len=300,
           steps_per_epoch=4000, gamma=0.99, seed=73,vf_lr=1e-3, pi_lr = 3e-4, entropy_cost = 0.00025, baseline_cost = .5, rho_bar = 1, c_bar = 1, train_pi_iters=80,train_v_iters=80,
           export_dir="/home/clement/Documents/spinningup_instadeep/data/cmd_impala/cmd_impala_s0/simple_save",
           tensorboard_path = '/home/clement/spinningup/tensorboard'):

    dict_continous_gym = ['CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid', 'Alien', 'Amidar',
                          'Assault', 'Asterix', 'Asteroids', 'Atlantis',
                          'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout',
                          'Carnival',
                          'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk',
                          'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar',
                          'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster',
                          'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan',
                          'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing',
                          'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown',
                          'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon', 'Numberlink']
    dict_discrete_gym = []
    dict_gym = ['CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid', 'Alien', 'Amidar',
                'Assault', 'Asterix', 'Asteroids', 'Atlantis',
                'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival',
                'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk',
                'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar',
                'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster',
                'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan',
                'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing',
                'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown',
                'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon', 'Numberlink']

    env = env_fn()
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    if gym_or_pyco != 'gym':
        env = env()

    obs_dim = env.observation_space.shape
    # discrete action spaces expose .n; continuous (Box) spaces expose .shape
    try:
        act_dim = env.action_space.n
    except AttributeError:
        act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph

    if gym_or_pyco == 'pyco':
        x_ph = tf.placeholder(tf.float32, shape=(None, obs_dim[0], obs_dim[1], 1))
    else:
        x_ph = tf.placeholder(tf.float32, shape=(1, obs_dim[0], obs_dim[1], obs_dim[2]))
    # a_ph = core.placeholders_from_spaces(env.action_space)
    if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete):
        a_ph = tf.placeholder(tf.uint8, shape=(1))

    elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box):
        a_ph = tf.placeholder(tf.float32, shape=(env.action_space.shape[0]))

    else:
        a_ph = tf.placeholder(tf.int32, shape=(None))

    if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete):
        pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='baseline_categorical_policy',
                                                    action_space=env.action_space.n)
    elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box):
        pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, policy='baseline_gaussian_policy',
                                            action_space=env.action_space.shape[0])
    else:
        pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='baseline_categorical_policy',
                                                    action_space=env.action_space.n)
    adv_ph, pi_act_ph, logp_old_ph, v_trace_ph = core.placeholders(None, None, None, None)
    advantages = tf.stop_gradient(adv_ph)
    all_phs = [x_ph, a_ph, adv_ph, pi_act_ph]

    # at every step, get: action, value, and log-prob.
    get_action_ops = [pi, v, logp_pi]
    logits_op = [logits]



    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # need to get rho_param from the v_trace function..

    c_param = tf.minimum(tf.exp(logp - logp_old_ph), c_bar)
    rho_param = tf.minimum(tf.exp(logp - logp_old_ph), rho_bar)

    def compute_baseline_loss(v_trace_ph, v):
        # Loss for the baseline, summed over the time dimension.
        # Multiply by 0.5 to match the standard update rule:
        # d(loss) / d(baseline) = advantage
        return .5 * tf.reduce_sum(tf.square(v_trace_ph - v))

    def compute_entropy_loss(logits):
        policy = tf.nn.softmax(logits)
        log_policy = tf.nn.log_softmax(logits)
        entropy_per_timestep = tf.reduce_sum(-policy * log_policy, axis=-1)
        return -tf.reduce_sum(entropy_per_timestep)

#advantages = adv_buf[i]

    def compute_policy_gradient_loss(logits, advantages, a=all_phs[1]):
        #actions = tf.one_hot(a,depth=act_dim)
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=a, logits=logits)
        advantages = tf.stop_gradient(advantages)
        policy_gradient_loss_per_timestep = cross_entropy * advantages
        return tf.reduce_sum(policy_gradient_loss_per_timestep)

    total_loss = compute_entropy_loss(logits) * entropy_cost + compute_baseline_loss(v_trace_ph, v) * baseline_cost + \
                 compute_policy_gradient_loss(logits, adv_ph, all_phs[1])

    #pi_loss = tf.reduce_mean(adv_ph*rho_param)

    #v_loss = tf.reduce_mean((v_trace_ph - v) ** 2)

    def v_trace(obs_list, rews_list, act_list, logp_list, gamma, c_param, rho_param, v, obs_dim1, obs_dim2, last_obs_buf, sess):
        """Prend en entrée les trajectoires et les rewards associés, renvoie un dictionaire associé à des states : à un state x_s est associé un scalaire v_{x_s}
        les trajectoires seront une liste de trajectoires

        Args:
           obs_list: a list of different paths observations used for v_trace.
           rews_list: the list of the rewards lists from each of every paths used for v_trace.
           act_list: a list of the actions lists from each of every paths used for v_trace.
           logp_list: a vector of log probabilities log(p_old(a|s)) used for v_trace.
           gamma : hyperparam in v_trace and GAE
           c_param : a placeholder to be fed.
           rho_param : a placeholder to be fed
           v : a tf function for the value. Depends on x_ph and a_ph
           obs_dim1 : size of rows for board
           obs_dim2 : size of cols for board
           last_obs_buf : the final observation, used to bootstrap the value at the end of the trajectory.
           sess: contains the up to date policy of the graph from the learner at the time of computing v_trace.
        """

        size_obs = len(obs_list)
        v_tr = np.zeros(size_obs+1)

        c_param = sess.run([c_param],feed_dict={x_ph: obs_list, a_ph: act_list, logp_old_ph: logp_list})[0]
        c_param[-1] = 1
        rho_param = sess.run([rho_param], feed_dict={x_ph: obs_list, a_ph: act_list, logp_old_ph: logp_list})
        #v_tr[-1] = sess.run([v],feed_dict={x_ph: np.reshape(obs_list[-1], (1, obs_dim1, obs_dim2, 1))}) + rews_list[-1] * rho_param[0][-1]
        # bootstrap the recursion with the value of the final state
        # (last_val_buf comes from the enclosing scope; it is set in the epoch loop before v_trace is called)
        v_tr[-1] = last_val_buf
        last_obs = np.reshape(obs_list[-1], (1, obs_dim1, obs_dim2, 1))
        v_tr[-2] = (sess.run([v], feed_dict={x_ph: last_obs})[0]
                    + rho_param[0][-1] * (rews_list[-1]
                                          + gamma * sess.run([v], feed_dict={x_ph: last_obs_buf})[0]
                                          - sess.run([v], feed_dict={x_ph: last_obs})[0])
                    + gamma * c_param[-1] * (v_tr[-1] - sess.run([v], feed_dict={x_ph: last_obs_buf})[0]))
        for i in range(size_obs-1):
            obs_t_1 = np.reshape(obs_list[size_obs-2-i], (1, obs_dim1, obs_dim2, 1))
            obs_t = np.reshape(obs_list[size_obs-i-1],(1,obs_dim1, obs_dim2, 1))
            v_tr[size_obs-2-i] = (sess.run([v], feed_dict={x_ph: obs_t_1})[0]
                                  + rho_param[0][size_obs-2-i] * (rews_list[size_obs-2-i]
                                                                  + gamma * sess.run([v], feed_dict={x_ph: obs_t})[0]
                                                                  - sess.run([v], feed_dict={x_ph: obs_t_1})[0])
                                  + gamma * c_param[size_obs-2-i] * (v_tr[size_obs-i-1]
                                                                     - sess.run([v], feed_dict={x_ph: obs_t})[0]))
        return v_tr

    # with adv_ph the advantage with v_trace. On the whole thing?..
    with tf.name_scope('pi_loss'):
        #core.variable_summaries(pi_loss)
        core.variable_summaries(total_loss)

    # Optimizers
    #train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    #train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)
    #train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(total_loss)
    total_env_frames=1e6
    momentum=0.
    epsilon=.1
    decay=.99

    #tf.get_variable(
    #    'num_environment_frames',
    #    initializer=tf.zeros_initializer(),
    #    shape=[],
    #    dtype=tf.float32,
    #    trainable=False,
    #    collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    #num_env_frames = tf.train.get_global_step()
    #learning_rate = tf.train.polynomial_decay(pi_lr, num_env_frames,
    #                                          total_env_frames, 0)
    global_step = 100
    starter_learning_rate = 3e-4
    end_learning_rate=3e-5
    decay_steps=5e2
    learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate, global_step,
                                                        decay_steps, 0)

    optimizer = tf.train.RMSPropOptimizer(learning_rate, decay,
                                          momentum, epsilon)
    train_pi = optimizer.minimize(total_loss)

    sess = tf.Session()
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(tensorboard_path + '/train',
                                         sess.graph)
    test_writer = tf.summary.FileWriter(tensorboard_path + '/test')

    sess.run(tf.global_variables_initializer())
    sess.run(sync_all_params())
    # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi':pi, 'v': v})


    def update(adv_buf, obs_list, act_list, logp_list):
        #pi_l_old, v_l_old = sess.run([pi_loss, v_loss],feed_dict={x_ph
        #
        # : obs_list[0], a_ph: act_list[0], logp_old_ph: logp_list[0], v_trace_ph: v_trace_list[0][:-1], adv_ph: adv_buf[0]})
        for i in range(n):
            for _ in range(train_pi_iters):
                sess.run(train_pi, feed_dict ={x_ph: obs_list[i], a_ph: act_list[i], logp_old_ph: logp_list[i], adv_ph: adv_buf[i], v_trace_ph: v_trace_list[i][:-1]})
            #for _ in range(train_v_iters):
            #    sess.run(train_v,feed_dict={x_ph: obs_list[i], a_ph: act_list[i], v_trace_ph: v_trace_list[i][:-1]})
        #pi_l_new, v_l_new = sess.run([pi_loss, v_loss],feed_dict={x_ph: obs_list[0], a_ph: act_list[0], logp_old_ph: logp_list[0], v_trace_ph: v_trace_list[0][:-1], adv_ph: adv_buf[0]})
        #logger.store(LossPi=pi_l_old, LossV=v_l_old, DeltaLossPi=(pi_l_new-pi_l_old), DeltaLossV=(v_l_new - v_l_old))


    saver = tf.train.Saver()
    save_path = saver.save(sess,export_dir)

    for epoch in range(epochs):
        # Begins collecting trajectories and computing v_traces, adv.
        obs_list = []
        rew_list = []
        act_list = []
        val_list = []
        logp_list = []
        v_trace_list = []
        adv_buf = []
        actors = [Actor(x_ph, a_ph, np.random.random_integers(0, high=39239, size=1)[0]) for i in range(n)]
        ep_len = []
        last_rew_list = []
        for i in range(n):
            actors[i].load_last_weights(export_dir)
            obs_buf, act_buf, rew_buf, val_buf, logp_buf, last_rew_buf, last_val_buf, last_obs_buf = actors[i].get_episode(env,get_action_ops,gym_or_pyco,obs_dim)
            obs_buf = np.reshape(obs_buf, (np.shape(obs_buf)[0], obs_dim[0], obs_dim[1], 1))
            ep_len.append(len(obs_buf))
            last_rew_list = np.append(last_rew_list, last_rew_buf)
            logp_buf = np.reshape(logp_buf, (np.shape(logp_buf)[0]))
            obs_list.append(obs_buf)
            rew_list.append(rew_buf)
            act_list.append(act_buf)
            val_list.append(val_buf)
            logp_list.append(logp_buf)
            v_trace_list.append(v_trace(obs_list[i], rew_list[i], act_list[i], logp_list[i], gamma, c_param, rho_param, v, obs_dim[0], obs_dim[1], last_obs_buf, sess))
            rews = np.append(rew_list[i], last_rew_buf)
            vals = np.append(val_list[i], last_val_buf)
            adv = rews[:-1] + gamma * v_trace_list[i][1:] - vals[:-1]
            # normalization of adv:
            adv_mean, adv_std = mpi_statistics_scalar(adv)
            adv = (adv - adv_mean) / (adv_std + 1e-5)
            adv_buf.append(adv)

        update(adv_buf, obs_list, act_list, logp_list)
        saver = tf.train.Saver()
        save_path = saver.save(sess,
                               export_dir)
        EpRet = []
        for k in range(n):
            EpRet = np.append(EpRet,sum(rew_list[k]))
            EpRet[-1] = EpRet[-1]+last_rew_list[k]
        logger.store(EpRet=EpRet)
        logger.store(EpLen=ep_len)
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', with_min_and_max=True)
        logger.dump_tabular()
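
For reference, the nested sess.run calls in v_trace implement the backward V-trace recursion from the IMPALA paper (Espeholt et al., 2018): v_s = V(x_s) + delta_s + gamma * c_s * (v_{s+1} - V(x_{s+1})), with delta_s = rho_s * (r_s + gamma * V(x_{s+1}) - V(x_s)) and v_T = V(x_T) as the bootstrap. A vectorized numpy sketch of that recursion, assuming the value estimates and the clipped importance weights have already been computed, might look like this (it is not the code above, which re-queries the value network inside the loop):

import numpy as np

def v_trace_targets(rewards, values, bootstrap_value, rhos, cs, gamma=0.99):
    """Backward V-trace recursion (IMPALA, Espeholt et al., 2018).

    rewards, rhos, cs: length-T arrays; rhos/cs are already clipped at rho_bar/c_bar.
    values: V(x_0), ..., V(x_{T-1}) under the current learner policy.
    bootstrap_value: V(x_T), the value of the state following the last transition.
    """
    T = len(rewards)
    values_tp1 = np.append(values[1:], bootstrap_value)       # V(x_{t+1})
    deltas = rhos * (rewards + gamma * values_tp1 - values)   # delta_t
    v_trace = np.zeros(T + 1)
    v_trace[-1] = bootstrap_value                             # v_T = V(x_T)
    for t in reversed(range(T)):
        v_trace[t] = values[t] + deltas[t] + gamma * cs[t] * (v_trace[t + 1] - values_tp1[t])
    return v_trace[:-1]

The advantage computed at the end of the epoch loop, adv = rews[:-1] + gamma * v_trace_list[i][1:] - vals[:-1], then corresponds to r_t + gamma * v_{t+1} - V(x_t), with the v_{t+1} targets coming from this recursion.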