Esempio n. 1
0
def test_mpi_adam():
    """
    tests the MpiAdam object's functionality
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a_var = tf.Variable(np.random.randn(3).astype('float32'))
    b_var = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var))

    learning_rate = 1e-2
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    do_update = tf_utils.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for step in range(10):
        print(step, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a_var, b_var]
    lossandgrad = tf_utils.function(
        [], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op])
    adam = MpiAdam(var_list)

    for step in range(10):
        loss, grad = lossandgrad()
        adam.update(grad, learning_rate)
        print(step, loss)
Esempio n. 2
0
    def __init__(self, env, hidden_size, entcoeff=0.001, scope="adversary"):
        """
        reward regression from observations and transitions

        :param env: (Gym Environment)
        :param hidden_size: ([int]) the hidden dimension for the MLP
        :param entcoeff: (float) the entropy loss weight
        :param scope: (str) tensorflow variable scope
        """
        self.scope = scope
        self.observation_shape = env.observation_space.shape
        self.actions_shape = env.action_space.shape
        self.input_shape = tuple([
            o + a for o, a in zip(self.observation_shape, self.actions_shape)
        ])
        self.num_actions = env.action_space.shape[0]
        self.hidden_size = hidden_size
        self.build_ph()
        # Build grpah
        generator_logits = self.build_graph(self.generator_obs_ph,
                                            self.generator_acs_ph,
                                            reuse=False)
        expert_logits = self.build_graph(self.expert_obs_ph,
                                         self.expert_acs_ph,
                                         reuse=True)
        # Build accuracy
        generator_acc = tf.reduce_mean(
            tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
        expert_acc = tf.reduce_mean(
            tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
        # Build regression loss
        # let x = logits, z = targets.
        # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=generator_logits, labels=tf.zeros_like(generator_logits))
        generator_loss = tf.reduce_mean(generator_loss)
        expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=expert_logits, labels=tf.ones_like(expert_logits))
        expert_loss = tf.reduce_mean(expert_loss)
        # Build entropy loss
        logits = tf.concat([generator_logits, expert_logits], 0)
        entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
        entropy_loss = -entcoeff * entropy
        # Loss + Accuracy terms
        self.losses = [
            generator_loss, expert_loss, entropy, entropy_loss, generator_acc,
            expert_acc
        ]
        self.loss_name = [
            "generator_loss", "expert_loss", "entropy", "entropy_loss",
            "generator_acc", "expert_acc"
        ]
        self.total_loss = generator_loss + expert_loss + entropy_loss
        # Build Reward for policy
        self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)
        var_list = self.get_trainable_variables()
        self.lossandgrad = tf_util.function([
            self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph,
            self.expert_acs_ph
        ], self.losses + [tf_util.flatgrad(self.total_loss, var_list)])
Esempio n. 3
0
 def _setup_critic_optimizer(self):
     """
     setup the optimizer for the critic
     """
     if self.verbose >= 2:
         logger.info('setting up critic optimizer')
     normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                    self.return_range[0], self.return_range[1])
     self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
     if self.critic_l2_reg > 0.:
         critic_reg_vars = [var for var in tf_util.get_trainable_vars('model/qf/')
                            if 'bias' not in var.name and 'output' not in var.name and 'b' not in var.name]
         if self.verbose >= 2:
             for var in critic_reg_vars:
                 logger.info('  regularizing: {}'.format(var.name))
             logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
         critic_reg = tc.layers.apply_regularization(
             tc.layers.l2_regularizer(self.critic_l2_reg),
             weights_list=critic_reg_vars
         )
         self.critic_loss += critic_reg
     critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')]
     critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
     if self.verbose >= 2:
         logger.info('  critic shapes: {}'.format(critic_shapes))
         logger.info('  critic params: {}'.format(critic_nb_params))
     self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'),
                                          clip_norm=self.clip_norm)
     self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999,
                                     epsilon=1e-08)
Esempio n. 4
0
    def _setup_critic_optimizer(self):
        """
        setup the optimizer for the critic
        """
        if self.verbose >= 2:
            logger.info('setting up critic optimizer')

        ### BSS LOSS ###
        all_vars = [v for v in tf.global_variables()]
        self.l2_loss = 0.0
        for var in all_vars:
            if 'qf' in var.name:
                self.l2_loss += tf.losses.mean_squared_error(
                    tf.zeros(var.shape), var)

        _, qf_features = self.policy_tf.feature_matrices()
        singular_qf = tf.linalg.svd(qf_features, compute_uv=False)
        self.bss_loss = tf.reduce_sum(tf.square(singular_qf[-1]))
        ### BSS LOSS ###

        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) + \
            self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in tf_util.get_trainable_vars('model/qf/')
                if 'bias' not in var.name and 'qf_output' not in var.name
                and 'b' not in var.name
            ]
            if self.verbose >= 2:
                for var in critic_reg_vars:
                    logger.info('  regularizing: {}'.format(var.name))
                logger.info('  applying l2 regularization with {}'.format(
                    self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list()
            for var in tf_util.get_trainable_vars('model/qf/')
        ]
        critic_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        if self.verbose >= 2:
            logger.info('  critic shapes: {}'.format(critic_shapes))
            logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = tf_util.flatgrad(
            self.critic_loss,
            tf_util.get_trainable_vars('model/qf/'),
            clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(
            var_list=tf_util.get_trainable_vars('model/qf/'),
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)
Esempio n. 5
0
 def _setup_actor_optimizer(self):
     """
     setup the optimizer for the actor
     """
     if self.verbose >= 2:
         logger.info('setting up actor optimizer')
     self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
     actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')]
     actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
     if self.verbose >= 2:
         logger.info('  actor shapes: {}'.format(actor_shapes))
         logger.info('  actor params: {}'.format(actor_nb_params))
     self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'),
                                         clip_norm=self.clip_norm)
     self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999,
                                    epsilon=1e-08)
Esempio n. 6
0
    def _setup_actor_optimizer(self):
        """
        setup the optimizer for the actor
        """
        if self.verbose >= 2:
            logger.info('setting up actor optimizer')

        ### BSS LOSS ###
        all_vars = [v for v in tf.global_variables()]
        self.l2_loss = 0.0
        for var in all_vars:
            if 'pi' in var.name:
                self.l2_loss += tf.losses.mean_squared_error(
                    tf.zeros(var.shape), var)

        pi_features, _ = self.policy_tf.feature_matrices()
        singular_pi = tf.linalg.svd(pi_features, compute_uv=False)
        self.bss_loss = tf.reduce_sum(tf.square(singular_pi[-1]))
        ### BSS LOSS ###

        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) + \
                          self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss
        actor_shapes = [
            var.get_shape().as_list()
            for var in tf_util.get_trainable_vars('model/pi/')
        ]
        actor_nb_params = sum(
            [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        if self.verbose >= 2:
            logger.info('  actor shapes: {}'.format(actor_shapes))
            logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = tf_util.flatgrad(
            self.actor_loss,
            tf_util.get_trainable_vars('model/pi/'),
            clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(
            var_list=tf_util.get_trainable_vars('model/pi/'),
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08)
Esempio n. 7
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        num_options=2,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0):

    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)

    # env._seed(seed)

    gamename = env.spec.id[:-3].lower()
    gamename += 'seed' + str(seed)
    gamename += app

    dirname = '{}_{}opts_saves/'.format(gamename, num_options)

    if wsaves:
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        files = ['pposgd_simple.py', 'mlp_policy.py', 'run_main.py']
        for i in range(len(files)):
            src = os.path.expanduser('~/baselines/baselines/ppo1/') + files[i]
            dest = os.path.expanduser('~/baselines/baselines/ppo1/') + dirname
            shutil.copy2(src, dest)

    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    # option = tf.placeholder(dtype=tf.int32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed cliping parameter epislon

    # pdb.set_trace()
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv',
                                 dtype=tf.float32,
                                 shape=[None])

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)
    op_loss = -tf.reduce_sum(log_pi[0][option[0]] * atarg + entropy * 0.1)

    total_loss += op_loss

    var_list = pi.get_trainable_variables()
    term_list = var_list[6:8]

    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option, term_adv],
                             losses + [U.flatgrad(total_loss, var_list)])
    termloss = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)
    U.initialize()
    adam.sync()
    saver = tf.train.Saver(max_to_keep=10000)

    results = []
    if saves:
        results = open(
            gamename + '_' + str(num_options) + 'opts_' + '_results.csv', 'w')

        out = 'epoch,avg_reward'

        for opt in range(num_options):
            out += ',option {} dur'.format(opt)
        for opt in range(num_options):
            out += ',option {} std'.format(opt)
        for opt in range(num_options):
            out += ',option {} term'.format(opt)
        for opt in range(num_options):
            out += ',option {} adv'.format(opt)

        out += '\n'

        results.write(out)

        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if epoch >= 0:
        dirname = '{}_{}opts_saves/'.format(gamename, num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        std = []
        for i in range(num_options):
            logstd = np.mean(
                seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))

        print("mean opt dur:", opt_d)
        print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']), axis=0))
        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        if iters_so_far % 5 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        min_batch = 160
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            # This part is only necessasry when we use options.
            # We proceed to these verifications in order not to discard any collected trajectories.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif indices.size + datas[opt].n < min_batch:
                    # pdb.set_trace()
                    oldmap = datas[opt].data_map

                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              vtarg=cat_vtarg),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif (indices.size + datas[opt].n > min_batch and datas[opt].n
                      < min_batch) or (indices.size > min_batch
                                       and datas[opt].n < min_batch):

                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  vtarg=cat_vtarg),
                                             shuffle=not pi.recurrent)

                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices],
                                                  ac=ac[indices],
                                                  atarg=atarg[indices],
                                                  vtarg=tdlamret[indices]),
                                             shuffle=not pi.recurrent)

            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)

            optim_batchsize = optim_batchsize or ob.shape[0]
            optim_epochs = np.clip(
                np.int(10 * (indices.size /
                             (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    tadv, nodc_adv = pi.get_term_adv(batch["ob"], [opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult,
                                                    [opt], tadv)
                    termg = termloss(batch["ob"], [opt], tadv)
                    adam.update(termg[0], 5e-7 * cur_lrmult)
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        if saves:
            out = "{},{}"
            for _ in range(num_options):
                out += ",{},{},{},{}"
            out += "\n"

            info = [iters_so_far, np.mean(rewbuffer)]
            for i in range(num_options):
                info.append(opt_d[i])
            for i in range(num_options):
                info.append(std[i])
            for i in range(num_options):
                info.append(np.mean(np.array(seg['term_p']), axis=0)[i])
            for i in range(num_options):
                info.append(np.mean(t_advs[i]))

            results.write(out.format(*info))
            results.flush()
    def __init__(self,
                 observation_space,
                 action_space,
                 hidden_size,
                 entcoeff=0.001,
                 scope="adversary",
                 normalize=True):
        """
        Reward regression from observations and transitions

        :param observation_space: (gym.spaces)
        :param action_space: (gym.spaces)
        :param hidden_size: ([int]) the hidden dimension for the MLP
        :param entcoeff: (float) the entropy loss weight
        :param scope: (str) tensorflow variable scope
        :param normalize: (bool) Whether to normalize the reward or not
        """
        # TODO: support images properly (using a CNN)
        self.scope = scope
        self.observation_shape = observation_space.shape
        self.actions_shape = action_space.shape

        if isinstance(action_space, gym.spaces.Box):
            # Continuous action space
            self.discrete_actions = False
            self.n_actions = action_space.shape[0]
        elif isinstance(action_space, gym.spaces.Discrete):
            self.n_actions = action_space.n
            self.discrete_actions = True
        else:
            raise ValueError(
                'Action space not supported: {}'.format(action_space))

        self.hidden_size = hidden_size
        self.normalize = normalize
        self.obs_rms = None

        # Placeholders
        self.generator_obs_ph = tf.compat.v1.placeholder(
            observation_space.dtype, (None, ) + self.observation_shape,
            name="observations_ph")
        self.generator_acs_ph = tf.compat.v1.placeholder(
            action_space.dtype, (None, ) + self.actions_shape,
            name="actions_ph")
        self.expert_obs_ph = tf.compat.v1.placeholder(
            observation_space.dtype, (None, ) + self.observation_shape,
            name="expert_observations_ph")
        self.expert_acs_ph = tf.compat.v1.placeholder(
            action_space.dtype, (None, ) + self.actions_shape,
            name="expert_actions_ph")
        # Build graph
        generator_logits = self.build_graph(self.generator_obs_ph,
                                            self.generator_acs_ph,
                                            reuse=False)
        expert_logits = self.build_graph(self.expert_obs_ph,
                                         self.expert_acs_ph,
                                         reuse=True)
        # Build accuracy
        generator_acc = tf.reduce_mean(input_tensor=tf.cast(
            tf.nn.sigmoid(generator_logits) < 0.5, tf.float32))
        expert_acc = tf.reduce_mean(input_tensor=tf.cast(
            tf.nn.sigmoid(expert_logits) > 0.5, tf.float32))
        # Build regression loss
        # let x = logits, z = targets.
        # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=generator_logits, labels=tf.zeros_like(generator_logits))
        generator_loss = tf.reduce_mean(input_tensor=generator_loss)
        expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=expert_logits, labels=tf.ones_like(expert_logits))
        expert_loss = tf.reduce_mean(input_tensor=expert_loss)
        # Build entropy loss
        logits = tf.concat([generator_logits, expert_logits], 0)
        entropy = tf.reduce_mean(input_tensor=logit_bernoulli_entropy(logits))
        entropy_loss = -entcoeff * entropy
        # Loss + Accuracy terms
        self.losses = [
            generator_loss, expert_loss, entropy, entropy_loss, generator_acc,
            expert_acc
        ]
        self.loss_name = [
            "generator_loss", "expert_loss", "entropy", "entropy_loss",
            "generator_acc", "expert_acc"
        ]
        self.total_loss = generator_loss + expert_loss + entropy_loss
        # Build Reward for policy
        self.reward_op = -tf.math.log(1 - tf.nn.sigmoid(generator_logits) +
                                      1e-8)
        var_list = self.get_trainable_variables()
        self.lossandgrad = tf_util.function([
            self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph,
            self.expert_acs_ph
        ], self.losses + [tf_util.flatgrad(self.total_loss, var_list)])
    def setup_model(self):
        # prevent import loops

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            print("number of workers are", self.nworkers)
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)
                self._setup_learn(self.seed)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)
                # Network for phi
                with tf.variable_scope("phi", reuse=False):
                    self.policy_phi = self.policy(self.sess,
                                                  self.observation_space,
                                                  self.action_space,
                                                  self.n_envs,
                                                  1,
                                                  None,
                                                  reuse=False,
                                                  **self.policy_kwargs)
                # Network for phi old
                with tf.variable_scope("oldphi", reuse=False):
                    self.policy_phi_old = self.policy(self.sess,
                                                      self.observation_space,
                                                      self.action_space,
                                                      self.n_envs,
                                                      1,
                                                      None,
                                                      reuse=False,
                                                      **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[
                        None
                    ])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32,
                                         shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    #kloldnew = self.policy_pi.proba_distribution.kl(old_policy.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))
                    vf_phi_err = tf.reduce_mean(
                        tf.square(self.policy_phi.value_flat - ret))
                    vf_phi_old_err = tf.reduce_mean(
                        tf.square(self.policy_phi_old.value_flat))

                    # advantage * pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action) -
                        old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = [
                        "optimgain", "meankl", "entloss", "surrgain", "entropy"
                    ]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [
                        v for v in all_var_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]
                    vf_var_list = [
                        v for v in all_var_list
                        if "/pi" not in v.name and "/logstd" not in v.name
                    ]
                    all_var_oldpi_list = tf_util.get_trainable_vars("oldpi")
                    var_oldpi_list = [
                        v for v in all_var_oldpi_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]

                    all_var_phi_list = tf_util.get_trainable_vars("phi")
                    vf_phi_var_list = [
                        v for v in all_var_phi_list if "/pi" not in v.name
                        and "/logstd" not in v.name and "/q" not in v.name
                    ]
                    all_var_phi_old_list = tf_util.get_trainable_vars("oldphi")
                    vf_phi_old_var_list = [
                        v for v in all_var_phi_old_list if "/pi" not in v.name
                        and "/logstd" not in v.name and "/q" not in v.name
                    ]
                    #print("vars", vf_var_list)
                    self.policy_vars = all_var_list
                    self.oldpolicy_vars = all_var_oldpi_list
                    print("all var list", all_var_list)
                    print("phi vars", vf_phi_var_list)
                    print("phi old vars", vf_phi_old_var_list)

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list,
                                                             sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32,
                                                  shape=[None],
                                                  name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(
                            tf.reshape(flat_tangent[start:start + var_size],
                                       shape))
                        start += var_size
                    gvp = tf.add_n([
                        tf.reduce_sum(grad * tangent)
                        for (grad, tangent) in zipsame(klgrads, tangents)
                    ])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar(
                        'loss',
                        optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function(
                        [observation, old_policy.obs_ph, action, atarg],
                        losses)
                    self.compute_fvp = tf_util.function([
                        flat_tangent, observation, old_policy.obs_ph, action,
                        atarg
                    ], fvp)
                    self.compute_vflossandgrad = tf_util.function(
                        [observation, old_policy.obs_ph, ret],
                        tf_util.flatgrad(vferr, vf_var_list))
                    self.compute_vf_phi_lossandgrad = tf_util.function(
                        [observation, self.policy_phi.obs_ph, ret],
                        tf_util.flatgrad(vf_phi_err, vf_phi_var_list))
                    self.compute_vf_loss = tf_util.function(
                        [observation, old_policy.obs_ph, ret], vferr)
                    self.compute_vf_phi_loss = tf_util.function(
                        [observation, self.policy_phi.obs_ph, ret], vf_phi_err)
                    #self.compute_vf_phi_old_loss = tf_util.function([self.policy_phi_old.obs_ph], vf_phi_old_err)
                    #self.phi_old_obs = np.array([-0.012815  , -0.02076313,  0.07524705,  0.09407324,  0.0901745 , -0.09339058,  0.03544853, -0.03297224])
                    #self.phi_old_obs = self.phi_old_obs.reshape((1, 8))

                    update_phi_old_expr = []
                    for var, var_target in zip(
                            sorted(vf_phi_var_list, key=lambda v: v.name),
                            sorted(vf_phi_old_var_list, key=lambda v: v.name)):
                        update_phi_old_expr.append(var_target.assign(var))
                    update_phi_old_expr = tf.group(*update_phi_old_expr)

                    self.update_phi_old = tf_util.function(
                        [], [], updates=[update_phi_old_expr])

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(
                                colorize("done in {:.3f} seconds".format(
                                    (time.time() - start_time)),
                                         color='magenta'))
                        else:
                            yield

                    @contextmanager
                    def temp_seed(seed):
                        state = np.random.get_state()
                        np.random.seed(seed)
                        try:
                            yield
                        finally:
                            np.random.set_state(state)

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    self.vf_phi_adam = MpiAdam(vf_phi_var_list, sess=self.sess)
                    self.vfadam.sync()
                    self.vf_phi_adam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range',
                                      tf.reduce_mean(self.max_kl))

                self.timed = timed
                self.allmean = allmean
                self.temp_seed = temp_seed

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars(
                    "model") + tf_util.get_trainable_vars("oldpi")

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Esempio n. 10
0
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)

                # Penalty related variable
                with tf.variable_scope('penalty'):
                    cur_cost_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # episodic cost

                    param_init = np.log(max(np.exp(self.penalty_init) - 1, 1e-8))
                    penalty_param = tf.get_variable('penalty_param',
                                                    initializer=float(param_init),
                                                    trainable=True,
                                                    dtype=tf.float32)
                penalty = tf.nn.softplus(penalty_param)
                penalty_loss = tf.reduce_mean(-penalty_param * (cur_cost_ph - self.cost_lim))

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)
                
                # # Network for safety value function
                # with tf.variable_Scope("vc",reuse=False):
                #     self.cost_value = MLPValue(self.sess, self.observation_spacem, self.n_envs, 1, None)
                
                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
                    catarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target cost advantage function
                    cret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical cost

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))
                    vcerr = tf.reduce_mean(tf.square(self.policy_pi.vcf_flat - cret))
                    
                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                                   old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)
                    # Surrogate for cost function
                    surrcost = tf.reduce_mean(ratio * catarg)

                    optimgain = surrgain + entbonus
                    # Include surr_cost in pi_objective
                    optimgain -= penalty * surrcost
                    optimgain /= (1 + penalty)
                    # # Loss function for pi is negative of pi_objective
                    # optimgain = -optimgain # Should we??
                    
                    losses = [optimgain, meankl, entbonus, surrgain, meanent, surrcost]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy", "surrcost"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name and "/vcf" not in v.name] # policy parameters
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vcf" not in v.name] # value parameters
                    vcf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vf" not in v.name] # cost value parameters

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    # Fisher vector products
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('penalty_loss', penalty_loss)
                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('constraint_cost_function_loss', surrcost)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent + surrcost + penalty_loss)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg, catarg],
                                                        fvp) # Why need all inputs? Might for implementation easiness
                    # self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret],
                    #                                               tf_util.flatgrad(vferr, vf_var_list)) # Why need old_policy.obs_ph? Doesn't seem to be used
                    # self.compute_vcflossandgrad = tf_util.function([observation, old_policy.obs_ph, cret],
                    #                                               tf_util.flatgrad(vcerr, vcf_var_list))
                    self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret, cret],
                                                                  [tf_util.flatgrad(vferr, vf_var_list), tf_util.flatgrad(vcerr, vcf_var_list)])
                    self.compute_lagrangiangrad = tf_util.function([cur_cost_ph],
                                                                   tf_util.flatgrad(penalty_loss, [penalty_param]))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                           color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()
                    # optimizer for constraint costs value function
                    self.vcadam = MpiAdam(vcf_var_list, sess=self.sess)
                    self.vcadam.sync()
                    # optimizer for lagragian value of safe RL
                    self.penaltyadam = MpiAdam([penalty_param], sess=self.sess)
                    self.penaltyadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret))
                    tf.summary.scalar('discounted_costs', tf.reduce_mean(cret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('cost_advantage', tf.reduce_mean(catarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('discounted_rewards', cret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('penalty_learning_rate', self.penalty_lr)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('cost_advantage', catarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg, ret, cret, cur_cost_ph],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Esempio n. 11
0
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier
        from stable_baselines.mdal.adversary import TabularAdversaryTF, NeuralAdversaryTRPO


        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the MDPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)
                # self._setup_learn(self.seed)
                self._setup_learn()

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)
                elif self.using_mdal:
                    if self.neural:
                        self.reward_giver = NeuralAdversaryTRPO(self.sess, self.observation_space, self.action_space,
                                                                self.hidden_size_adversary,
                                                                entcoeff=self.adversary_entcoeff)
                    else:
                        self.reward_giver = TabularAdversaryTF(self.sess, self.observation_space, self.action_space,
                                                                 self.hidden_size_adversary,
                                                                 entcoeff=self.adversary_entcoeff,
                                                                 expert_features=self.expert_dataset.successor_features,
                                                                 exploration_bonus=self.exploration_bonus,
                                                                 bonus_coef=self.bonus_coef,
                                                                 t_c=self.t_c,
                                                                 is_action_features=self.is_action_features)
                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    self.old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for fitting closed form
                with tf.variable_scope("closedpi", reuse=False):
                    self.closed_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    self.vtarg = tf.placeholder(dtype=tf.float32, shape=[None])
                    self.ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
                    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="learning_rate_ph")
                    self.outer_learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="outer_learning_rate_ph")
                    self.old_vpred_ph = tf.placeholder(dtype=tf.float32, shape=[None], name="old_vpred_ph")
                    self.clip_range_vf_ph = tf.placeholder(dtype=tf.float32, shape=[], name="clip_range_ph")

                    observation = self.policy_pi.obs_ph
                    self.action = self.policy_pi.pdtype.sample_placeholder([None])

                    if self.tsallis_q == 1.0:
                        kloldnew = self.policy_pi.proba_distribution.kl(self.old_policy.proba_distribution)
                        ent = self.policy_pi.proba_distribution.entropy()
                        meankl = tf.reduce_mean(kloldnew)

                    else:
                        logp_pi = self.policy_pi.proba_distribution.logp(self.action)
                        logp_pi_old =  self.old_policy.proba_distribution.logp(self.action)
                        ent = self.policy_pi.proba_distribution.entropy()
                        #kloldnew = self.policy_pi.proba_distribution.kl_tsallis(self.old_policy.proba_distribution, self.tsallis_q)
                        tsallis_q = 2.0 - self.tsallis_q
                        meankl = tf.reduce_mean(tf_log_q(tf.exp(logp_pi), tsallis_q) - tf_log_q(tf.exp(logp_pi_old), tsallis_q)) #tf.reduce_mean(kloldnew)

                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    if self.cliprange_vf is None:
                        vpred_clipped = self.policy_pi.value_flat
                    else:
                        vpred_clipped = self.old_vpred_ph + \
                            tf.clip_by_value(self.policy_pi.value_flat - self.old_vpred_ph,
                                             - self.clip_range_vf_ph, self.clip_range_vf_ph)

                    vf_losses1 = tf.square(self.policy_pi.value_flat - self.ret)
                    vf_losses2 = tf.square(vpred_clipped - self.ret)
                    vferr = tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(self.action) -
                                   self.old_policy.proba_distribution.logp(self.action))

                    if self.method == "multistep-SGD":
                        surrgain = tf.reduce_mean(ratio * self.atarg) - meankl / self.learning_rate_ph
                    elif self.method == "closedreverse-KL":
                        surrgain = tf.reduce_mean(tf.exp(self.atarg) * self.policy_pi.proba_distribution.logp(self.action))
                    else:
                        policygain = tf.reduce_mean(tf.exp(self.atarg) * tf.log(self.closed_policy.proba_distribution.mean))
                        surrgain = tf.reduce_mean(ratio * self.atarg) - tf.reduce_mean(self.learning_rate_ph * ratio * self.policy_pi.proba_distribution.logp(self.action))

                    optimgain = surrgain #+ entbonus - self.learning_rate_ph * meankl
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]
                    print("policy vars", var_list)

                    all_closed_var_list = tf_util.get_trainable_vars("closedpi")
                    closed_var_list = [v for v in all_closed_var_list if "/vf" not in v.name and "/q" not in v.name]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    # tf.summary.scalar('entropy_loss', meanent)
                    # tf.summary.scalar('policy_gradient_loss', optimgain)
                    # tf.summary.scalar('value_function_loss', surrgain)
                    # tf.summary.scalar('approximate_kullback-leibler', meankl)
                    # tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.learning_rate_ph, self.vtarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, self.old_policy.obs_ph, self.action, self.atarg],
                                                        fvp)
                    self.compute_vflossandgrad = tf_util.function([observation, self.old_policy.obs_ph, self.ret, self.old_vpred_ph, self.clip_range_vf_ph],
                                                                  tf_util.flatgrad(vferr, vf_var_list))

                    grads = tf.gradients(-optimgain, var_list)
                    grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5)
                    trainer = tf.train.AdamOptimizer(learning_rate=self.outer_learning_rate_ph, epsilon=1e-5)
                    # trainer = tf.train.AdamOptimizer(learning_rate=3e-4, epsilon=1e-5)
                    grads = list(zip(grads, var_list))
                    self._train = trainer.apply_gradients(grads)

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            # print(colorize(msg, color='magenta'))
                            # start_time = time.time()
                            yield
                            # print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                            #                color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail or self.using_mdal:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.ret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(self.atarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', self.ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', self.atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.ret, self.learning_rate_ph, self.vtarg, self.closed_policy.obs_ph],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Esempio n. 12
0
    def _setup_model(self, rank, memory_size, alpha, obs_space, a_space,
                     noise_target_action, **kwargs):

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.single_threaded_session(graph=self.graph)
            if self.use_prioritiy:
                from .priority_memory import PrioritizedMemory
                self.memory = PrioritizedMemory(capacity=memory_size,
                                                alpha=alpha)
            else:
                from .memory import Memory
                self.memory = Memory(limit=memory_size,
                                     action_shape=a_space.shape,
                                     observation_shape=obs_space.shape)
            # 定义 placeholders
            self.observe_Input = tf.placeholder(tf.float32,
                                                [None] + list(obs_space.shape),
                                                name='observe_Input')
            self.observe_Input_ = tf.placeholder(tf.float32, [None] +
                                                 list(obs_space.shape),
                                                 name='observe_Input_')
            self.R = tf.placeholder(tf.float32, [None, 1], 'r')
            self.terminals1 = tf.placeholder(tf.float32,
                                             shape=(None, 1),
                                             name='terminals1')
            self.ISWeights = tf.placeholder(tf.float32, [None, 1],
                                            name='IS_weights')
            self.n_step_steps = tf.placeholder(tf.float32,
                                               shape=(None, 1),
                                               name='n_step_reached')
            self.q_demo = tf.placeholder(tf.float32, [None, 1],
                                         name='Q_of_actions_from_memory')
            self.come_from_demo = tf.placeholder(tf.float32, [None, 1],
                                                 name='Demo_index')
            self.action_memory = tf.placeholder(tf.float32,
                                                [None] + list(a_space.shape),
                                                name='actions_from_memory')

            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=obs_space.shape)

            with tf.name_scope('obs_preprocess'):
                self.normalized_observe_Input = tf.clip_by_value(
                    normalize(self.observe_Input, self.obs_rms), -5., 5.)
                self.normalized_observe_Input_ = tf.clip_by_value(
                    normalize(self.observe_Input_, self.obs_rms), -5., 5.)

            with tf.variable_scope('Actor'):
                self.action = self.build_actor(self.normalized_observe_Input,
                                               scope='eval',
                                               trainable=True,
                                               a_space=a_space)
                self.action_ = self.build_actor(self.normalized_observe_Input_,
                                                scope='target',
                                                trainable=False,
                                                a_space=a_space)

                # Target policy smoothing, by adding clipped noise to target actions
                if noise_target_action:
                    epsilon = tf.random_normal(tf.shape(self.action_),
                                               stddev=0.007)
                    epsilon = tf.clip_by_value(epsilon, -0.01, 0.01)
                    a2 = self.action_ + epsilon
                    noised_action_ = tf.clip_by_value(a2, -1, 1)
                else:
                    noised_action_ = self.action_

            with tf.variable_scope('Critic'):
                # Q值都要被clip 防止过估计.
                self.q_1 = tf.clip_by_value(
                    self.build_critic(self.normalized_observe_Input,
                                      self.action,
                                      scope='eval_1',
                                      trainable=True), self.Q_value_range[0],
                    self.Q_value_range[1])

                q_1_ = self.build_critic(self.normalized_observe_Input_,
                                         noised_action_,
                                         scope='target_1',
                                         trainable=False)

                if self.use_TD3:
                    q_2 = tf.clip_by_value(
                        self.build_critic(self.normalized_observe_Input,
                                          self.action,
                                          scope='eval_2',
                                          trainable=True),
                        self.Q_value_range[0], self.Q_value_range[1])

                    q_2_ = self.build_critic(self.normalized_observe_Input_,
                                             noised_action_,
                                             scope='target_2',
                                             trainable=False)

            # Collect networks parameters. It would make it more easily to manage them.
            self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope='Actor/eval')
            self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope='Actor/target')
            self.ce1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope='Critic/eval_1')
            self.ct1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope='Critic/target_1')

            if self.use_TD3:
                self.ce2_params = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_2')
                self.ct2_params = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_2')

            with tf.variable_scope('Soft_Update'):
                self.soft_replace_a = [
                    tf.assign(t, (1 - TAU) * t + TAU * e)
                    for t, e in zip(self.at_params, self.ae_params)
                ]
                self.soft_replace_c = [
                    tf.assign(t, (1 - TAU) * t + TAU * e)
                    for t, e in zip(self.ct1_params, self.ce1_params)
                ]
                if self.use_TD3:
                    self.soft_replace_c += [
                        tf.assign(t, (1 - TAU) * t + TAU * e)
                        for t, e in zip(self.ct2_params, self.ce2_params)
                    ]

            # critic 的误差 为 (one-step-td 误差 + n-step-td 误差 + critic_online 的L2惩罚)
            # TD3: critic一共有4个, 算两套 critic的误差, 秀儿.
            with tf.variable_scope('Critic_Lose'):
                if self.use_TD3:
                    min_q_ = tf.minimum(q_1_, q_2_)
                else:
                    min_q_ = q_1_

                self.q_target = self.R + (1. -
                                          self.terminals1) * GAMMA * min_q_
                if self.use_n_step:
                    self.n_step_target_q = self.R + (
                        1. - self.terminals1) * tf.pow(
                            GAMMA, self.n_step_steps) * min_q_
                    cliped_n_step_target_q = tf.clip_by_value(
                        self.n_step_target_q, self.Q_value_range[0],
                        self.Q_value_range[1])

                cliped_q_target = tf.clip_by_value(self.q_target,
                                                   self.Q_value_range[0],
                                                   self.Q_value_range[1])

                self.td_error_1 = tf.abs(cliped_q_target - self.q_1)
                if self.use_TD3:
                    self.td_error_2 = tf.abs(cliped_q_target - q_2)

                if self.use_n_step:
                    self.nstep_td_error_1 = tf.abs(cliped_n_step_target_q -
                                                   self.q_1)
                    if self.use_TD3:
                        self.nstep_td_error_2 = tf.abs(cliped_n_step_target_q -
                                                       q_2)

                L2_regular_1 = tf.contrib.layers.apply_regularization(
                    tf.contrib.layers.l2_regularizer(0.001),
                    weights_list=self.ce1_params)
                if self.use_TD3:
                    L2_regular_2 = tf.contrib.layers.apply_regularization(
                        tf.contrib.layers.l2_regularizer(0.001),
                        weights_list=self.ce2_params)

                one_step_losse_1 = tf.reduce_mean(
                    tf.multiply(self.ISWeights, tf.square(
                        self.td_error_1))) * self.lambda_1_step
                if self.use_TD3:
                    one_step_losse_2 = tf.reduce_mean(
                        tf.multiply(self.ISWeights, tf.square(
                            self.td_error_2))) * self.lambda_1_step

                if self.use_n_step:
                    n_step_td_losses_1 = tf.reduce_mean(
                        tf.multiply(
                            self.ISWeights, tf.square(
                                self.nstep_td_error_1))) * self.lambda_n_step
                    c_loss_1 = one_step_losse_1 + n_step_td_losses_1 + L2_regular_1

                    if self.use_TD3:
                        n_step_td_losses_2 = tf.reduce_mean(
                            tf.multiply(self.ISWeights,
                                        tf.square(self.nstep_td_error_2))
                        ) * self.lambda_n_step
                        c_loss_2 = one_step_losse_2 + n_step_td_losses_2 + L2_regular_2
                else:
                    c_loss_1 = one_step_losse_1 + L2_regular_1

                    if self.use_TD3:
                        c_loss_2 = one_step_losse_2 + L2_regular_2

            # actor 的 loss 为 最大化q(s,a) 最小化行为克隆误差.
            # (只有demo的transition 且 demo的action 比 actor生成的action q_1(s,a)高的时候 才会有克隆误差)
            with tf.variable_scope('Actor_lose'):
                Is_worse_than_demo = self.q_1 < self.q_demo
                Is_worse_than_demo = tf.cast(Is_worse_than_demo, tf.float32)
                worse_than_demo = tf.cast(tf.reduce_sum(Is_worse_than_demo),
                                          tf.int8)

                # 算action误差 我用的是平方和, 也有人用均方误差 reduce_mean. 其实都可以.
                # 我的action本来都是很小的数.
                action_diffs = Is_worse_than_demo * tf.reduce_sum(
                    self.come_from_demo *
                    tf.square(self.action - self.action_memory),
                    1,
                    keepdims=True)

                L_BC = self.LAMBDA_BC * tf.reduce_sum(action_diffs)
                a_loss = -tf.reduce_mean(self.q_1) + L_BC

            # Setting optimizer for Actor and Critic
            with tf.variable_scope('Critic_Optimizer'):
                if self.use_TD3:
                    self.critic_grads_1 = tf_util.flatgrad(
                        loss=c_loss_1, var_list=self.ce1_params)
                    self.critic_grads_2 = tf_util.flatgrad(
                        loss=c_loss_2, var_list=self.ce2_params)

                    self.critic_optimizer_1 = MpiAdam(var_list=self.ce1_params,
                                                      beta1=0.9,
                                                      beta2=0.999,
                                                      epsilon=1e-08)
                    self.critic_optimizer_2 = MpiAdam(var_list=self.ce2_params,
                                                      beta1=0.9,
                                                      beta2=0.999,
                                                      epsilon=1e-08)
                else:
                    self.critic_grads = tf_util.flatgrad(
                        loss=c_loss_1, var_list=self.ce1_params)
                    self.critic_optimizer = MpiAdam(var_list=self.ce1_params,
                                                    beta1=0.9,
                                                    beta2=0.999,
                                                    epsilon=1e-08)
            with tf.variable_scope('Actor_Optimizer'):
                self.actor_grads = tf_util.flatgrad(a_loss, self.ae_params)
                self.actor_optimizer = MpiAdam(var_list=self.ae_params,
                                               beta1=0.9,
                                               beta2=0.999,
                                               epsilon=1e-08)
            with self.sess.as_default():
                self._initialize(self.sess)

            # 保存模型
            var_list = tf.global_variables()
            print(
                "var_list!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
            )
            for v in var_list:
                print(v)
            self.saver = tf.train.Saver(var_list=var_list, max_to_keep=1)
            self.writer = tf.summary.FileWriter(
                "logs/" + self.experiment_name + "/DDPG_" + str(rank),
                self.graph)
            # TensorBoard summary
            self.a_summary = tf.summary.merge([
                tf.summary.scalar('a_loss', a_loss, family='actor'),
                tf.summary.scalar('L_BC', L_BC, family='actor'),
                tf.summary.scalar('worse_than_demo',
                                  worse_than_demo,
                                  family='actor')
            ])

            if self.use_TD3:
                self.c_summary = tf.summary.merge([
                    tf.summary.scalar('c_loss_1', c_loss_1, family='critic'),
                    tf.summary.scalar('c_loss_2', c_loss_2, family='critic')
                ])
            else:
                self.c_summary = tf.summary.merge(
                    [tf.summary.scalar('c_loss_1', c_loss_1, family='critic')])

            # episode summary
            self.episode_cumulate_reward = tf.placeholder(
                tf.float32, name='episode_cumulate_reward')
            self.episoed_length = tf.placeholder(
                tf.int16, name='episode_cumulate_reward')
            self.success_or_not = tf.placeholder(
                tf.int8, name='episode_cumulate_reward')

            self.eval_episode_cumulate_reward = tf.placeholder(
                tf.float32, name='episode_cumulate_reward')
            self.eval_episoed_length = tf.placeholder(
                tf.int16, name='episode_cumulate_reward')
            self.eval_success_or_not = tf.placeholder(
                tf.int8, name='episode_cumulate_reward')

            self.episode_summary = tf.summary.merge([
                tf.summary.scalar('episode_cumulate_reward',
                                  self.episode_cumulate_reward,
                                  family='episoed_result'),
                tf.summary.scalar('episoed_length',
                                  self.episoed_length,
                                  family='episoed_result'),
                tf.summary.scalar('success_or_not',
                                  self.success_or_not,
                                  family='episoed_result'),
            ])

            self.eval_episode_summary = tf.summary.merge([
                tf.summary.scalar('eval_episode_cumulate_reward',
                                  self.eval_episode_cumulate_reward,
                                  family='Eval_episoed_result'),
                tf.summary.scalar('eval_episoed_length',
                                  self.eval_episoed_length,
                                  family='Eval_episoed_result'),
                tf.summary.scalar('eval_success_or_not',
                                  self.eval_success_or_not,
                                  family='Eval_episoed_result'),
            ])
def learn(env,
          policy_func,
          dataset,
          optim_batch_size=128,
          max_iters=1e4,
          adam_epsilon=1e-5,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          task_name=None,
          verbose=False):
    """
    Learn a behavior clone policy, and return the save location

    :param env: (Gym Environment) the environment
    :param policy_func: (function (str, Gym Space, Gym Space): TensorFlow Tensor) creates the policy
    :param dataset: (Dset or MujocoDset) the dataset manager
    :param optim_batch_size: (int) the batch size
    :param max_iters: (int) the maximum number of iterations
    :param adam_epsilon: (float) the epsilon value for the adam optimizer
    :param optim_stepsize: (float) the optimizer stepsize
    :param ckpt_dir: (str) the save directory, can be None for temporary directory
    :param task_name: (str) the save name, can be None for saving directly to the directory name
    :param verbose: (bool)
    :return: (str) the save location for the TensorFlow model
    """

    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    policy = policy_func("pi", ob_space,
                         ac_space)  # Construct network for new policy
    # placeholder
    obs_ph = policy.obs_ph
    action_ph = policy.pdtype.sample_placeholder([None])
    stochastic_ph = policy.stochastic_ph
    loss = tf.reduce_mean(tf.square(action_ph - policy.ac))
    var_list = policy.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = tf_util.function([obs_ph, action_ph, stochastic_ph],
                                   [loss] + [tf_util.flatgrad(loss, var_list)])

    tf_util.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size,
                                                      'train')
        train_loss, grad = lossandgrad(ob_expert, ac_expert, True)
        adam.update(grad, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(
                train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = os.path.join(ckpt_dir, task_name)
    tf_util.save_state(savedir_fname, var_list=policy.get_variables())
    return savedir_fname
Esempio n. 14
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.grad_inverter = grad_inverter(
                        [self.action_space.high, self.action_space.low])
                    # Target advantage function (if applicable)
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                    # Empirical return
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])

                    # learning rate multiplier, updated with schedule
                    lrmult = tf.placeholder(name='lrmult',
                                            dtype=tf.float32,
                                            shape=[])

                    # Annealed cliping parameter epislon
                    clip_param = self.clip_param * lrmult

                    obs_ph = self.policy_pi.obs_ph
                    action_ph = self.policy_pi.pdtype.sample_placeholder(
                        [None])

                    if debug:
                        action_ph_val = tf.Print(
                            action_ph, [
                                action_ph,
                            ],
                            '\n\n ====================Unclipped action in: \n',
                            summarize=-1)
                        action_ph_val = tf.Print(
                            action_ph_val, [],
                            '\n ======================================== \n',
                            summarize=-1)

                    kloldnew = old_pi.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    # old_logstd = old_pi.proba_distribution.logstd
                    # new_logstd = self.policy_pi.proba_distribution.logstd
                    # old_std = old_pi.proba_distribution.std
                    # new_std = self.policy_pi.proba_distribution.std
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    # meankl = tf.Print(meankl, [meankl,], "kl value: ")
                    # meankl_log = tf.Print(meankl, [old_logstd,], "high kl, old logstd value: ", summarize=-1)
                    # meankl_log = tf.Print(meankl_log, [new_logstd,], "high kl, new logstd value: ", summarize=-1)
                    # meankl_log = tf.Print(meankl_log, [old_std,], "high kl, old std value: ", summarize=-1)
                    # meankl_log = tf.Print(meankl_log, [new_std,], "high kl, new std value: ", summarize=-1)
                    # meanklvalue_ = tf.where(
                    #     tf.greater(meankl, tf.constant(1, dtype = tf.float32)),
                    #     meankl_log,
                    #     meankl)
                    meanent = tf.reduce_mean(ent)
                    pol_entpen = (-self.entcoeff) * meanent

                    # pnew / pold
                    if debug:
                        old_logp = old_pi.proba_distribution.logp(
                            action_ph_val)
                        old_mean = old_pi.proba_distribution.mode()
                        old_std = old_pi.proba_distribution.std
                        old_logp = tf.Print(old_logp, [
                            old_logp,
                        ],
                                            '======  OLD logp: \n',
                                            summarize=-1)
                        old_logp = tf.Print(old_logp, [], '\n', summarize=-1)
                        old_logp = tf.Print(old_logp, [
                            old_mean,
                        ],
                                            '======  OLD mean: \n',
                                            summarize=-1)
                        old_logp = tf.Print(old_logp, [], '\n', summarize=-1)
                        old_logp = tf.Print(old_logp, [
                            old_std,
                        ],
                                            '======  OLD std: \n',
                                            summarize=-1)
                        old_logp = tf.Print(old_logp, [], '\n', summarize=-1)
                        now_logp = self.policy_pi.proba_distribution.logp(
                            action_ph_val)
                        now_mean = self.policy_pi.proba_distribution.mode()
                        now_std = self.policy_pi.proba_distribution.std
                        now_logp = tf.Print(now_logp, [
                            now_logp,
                        ],
                                            '======  NOW logp: \n',
                                            summarize=-1)
                        now_logp = tf.Print(now_logp, [], '\n', summarize=-1)
                        now_logp = tf.Print(now_logp, [
                            now_mean,
                        ],
                                            '======  NOW mean: \n',
                                            summarize=-1)
                        now_logp = tf.Print(now_logp, [], '\n', summarize=-1)
                        now_logp = tf.Print(now_logp, [
                            now_std,
                        ],
                                            '======  NOW std: \n',
                                            summarize=-1)
                        now_logp = tf.Print(now_logp, [], '\n', summarize=-1)
                    else:
                        now_logp = self.policy_pi.proba_distribution.logp(
                            action_ph)
                        old_logp = old_pi.proba_distribution.logp(action_ph)

                    ratio = tf.exp(now_logp - old_logp)
                    if debug:
                        ratio = tf.Print(ratio, [
                            ratio,
                        ],
                                         'ratio: \n',
                                         summarize=-1)
                        ratio = tf.Print(ratio, [], '\n', summarize=-1)
                    # surrogate from conservative policy iteration
                    surr1 = ratio * atarg
                    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                             1.0 + clip_param) * atarg

                    # PPO's pessimistic surrogate (L^CLIP)
                    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                    vf_loss = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))
                    total_loss = pol_surr + pol_entpen + vf_loss
                    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                    # losses = [pol_surr, pol_entpen, vf_loss, meanklvalue_, meanent]
                    self.loss_names = [
                        "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                    ]

                    tf.summary.scalar('entropy_loss', pol_entpen)
                    tf.summary.scalar('policy_gradient_loss', pol_surr)
                    tf.summary.scalar('value_function_loss', vf_loss)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar('clip_factor', clip_param)
                    tf.summary.scalar('loss', total_loss)

                    self.params = tf_util.get_trainable_vars("model")

                    self.assign_old_eq_new = tf_util.function(
                        [], [],
                        updates=[
                            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                                tf_util.get_globals_vars("oldpi"),
                                tf_util.get_globals_vars("model"))
                        ])

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.adam = MpiAdam(self.params,
                                        epsilon=self.adam_epsilon,
                                        sess=self.sess)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.optim_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('clip_range',
                                      tf.reduce_mean(self.clip_param))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate',
                                             self.optim_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('clip_range', self.clip_param)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', obs_ph)
                        else:
                            tf.summary.histogram('observation', obs_ph)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)

                self.summary = tf.summary.merge_all()

                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    [self.summary,
                     tf_util.flatgrad(total_loss, self.params)] + losses)
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)
Esempio n. 15
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                # Construct network for new policy
                with tf.variable_scope("pi", reuse=False):
                    self.policy_pi = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 self.n_envs,
                                                 1,
                                                 None,
                                                 reuse=False)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False)

                # Target advantage function (if applicable)
                atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                # Empirical return
                ret = tf.placeholder(dtype=tf.float32, shape=[None])

                # learning rate multiplier, updated with schedule
                lrmult = tf.placeholder(name='lrmult',
                                        dtype=tf.float32,
                                        shape=[])

                # Annealed cliping parameter epislon
                clip_param = self.clip_param * lrmult

                obs_ph = self.policy_pi.obs_ph
                action_ph = self.policy_pi.pdtype.sample_placeholder([None])

                kloldnew = old_pi.proba_distribution.kl(
                    self.policy_pi.proba_distribution)
                ent = self.policy_pi.proba_distribution.entropy()
                meankl = tf.reduce_mean(kloldnew)
                meanent = tf.reduce_mean(ent)
                pol_entpen = (-self.entcoeff) * meanent

                # pnew / pold
                ratio = tf.exp(
                    self.policy_pi.proba_distribution.logp(action_ph) -
                    old_pi.proba_distribution.logp(action_ph))

                # surrogate from conservative policy iteration
                surr1 = ratio * atarg
                surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                         1.0 + clip_param) * atarg

                # PPO's pessimistic surrogate (L^CLIP)
                pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                vf_loss = tf.reduce_mean(
                    tf.square(self.policy_pi.value_fn[:, 0] - ret))
                total_loss = pol_surr + pol_entpen + vf_loss
                losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                self.loss_names = [
                    "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                ]

                self.params = tf_util.get_trainable_vars("pi")
                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses + [tf_util.flatgrad(total_loss, self.params)])
                self.adam = MpiAdam(self.params,
                                    epsilon=self.adam_epsilon,
                                    sess=self.sess)

                self.assign_old_eq_new = tf_util.function(
                    [], [],
                    updates=[
                        tf.assign(oldv, newv) for (
                            oldv,
                            newv) in zipsame(tf_util.get_globals_vars("oldpi"),
                                             tf_util.get_globals_vars("pi"))
                    ])
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.compat.v1.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.compat.v1.variable_scope("loss", reuse=False):
                    # Target advantage function (if applicable)
                    atarg = tf.compat.v1.placeholder(dtype=tf.float32,
                                                     shape=[None])

                    # Empirical return
                    ret = tf.compat.v1.placeholder(dtype=tf.float32,
                                                   shape=[None])

                    # learning rate multiplier, updated with schedule
                    lrmult = tf.compat.v1.placeholder(name='lrmult',
                                                      dtype=tf.float32,
                                                      shape=[])

                    # Annealed cliping parameter epislon
                    clip_param = self.clip_param * lrmult

                    obs_ph = self.policy_pi.obs_ph
                    action_ph = self.policy_pi.pdtype.sample_placeholder(
                        [None])

                    kloldnew = old_pi.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(input_tensor=kloldnew)
                    meanent = tf.reduce_mean(input_tensor=ent)
                    pol_entpen = (-self.entcoeff) * meanent

                    # pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action_ph) -
                        old_pi.proba_distribution.logp(action_ph))

                    # surrogate from conservative policy iteration
                    surr1 = ratio * atarg
                    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                             1.0 + clip_param) * atarg

                    # PPO's pessimistic surrogate (L^CLIP)
                    pol_surr = -tf.reduce_mean(
                        input_tensor=tf.minimum(surr1, surr2))
                    vf_loss = tf.reduce_mean(
                        input_tensor=tf.square(self.policy_pi.value_flat -
                                               ret))
                    total_loss = pol_surr + pol_entpen + vf_loss
                    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                    self.loss_names = [
                        "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                    ]

                    tf.compat.v1.summary.scalar('entropy_loss', pol_entpen)
                    tf.compat.v1.summary.scalar('policy_gradient_loss',
                                                pol_surr)
                    tf.compat.v1.summary.scalar('value_function_loss', vf_loss)
                    tf.compat.v1.summary.scalar('approximate_kullback-leibler',
                                                meankl)
                    tf.compat.v1.summary.scalar('clip_factor', clip_param)
                    tf.compat.v1.summary.scalar('loss', total_loss)

                    self.params = tf_util.get_trainable_vars("model")

                    self.assign_old_eq_new = tf_util.function(
                        [], [],
                        updates=[
                            tf.compat.v1.assign(oldv, newv)
                            for (oldv, newv) in zipsame(
                                tf_util.get_globals_vars("oldpi"),
                                tf_util.get_globals_vars("model"))
                        ])

                with tf.compat.v1.variable_scope("Adam_mpi", reuse=False):
                    self.adam = MpiAdam(self.params,
                                        epsilon=self.adam_epsilon,
                                        sess=self.sess)

                with tf.compat.v1.variable_scope("input_info", reuse=False):
                    tf.compat.v1.summary.scalar(
                        'discounted_rewards', tf.reduce_mean(input_tensor=ret))
                    tf.compat.v1.summary.scalar(
                        'learning_rate',
                        tf.reduce_mean(input_tensor=self.optim_stepsize))
                    tf.compat.v1.summary.scalar(
                        'advantage', tf.reduce_mean(input_tensor=atarg))
                    tf.compat.v1.summary.scalar(
                        'clip_range',
                        tf.reduce_mean(input_tensor=self.clip_param))

                    if self.full_tensorboard_log:
                        tf.compat.v1.summary.histogram('discounted_rewards',
                                                       ret)
                        tf.compat.v1.summary.histogram('learning_rate',
                                                       self.optim_stepsize)
                        tf.compat.v1.summary.histogram('advantage', atarg)
                        tf.compat.v1.summary.histogram('clip_range',
                                                       self.clip_param)
                        if tf_util.is_image(self.observation_space):
                            tf.compat.v1.summary.image('observation', obs_ph)
                        else:
                            tf.compat.v1.summary.histogram(
                                'observation', obs_ph)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)

                self.summary = tf.compat.v1.summary.merge_all()

                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    [self.summary,
                     tf_util.flatgrad(total_loss, self.params)] + losses)
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)
Esempio n. 17
0
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(
                        self.observation_space,
                        self.action_space,
                        self.hidden_size_adversary,
                        entcoeff=self.adversary_entcoeff)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[
                        None
                    ])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32,
                                         shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(
                        tf.square(self.policy_pi.value_fn[:, 0] - ret))

                    # advantage * pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action) -
                        old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = [
                        "optimgain", "meankl", "entloss", "surrgain", "entropy"
                    ]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [
                        v for v in all_var_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]
                    vf_var_list = [
                        v for v in all_var_list
                        if "/pi" not in v.name and "/logstd" not in v.name
                    ]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list,
                                                             sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32,
                                                  shape=[None],
                                                  name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(
                            tf.reshape(flat_tangent[start:start + var_size],
                                       shape))
                        start += var_size
                    gvp = tf.add_n([
                        tf.reduce_sum(grad * tangent)
                        for (grad, tangent) in zipsame(klgrads, tangents)
                    ])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leiber', meankl)
                    tf.summary.scalar(
                        'loss',
                        optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function(
                        [observation, old_policy.obs_ph, action, atarg],
                        losses)
                    self.compute_fvp = tf_util.function([
                        flat_tangent, observation, old_policy.obs_ph, action,
                        atarg
                    ], fvp)
                    self.compute_vflossandgrad = tf_util.function(
                        [observation, old_policy.obs_ph, ret],
                        tf_util.flatgrad(vferr, vf_var_list))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(
                                colorize("done in {:.3f} seconds".format(
                                    (time.time() - start_time)),
                                         color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(
                            self.reward_giver.get_trainable_variables(),
                            sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range',
                                      tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = find_trainable_variables("model")
                if self.using_gail:
                    self.params.extend(
                        self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Esempio n. 18
0
def general_actor_critic(input_shape_vec,
                         act_output_shape,
                         comm,
                         learn_rate=[0.001, 0.001],
                         trainable=True,
                         label=""):

    sess = K.get_session()
    np.random.seed(0)
    tf.set_random_seed(0)

    # network 1 (new policy)
    with tf.variable_scope(label + "_pi_new", reuse=False):
        inp = Input(shape=input_shape_vec)  # [5,6,3]
        # rc_lyr = Lambda(lambda x:  ned_to_ripCoords_tf(x, 4000))(inp)
        trunk_x = Reshape([input_shape_vec[0], input_shape_vec[1] * 3])(inp)
        trunk_x = LSTM(128)(trunk_x)
        dist, sample_action_op, action_ph, value_output = ppo_continuous(
            3, trunk_x)

    # network 2 (old policy)
    with tf.variable_scope(label + "_pi_old", reuse=False):
        inp_old = Input(shape=input_shape_vec)  # [5,6,3]
        # rc_lyr = Lambda(lambda x:  ned_to_ripCoords_tf(x, 4000))(inp_old)
        trunk_x = Reshape([input_shape_vec[0],
                           input_shape_vec[1] * 3])(inp_old)
        trunk_x = LSTM(128)(trunk_x)
        dist_old, sample_action_op_old, action_ph_old, value_output_old = ppo_continuous(
            3, trunk_x)

    # additional placeholders
    adv_ph = tf.placeholder(tf.float32, [None], name="advantages_ph")
    alpha_ph = tf.placeholder(tf.float32, shape=(), name="alpha_ph")
    vtarg = tf.placeholder(tf.float32, [None])  # target value placeholder

    # loss
    loss = ppo_continuous_loss(dist, dist_old, value_output, action_ph,
                               alpha_ph, adv_ph, vtarg)

    # gradient
    with tf.variable_scope("grad", reuse=False):
        gradient = tf_util.flatgrad(
            loss, tf_util.get_trainable_vars(label + "_pi_new"))
        adam = MpiAdam(tf_util.get_trainable_vars(label + "_pi_new"),
                       epsilon=0.00001,
                       sess=sess,
                       comm=comm)

    # method for sync'ing the two policies
    assign_old_eq_new = tf_util.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(tf_util.get_globals_vars(label + "_pi_old"),
                                  tf_util.get_globals_vars(label + "_pi_new"))
        ])

    # initialize all the things
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # methods for interacting with this model

    def sync_weights():
        assign_old_eq_new(sess=sess)

    def sample_action(states, logstd_override=None):
        a = sess.run(sample_action_op, feed_dict={inp: states})
        return a

    def sample_value(states):
        v = sess.run(value_output, feed_dict={inp: states})
        return v

    def train(states, actions, vtarget, advs, alpha):
        alpha = max(alpha, 0.0)
        adam_lr = learn_rate[0]

        g = sess.run(
            [gradient],
            feed_dict={
                inp: states,
                inp_old: states,
                action_ph: actions,
                adv_ph: advs,
                alpha_ph: alpha,
                vtarg: vtarget
            })

        adam.update(g[0], adam_lr * alpha)

    # initial sync
    adam.sync()
    sync_weights()

    return sync_weights, sample_action, sample_value, train