Example #1
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1),
                                                      kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1),
                                                      kernel_initializer=U.normc_initializer(1.0)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final',
                                       kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                         initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final',
                                          kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
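
A minimal NumPy sketch (added here, not part of the original example) of what the fixed-variance Gaussian head above parameterizes: the 'pol' network outputs a state-dependent mean, logstd is a separate trainable variable, sampling adds Gaussian noise, and the deterministic mode returns the mean.

import numpy as np

def act(mean, logstd, stochastic=True):
    # pd.sample() ~ mean + std * noise; pd.mode() is just the mean.
    std = np.exp(logstd)
    noise = np.random.randn(*mean.shape)
    return mean + std * noise if stochastic else mean

mean = np.array([[0.3, -0.1]])   # hypothetical output of the 'pol' head
logstd = np.zeros((1, 2))        # state-independent logstd, initialized to zero
print(act(mean, logstd, stochastic=True))   # stochastic action
print(act(mean, logstd, stochastic=False))  # deterministic action (the mean)
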
    def __init__(self, env, hidden_size, expert_dataset):
        with tf.variable_scope('guidance'):
            self.hidden_size = hidden_size
            self.scope = tf.get_variable_scope().name
            self.observation_shape = env.observation_space.shape
            self.actions_shape = env.action_space.shape

            self.build_ph()

            # Build graph
            # Build the generator and expert graphs; each outputs a 1-dimensional logit
            generator_logits = self.build_graph(self.generator_obs_ph,
                                                self.generator_acs_ph,
                                                reuse=False)
            expert_logits = self.build_graph(self.generator_obs_ph,
                                             self.expert_acs_ph,
                                             reuse=True)

            self.expert_label = tf.ones_like(expert_logits)
            self.generator_label = tf.zeros_like(generator_logits)

            # label
            label = self.generator_label >= self.expert_label
            label = (tf.cast(label, tf.float32) - 0.5) * 2

            loss = RankLoss(predict_score1=generator_logits,
                            predict_score2=expert_logits,
                            label=label)

            self.loss = tf.reduce_mean(loss)

            optimizer = tf.train.AdamOptimizer()
            self.train_op = optimizer.minimize(self.loss)
        # Build reward for the policy (why are the generator logits used as reward_op?)
        self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)
        self.loss_name = ["guidance__loss"]
        var_list = self.get_trainable_variables()
        self.lossandgrad = U.function(
            [self.generator_obs_ph, self.generator_acs_ph, self.expert_acs_ph],
            [self.loss] + [U.flatgrad(self.loss, var_list)])
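
As a side note on reward_op above, here is a small NumPy illustration (an addition, not original code) of the transform reward = -log(1 - sigmoid(logits) + 1e-8): higher guidance logits map to higher rewards.

import numpy as np

def guidance_reward(logits):
    # Mirrors -tf.log(1 - tf.nn.sigmoid(logits) + 1e-8) element-wise.
    sigm = 1.0 / (1.0 + np.exp(-logits))
    return -np.log(1.0 - sigm + 1e-8)

print(guidance_reward(np.array([-2.0, 0.0, 2.0])))  # monotonically increasing
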
    def __init__(self, epsilon=1e-2, shape=()):

        self._sum = tf.get_variable(dtype=tf.float64,
                                    shape=shape,
                                    initializer=tf.constant_initializer(0.0),
                                    name="runningsum",
                                    trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq",
            trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count",
            trainable=False)
        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        self.std = tf.sqrt(
            tf.maximum(
                tf.to_float(self._sumsq / self._count) - tf.square(self.mean),
                1e-2))

        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape,
                                  dtype=tf.float64,
                                  name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        updates = [
            tf.assign_add(self._sum, newsum),
            tf.assign_add(self._sumsq, newsumsq),
            tf.assign_add(self._count, newcount)
        ]
        self.incfiltparams = U.function([newsum, newsumsq, newcount], [],
                                        updates=updates)
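
A self-contained NumPy sketch (added for clarity) of the statistics this RunningMeanStd maintains: incfiltparams adds a batch's sum, sum of squares, and count, and mean/std are recomputed as sum/count and sqrt(max(sumsq/count - mean^2, 1e-2)).

import numpy as np

def update_running_stats(run_sum, run_sumsq, run_count, batch):
    # Accumulate the batch into the running totals, as incfiltparams does.
    run_sum = run_sum + batch.sum(axis=0)
    run_sumsq = run_sumsq + np.square(batch).sum(axis=0)
    run_count = run_count + batch.shape[0]
    mean = run_sum / run_count
    std = np.sqrt(np.maximum(run_sumsq / run_count - np.square(mean), 1e-2))
    return run_sum, run_sumsq, run_count, mean, std

# Start from the same epsilon initialization as __init__ above.
eps, shape = 1e-2, (3,)
s, sq, c = np.zeros(shape), np.full(shape, eps), eps
s, sq, c, mean, std = update_running_stats(s, sq, c, np.random.randn(128, 3))
print(mean, std)
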
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        self.obs = ob

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
        self.v_preds = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
        self.pd, self.pi = pdtype.pdfromlatent(last_out)
        # last_out = obz
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))

        # if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        #     mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
        #     logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
        #     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        # else:
        #     pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        # self.pd = pdtype.pdfromflat(pdparam)

        # change for BC
        stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.v_preds])
def learn(env,
          policy_func,
          reward_giver,
          reward_guidance,
          expert_dataset,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          save_per_iter,
          ckpt_dir,
          log_dir,
          timesteps_per_batch,
          task_name,
          gamma,
          lam,
          algo,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=1e-4,
          vf_iters=3,
          max_timesteps=0,
          max_episodes=0,
          max_iters=0,
          loss_percent=0.0,
          callback=None):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    policy = build_policy(env, 'mlp', value_network='copy')

    ob = observation_placeholder(ob_space)
    with tf.variable_scope('pi'):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope('oldpi'):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables('pi')
    # var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
    # vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")
    # assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    guidance_adam = MpiAdam(reward_guidance.get_trainable_variables())

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables('oldpi'), get_variables('pi'))
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    guidance_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     reward_guidance,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     algo=algo,
                                     loss_percent=loss_percent)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # Load pretrained weights if provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)

        # global flag_render
        # if iters_so_far > 0 and iters_so_far % 10 ==0:
        #     flag_render = True
        # else:
        #     flag_render = False

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            print('rewards', seg['rew'])
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(
            batch_size=len(ob))
        batch_size = 128
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        with timed("Discriminator"):
            for (ob_batch, ac_batch) in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)
                # update running mean/std for reward_giver
                if hasattr(reward_giver, "obs_rms"):
                    reward_giver.obs_rms.update(
                        np.concatenate((ob_batch, ob_expert), 0))
                *newlosses, g = reward_giver.lossandgrad(ob_batch, ob_expert)
                d_adam.update(allmean(g), d_stepsize)
                d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # ------------------ Update Guidance ------------
        logger.log("Optimizing Guidance...")

        logger.log(fmt_row(13, reward_guidance.loss_name))
        batch_size = 128
        guidance_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        with timed("Guidance"):
            for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)

                idx_condition = process_expert(ob_expert, ac_expert)
                pick_idx = (idx_condition >= loss_percent)
                # pick_idx = idx_condition

                ob_expert_p = ob_expert[pick_idx]
                ac_expert_p = ac_expert[pick_idx]

                ac_batch_p = []
                for each_ob in ob_expert_p:
                    tmp_ac, _, _, _ = pi.step(each_ob, stochastic=True)
                    ac_batch_p.append(tmp_ac)

                # update running mean/std for reward_giver
                if hasattr(reward_guidance, "obs_rms"):
                    reward_guidance.obs_rms.update(ob_expert_p)
                # reward_guidance.train(expert_s=ob_batch_p, agent_a=ac_batch_p, expert_a=ac_expert_p)
                *newlosses, g = reward_guidance.lossandgrad(
                    ob_expert_p, ac_batch_p, ac_expert_p)
                guidance_adam.update(allmean(g), d_stepsize)
                guidance_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(guidance_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens) * g_step
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
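
For reference, a hedged NumPy sketch (not from the original source) of the step scaling used in the policy update above: the conjugate-gradient direction is rescaled so the quadratic KL estimate 0.5 * step^T F step equals max_kl, and the loop then halves the step until the surrogate improves without violating the KL constraint.

import numpy as np

def scale_step(stepdir, fisher_vector_product, max_kl):
    shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))
    lm = np.sqrt(shs / max_kl)
    return stepdir / lm  # now 0.5 * fullstep^T F fullstep == max_kl

# Toy check with F = 2 * I, so the Fisher-vector product is just 2 * p.
fullstep = scale_step(np.array([1.0, -1.0]), lambda p: 2.0 * p, max_kl=0.01)
print(0.5 * fullstep.dot(2.0 * fullstep))  # ~0.01, i.e. max_kl
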
    def __init__(self,
                 observations,
                 action_space,
                 latent,
                 optimizer=None,
                 sess=None,
                 train=True,
                 beta=1.0,
                 l2=0.,
                 lr=0.001,
                 init_scale=0.01,
                 init_bias=0.0,
                 trainable_variance=True,
                 trainable_bias=True,
                 init_logstd=0.,
                 scope_name="pi",
                 clip=None,
                 state_dependent_variance=True,
                 **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        latent = tf.layers.flatten(latent)

        self.action_space = action_space
        self.pdtype = make_pdtype(action_space)
        self.pd, self.pi = self.pdtype.pdfromlatent(
            latent,
            init_scale=init_scale,
            init_bias=init_bias,
            trainable_variance=trainable_variance,
            trainable_bias=trainable_bias,
            init_logstd=init_logstd,
            clip=clip,
            beta=beta)  # init_bias=0.0

        self.stochastic = tf.placeholder(dtype=tf.bool, shape=())
        self.action = tf_util.switch(self.stochastic, self.pd.sample(),
                                     self.pd.mode())
        self.neglogp = self.pd.neglogp(self.action)
        if beta == 1.0:
            self.prob = tf.nn.softmax(self.pd.flatparam())
        else:
            self.prob = boltzmann(self.pd.flatparam(), beta=beta)
        if optimizer is None:
            self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        else:
            self.optimizer = optimizer
        self.sess = sess or tf.get_default_session()
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope_name)
        try:
            self.action_ph = tf.placeholder(tf.int64, [None],
                                            name='targets_placeholder')
            self.action_selected = action_selected = tf.one_hot(
                self.action_ph, self.action_space.n)
            #out = tf.reduce_sum(tf.reduce_sum(tf.log(self.logits+1e-5)*action_selected, axis=1))
            out = tf.reduce_mean(
                tf.log(tf.reduce_sum(self.prob * action_selected, axis=1)))
            gradients = tf.gradients(out, self.vars)
        except:
            self.action_ph = tf.placeholder(dtype=tf.float32,
                                            shape=(None, ) +
                                            action_space.shape,
                                            name='targets_placeholder')
            gradients = tf.gradients(-self.pd.neglogp(self.action_ph),
                                     self.vars)
        self.cont = cont = not isinstance(self.action_space, Discrete)
        flat_grad = tf_util.GetFlat(gradients).op
        self.compute_gradients = tf_util.function(
            inputs=[self.X, self.action_ph], outputs=[flat_grad])
        '''self.compute_cont_gradients = tf_util.function(
            inputs=[self.X, self.action_ph],
            outputs=tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)
        )'''
        self.debug = tf_util.function(
            inputs=[self.X, self.action_ph],
            outputs=[gradients, self.prob, self.action_ph])
        self.set_from_flat = tf_util.SetFromFlat(self.vars)
        if self.cont:
            total_error = tf.reduce_sum(
                tf.square(self.action_ph -
                          tf.reduce_mean(self.action_ph, axis=0)),
                axis=0)
            unexplained_error = tf.reduce_sum(tf.square(self.action_ph -
                                                        self.pd.mean),
                                              axis=0)
            R_squared = 1 - (unexplained_error / total_error)
            self.accuracy = accuracy = R_squared
        else:
            self.accuracy = accuracy = tf.reduce_mean(
                tf.cast(tf.math.equal(self.pd.mode(), self.action_ph),
                        tf.float32))
        self.entropy = entropy = tf.reduce_mean(self.pd.entropy())
        if train:
            self.gamma = l2
            self._build_train(
                cont=cont, state_dependent_variance=state_dependent_variance)
        self.pdf = tf.exp(self.pd.logp(self.action_ph))
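
A short NumPy sketch (added, not part of the source) of the R^2 "accuracy" computed above for continuous action spaces: R^2 = 1 - unexplained_error / total_error, per action dimension.

import numpy as np

def r_squared(actions, predicted_mean):
    total_error = np.sum(np.square(actions - actions.mean(axis=0)), axis=0)
    unexplained_error = np.sum(np.square(actions - predicted_mean), axis=0)
    return 1.0 - unexplained_error / total_error

acs = np.random.randn(64, 2)
print(r_squared(acs, acs))                 # perfect predictions -> [1. 1.]
print(r_squared(acs, np.zeros_like(acs)))  # constant-zero predictions -> <= 0
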
def learn(comm, env, bc_agent_wrapper, experiment_name, ckpt_dir, summary_dir,
          expert_dataset, lr, batch_size, max_iters):

    rank = comm.Get_rank()

    # Create the BC agent
    pol = bc_agent_wrapper('pol')

    # Create mpi adam optimizer for the policy
    pol_optimizer = MpiAdamOptimizer(comm,
                                     clip_norm=pol.hps.clip_norm,
                                     learning_rate=lr,
                                     name='pol_adam')
    _optimize_pol = pol_optimizer.minimize(pol.loss,
                                           var_list=pol.trainable_vars)

    # Retrieve already-existing placeholders
    e_obs = U.get_placeholder_cached(name='e_obs')
    e_acs = U.get_placeholder_cached(name='e_acs')

    # Create Theano-like ops
    optimize_pol = U.function([e_obs, e_acs], _optimize_pol)

    # Initialize variables
    U.initialize()
    # Sync params of all processes with the params of the root process
    pol_optimizer.sync_from_root(pol.trainable_vars)

    if rank == 0:
        # Create summary writer
        writer = U.file_writer(summary_dir)
        # Create the summary
        _names = ['train_loss', 'val_loss']
        _summary = CustomSummary(scalar_keys=_names, family="bc")

    # Define the origin of time
    tstart = time.time()

    # Define rolling buffers for loss collection
    maxlen = 100
    pol_train_loss_buffer = deque(maxlen=maxlen)
    pol_val_loss_buffer = deque(maxlen=maxlen)

    for iters_so_far in range(max_iters):

        # Verify that the processes are still in sync
        if iters_so_far > 0 and iters_so_far % 10 == 0:
            pol_optimizer.check_synced(pol.trainable_vars)

        # Save the model
        if rank == 0 and iters_so_far % int(1e4) == 0 and ckpt_dir is not None:
            model_path = osp.join(ckpt_dir, experiment_name)
            U.save_state(model_path, iters_so_far=iters_so_far)
            logger.info("saving model")
            logger.info("  @: {}".format(model_path))

        # Make non-zero-rank workers wait for rank zero
        comm.Barrier()

        # Go through mini-batches of the demonstration dataset, training fraction
        obs, acs = expert_dataset.get_next_pair_batch(batch_size, 'train')
        # Update running mean and std on states
        if hasattr(pol, "obs_rms"):
            pol.obs_rms.update(obs, comm)
        # Perform a gradient step to update the policy parameters
        optimize_pol(obs, acs)
        # Compute training loss
        pol_train_loss = pol.compute_pol_loss(obs, acs)
        pol_train_loss_buffer.append(pol_train_loss)
        # Go through mini-batches of the demonstration dataset, validation fraction
        obs, acs = expert_dataset.get_next_pair_batch(-1, 'val')
        # Compute validation loss
        pol_val_loss = pol.compute_pol_loss(obs, acs)
        pol_val_loss_buffer.append(pol_val_loss)

        if iters_so_far % 100 == 0:
            # Log training and validation losses
            logger.info(
                ('iter #{} '
                 '| train loss: {} '
                 '| val loss: {} '
                 '| elapsed: {}').format(iters_so_far, pol_train_loss,
                                         pol_val_loss,
                                         prettify_time(time.time() - tstart)))

        # Prepare losses to be dumped in summaries
        all_summaries = [
            np.mean(pol_train_loss_buffer),
            np.mean(pol_val_loss_buffer)
        ]  # must be visible by all workers
        if rank == 0:
            assert len(_names) == len(
                all_summaries), "mismatch in list lengths"
            _summary.add_all_summaries(writer, all_summaries, iters_so_far)
def learn(
        *,
        network,
        env,
        eval_policy,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        checkpoint_path_in=None,
        checkpoint_dir_out=None,
        checkpoint_freq=100,  # In iterations!!,
        from_iter=0,
        eval_episodes=20,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)

    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    # ob_space = Box(low=-np.inf, high=np.inf, shape=(env.observation_space.n,))
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)
    # Loading checkpoint
    if checkpoint_path_in is not None and os.path.isfile(checkpoint_path_in):
        pi.load(checkpoint_path_in)
        logger.log('Loaded policy weights from %s' % checkpoint_path_in)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    # s = env.reset()
    # start = time.time()
    # for i in range(10000):
    #     pi.step(s, stochastic=True)
    # duration = time.time() - start
    # print(duration)
    # return
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     gamma=gamma)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    iters_eval = 0
    all_logs = []
    best_rew = -np.inf

    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    online_scores = []
    offline_scores = []
    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        if iters_so_far % checkpoint_freq == 0 and checkpoint_dir_out is not None:
            if not os.path.exists(checkpoint_dir_out):
                os.makedirs(checkpoint_dir_out)
            pi.save(
                os.path.join(checkpoint_dir_out,
                             'checkpoint_%d' % iters_so_far))
            logger.log('Saved policy weights as %s' % os.path.join(
                checkpoint_dir_out, 'checkpoint_%d.npy' % iters_so_far))

            def pi_wrapper(ob):
                ac, vpred, _, _ = pi.step(ob, stochastic=True)
                return ac

            rew, _, logs, disc_rets, num_stops, avg_damages = eval_policy(
                pi=pi_wrapper, n_episodes=eval_episodes, verbose=True)
            offline_scores.append(
                [np.mean(disc_rets),
                 np.mean(num_stops),
                 np.mean(avg_damages)])
            np.save(os.path.join(checkpoint_dir_out, 'offline_scores.npy'),
                    offline_scores)
            for log in logs:
                log['iter'] = iters_eval
            all_logs = all_logs + logs

            iters_eval += 1

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update

        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        ep_rew_mean = np.mean(rewbuffer)
        online_scores.append(ep_rew_mean)
        np.save(os.path.join(checkpoint_dir_out, 'online_scores.npy'),
                online_scores)
        # Saving best
        if iters_so_far % checkpoint_freq == 0 and ep_rew_mean > best_rew and checkpoint_dir_out is not None:
            pi.save(os.path.join(checkpoint_dir_out, 'best'))
            best_rew = ep_rew_mean
            logger.log('Saved policy weights as %s' %
                       os.path.join(checkpoint_dir_out, 'best.npy'))

        if rank == 0:
            logger.dump_tabular()

    return pi
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        args):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy

    # Ops to reassign params from new to old
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    newprob = tf.exp(pi.pd.logp(ac))
    oldprob = tf.exp(oldpi.pd.logp(ac))

    ratio = newprob / oldprob

    kl = pi.pd.kl(oldpi.pd)
    mean_kl = tf.reduce_mean(kl)
    get_kl = U.function([ob, ac], kl)
    get_mean_kl = U.function([ob, ac], mean_kl)

    threshold = kl < args.kl_threshold
    threshold = tf.cast(threshold, tf.float32)

    pol_surr = (kl - ratio * atarg / args.sepg_lam) * threshold

    pol_surr = tf.reduce_mean(pol_surr)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])

    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    running_scores = []

    assert sum([
        max_iters > 0, args.num_timesteps > 0, max_episodes > 0,
        max_seconds > 0
    ]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if args.num_timesteps and timesteps_so_far >= args.num_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / args.num_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-8)  # standardized advantage function estimate

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)

        # Here we do a bunch of optimization epochs over the data
        for num_epoch in count():
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                g = np.nan_to_num(g)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

            agg_mean_kl = get_mean_kl(ob, ac)

            if agg_mean_kl > args.agg_kl_threshold or num_epoch == args.optim_epochs:
                break

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))

        rewbuffer.extend(rews)

        mean_score = None

        if rewbuffer:
            mean_score = np.mean(rewbuffer)
            running_scores.append((timesteps_so_far, mean_score))

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.record_tabular("EpRewMean", mean_score)
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            logger.record_tabular("NumEpoch", num_epoch)

            logger.dump_tabular()

    return running_scores
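
A hedged NumPy sketch (an addition; the values are illustrative only) of the per-sample surrogate built above: samples whose KL to the old policy exceeds args.kl_threshold are masked out, and the remaining ones trade off KL against the importance-weighted advantage scaled by args.sepg_lam.

import numpy as np

def masked_surrogate(kl, ratio, atarg, kl_threshold, sepg_lam):
    threshold = (kl < kl_threshold).astype(np.float32)
    return np.mean((kl - ratio * atarg / sepg_lam) * threshold)

kl = np.array([0.001, 0.05, 0.002])      # hypothetical per-sample KL values
ratio = np.array([1.1, 0.9, 1.0])        # pi(a|s) / pi_old(a|s)
atarg = np.array([0.5, -0.2, 0.1])       # standardized advantages
print(masked_surrogate(kl, ratio, atarg, kl_threshold=0.01, sepg_lam=0.1))
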
    def __init__(self, env, hidden_size, expert_dataset):
        self.hidden_size = hidden_size
        self.expert_dataset = expert_dataset
        with tf.variable_scope('guidance'):
            self.scope = tf.get_variable_scope().name

            self.agent_s = tf.placeholder(dtype=tf.float32,
                                          shape=[None] +
                                          list(env.observation_space.shape),
                                          name='ph_agent_s')
            self.agent_a = tf.placeholder(dtype=tf.float32,
                                          shape=[None] +
                                          list(env.action_space.shape),
                                          name='ph_agent_a')
            self.expert_a = tf.placeholder(dtype=tf.float32,
                                           shape=[None] +
                                           list(env.action_space.shape),
                                           name='ph_expert_a')

            with tf.variable_scope("obfilter"):
                self.obs_rms = RunningMeanStd(
                    shape=env.observation_space.shape)
            obs_ph_rms = (self.agent_s - self.obs_rms.mean) / self.obs_rms.std

            layer_s = tf.layers.dense(inputs=obs_ph_rms,
                                      units=self.hidden_size,
                                      activation=tf.nn.leaky_relu,
                                      name='layer_s')

            layer_a = tf.layers.dense(inputs=self.agent_a,
                                      units=self.hidden_size,
                                      activation=tf.nn.leaky_relu,
                                      name='layer_a')

            layer_s_a = tf.concat([layer_s, layer_a], axis=1)

            layer = tf.layers.dense(inputs=layer_s_a,
                                    units=self.hidden_size,
                                    activation=tf.nn.leaky_relu,
                                    name='layer1')

            output = tf.layers.dense(inputs=layer,
                                     units=env.action_space.shape[0],
                                     activation=tf.identity,
                                     name='layer2')

            ##########
            # BUG
            ##########
            # loss_func = tf.contrib.gan.losses.wargs.mutual_information_penalty
            labels = tf.nn.softmax(self.expert_a)
            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=labels,
                                                        logits=output))

            optimizer = tf.train.AdamOptimizer()
            self.train_op = optimizer.minimize(self.loss)

        self.loss_name = ["guidance_loss"]
        var_list = self.get_trainable_variables()
        self.lossandgrad = U.function(
            [self.agent_s, self.agent_a, self.expert_a],
            [self.loss] + [U.flatgrad(self.loss, var_list)])
    def __init__(self, env, hidden_size, expert_dataset):
        self.obs = expert_dataset.inputs
        self.acs = expert_dataset.labels
        with tf.variable_scope('guidance'):
            self.scope = tf.get_variable_scope().name

            self.agent_s = tf.placeholder(dtype=tf.float32,
                                          shape=[None] +
                                          list(env.observation_space.shape),
                                          name='ph_agent_s')
            self.agent_a = tf.placeholder(dtype=tf.int32,
                                          shape=[None],
                                          name='ph_agent_a')
            agent_a_one_hot = tf.one_hot(self.agent_a,
                                         depth=env.action_space.n)

            self.expert_a = tf.placeholder(dtype=tf.int32,
                                           shape=[None],
                                           name='ph_expert_a')
            expert_a_one_hot = tf.one_hot(self.expert_a,
                                          depth=env.action_space.n)

            with tf.variable_scope("obfilter"):
                self.obs_rms = RunningMeanStd(
                    shape=env.observation_space.shape)
            obs_ph_rms = (self.agent_s - self.obs_rms.mean) / self.obs_rms.std

            layer_s = tf.layers.dense(inputs=obs_ph_rms,
                                      units=hidden_size,
                                      activation=tf.nn.leaky_relu,
                                      name='layer_s')

            layer_a = tf.layers.dense(inputs=agent_a_one_hot,
                                      units=hidden_size,
                                      activation=tf.nn.leaky_relu,
                                      name='layer_a')

            layer_s_a = tf.concat([layer_s, layer_a], axis=1)

            layer = tf.layers.dense(inputs=layer_s_a,
                                    units=hidden_size,
                                    activation=tf.nn.leaky_relu,
                                    name='layer1')

            output = tf.layers.dense(inputs=layer,
                                     units=env.action_space.n,
                                     activation=tf.nn.softmax,
                                     name='layer2')

            loss = tf.keras.losses.categorical_crossentropy(
                y_true=expert_a_one_hot, y_pred=output)
            # loss = tf.nn.softmax_cross_entropy_with_logits(labels=expert_a_one_hot, logits=output)
            self.loss = tf.reduce_mean(loss)
            ##########
            # BUG
            ##########
            # loss_func = tf.contrib.gan.losses.wargs.mutual_information_penalty
            # self.loss = loss_func(structured_generator_inputs=output, predicted_distributions=expert_a_one_hot)

            optimizer = tf.train.AdamOptimizer()
            self.train_op = optimizer.minimize(self.loss)

        self.loss_name = ["guidance_loss"]
        var_list = self.get_trainable_variables()
        self.lossandgrad = U.function(
            [self.agent_s, self.agent_a, self.expert_a],
            [self.loss] + [U.flatgrad(self.loss, var_list)])
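
A NumPy sketch (added for illustration) of the categorical cross-entropy used above: with one-hot expert actions as targets and the softmax output as predictions, the per-sample loss is -log(p[expert_action]).

import numpy as np

def categorical_crossentropy(one_hot_targets, probs, eps=1e-8):
    return -np.sum(one_hot_targets * np.log(probs + eps), axis=1)

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1]])      # hypothetical softmax outputs
targets = np.eye(3)[[0, 2]]              # expert actions 0 and 2, one-hot
print(categorical_crossentropy(targets, probs))  # [-log 0.7, -log 0.1]
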
Example #12
    def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None,
                 sess=None,trainable_variance=True,init_logstd=0, clip=None, **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01,
                                                    trainable_variance=trainable_variance,
                                                    init_logstd=init_logstd,
                                                    clip=clip)

        self.stochastic = tf.placeholder(dtype=tf.bool, shape=())
        self.action = tf_util.switch(self.stochastic, self.pd.sample(), self.pd.mode())
        self.neglogp = self.pd.neglogp(self.action)
        self.logits=tf.nn.softmax(self.pd.flatparam())
        self.sess = sess
        self.prob = tf.nn.softmax(self.pd.flatparam())
        #out = tf.reduce_mean(tf.log(tf.reduce_sum(self.prob * action_selected, axis=1)))
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="pi/pi")
        if len(self.vars) == 0:
            self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="pi")
        self.set_from_flat = tf_util.SetFromFlat(self.vars)
        try:
            self.action_ph = tf.placeholder(tf.int64, [None], name='targets_placeholder')
            self.action_selected = action_selected = tf.one_hot(self.action_ph, env.action_space.n)
        #out = tf.reduce_sum(tf.reduce_sum(tf.log(self.logits+1e-5)*action_selected, axis=1))
            out = tf.reduce_mean(tf.log(tf.reduce_sum(self.prob*action_selected, axis=1)))
            gradients = tf.gradients(out, self.vars)
        except:
            self.action_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + env.action_space.shape,
                                            name='targets_placeholder')
            gradients = tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)
        #gradients = tf.gradients(out, self.vars)
        if gradients[0] is not None:
            flat_grad = tf_util.GetFlat(gradients).op
            self.compute_gradients = tf_util.function(
                inputs=[self.X, self.action_ph],
                outputs=[flat_grad]
            )
        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:,0]