Example No. 1
def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)],
                             updates=[update_op])
    adam = MpiAdam(var_list)

    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
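
Example No. 1 compares `MpiAdam` against `tf.train.AdamOptimizer` on the same loss. For reference, here is a minimal NumPy sketch of the Adam step that `adam.update(g, stepsize)` applies to the flattened parameters, leaving out the MPI gradient averaging; the beta/epsilon constants are the usual Adam defaults and are an assumption, not taken from the snippet above.

import numpy as np

def adam_step(theta, g, m, v, t, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """One Adam update on a flat parameter vector theta given a flat gradient g."""
    t += 1
    m = beta1 * m + (1 - beta1) * g           # first-moment (mean) estimate
    v = beta2 * v + (1 - beta2) * (g * g)     # second-moment estimate
    a = stepsize * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)  # bias correction
    theta = theta - a * m / (np.sqrt(v) + epsilon)
    return theta, m, v, t

# toy quadratic loss: f(theta) = 0.5 * ||theta||^2, gradient = theta
theta = np.random.randn(13).astype('float32')
m = np.zeros_like(theta); v = np.zeros_like(theta); t = 0
for i in range(10):
    g = theta                                 # gradient of the toy loss
    theta, m, v, t = adam_step(theta, g, m, v, t, stepsize=1e-2)
    print(i, 0.5 * float(np.dot(theta, theta)))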
Example No. 2
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdclass()(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = U.eval(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = -logliks.mean()  #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    entval = calcent(Mval).mean()  #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdclass()(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = -entval - logliks.mean()  #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
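
The first assertion above checks that the mean negative log-likelihood of samples matches the analytic differential entropy to within three standard errors. The same check can be reproduced outside TensorFlow; a small NumPy sketch for a unit Gaussian, where the distribution choice is an illustrative assumption:

import numpy as np

N = 100000
rng = np.random.RandomState(0)
x = rng.randn(N)                                   # samples from N(0, 1)

logliks = -0.5 * x ** 2 - 0.5 * np.log(2 * np.pi)  # log-density of N(0, 1)
entval_ll = -logliks.mean()                        # Monte Carlo estimate of the entropy
entval_ll_stderr = logliks.std() / np.sqrt(N)
entval = 0.5 * np.log(2 * np.pi * np.e)            # analytic differential entropy

assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas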
Example No. 3
 def __init__(self,
              env,
              hidden_size,
              sequence_size,
              attention_size,
              cell_type,
              entcoeff=0.001,
              lr_rate=0.0,
              scope="adversary"):
     self.scope = scope
     self.observation_shape = env.observation_space.shape
     self.action_shape = env.action_space.shape
     self.num_observations = self.observation_shape[0]
     self.num_actions = self.action_shape[0]
     self.embedding_size = self.num_observations + self.num_actions
     self.hidden_size = hidden_size
     self.sequence_size = sequence_size
     self.attention_size = attention_size
     self.cell_type = cell_type
     self.build_ph()
     #Build graph
     generator_logits, self.rewards_op = self.build_graph(
         self.generator_traj_ph, self.generator_traj_seq_len, reuse=False)
     expert_logits, _ = self.build_graph(self.expert_traj_ph,
                                         self.expert_traj_seq_len,
                                         reuse=True)
     # Build accuracy
     generator_acc = tf.reduce_mean(
         tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
     expert_acc = tf.reduce_mean(
         tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
     # Build regression loss
     # let x = logits, z = targets.
     # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
     generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
         logits=generator_logits, labels=tf.zeros_like(generator_logits))
     generator_loss = tf.reduce_mean(generator_loss)
     expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
         logits=expert_logits, labels=tf.ones_like(expert_logits))
     expert_loss = tf.reduce_mean(expert_loss)
     # Build entropy loss
     logits = tf.concat([generator_logits, expert_logits], 0)
     entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
     entropy_loss = -entcoeff * entropy
     # Loss + Accuracy terms
     self.losses = [
         generator_loss, expert_loss, entropy, entropy_loss, generator_acc,
         expert_acc
     ]
     self.loss_name = [
         "generator_loss", "expert_loss", "entropy", "entropy_loss",
         "generator_acc", "expert_acc"
     ]
     self.total_loss = generator_loss + expert_loss + entropy_loss
     var_list = self.get_trainable_variables()
     self.lossandgrad = U.function([
         self.generator_traj_ph, self.generator_traj_seq_len,
         self.expert_traj_ph, self.expert_traj_seq_len,
         self.dropout_keep_prob
     ], self.losses + [U.flatgrad(self.total_loss, var_list)])
Example No. 4
    def _init(self, ob_space, ac_space, kind):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        x = ob / 255.0
        if kind == 'small': # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
        elif kind == 'large': # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError

        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:,0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample() # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example No. 5
  def __init__(self,
               env,
               hidden_size,
               discriminatorStepSize=3e-4,
               entcoeff=0.001,
               scope="adversary"):
    self.scope = scope
    self.observation_shape = env.observation_space.shape
    self.actions_shape = env.action_space.shape
    self.input_shape = tuple([o+a for o,a in zip(self.observation_shape, self.actions_shape)])
    self.num_actions = env.action_space.shape[0]
    self.hidden_size = hidden_size
    self.discriminatorStepSize = discriminatorStepSize
    self.build_ph()
    # Build graph
    generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
    expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
    gl = self.get_trainable_variables()
    # Build accuracy
    generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
    expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
    # Build regression loss
    # let x = logits, z = targets.
    # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits))
    generator_loss = tf.reduce_mean(generator_loss)
    expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits))
    expert_loss = tf.reduce_mean(expert_loss)
    # Build entropy loss
    logits = tf.concat([generator_logits, expert_logits], 0)
    entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
    entropy_loss = -entcoeff*entropy
    # Loss + Accuracy terms
    self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
    self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"]
    self.total_loss = generator_loss + expert_loss + entropy_loss
    # Build Reward for policy
    self.reward_op = -tf.log(1-tf.nn.sigmoid(generator_logits)+1e-8)
    var_list = self.get_trainable_variables()
    self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph],
                         self.losses + [U.flatgrad(self.total_loss, var_list)])

    self.get_expert_logits = U.function([self.expert_obs_ph, self.expert_acs_ph], expert_logits)

    self.get_logits = U.function(
      [self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph],
      [expert_logits] + [generator_logits])
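
`logit_bernoulli_entropy` is used but not defined in these snippets. The sketch below shows the definition it is commonly given (the entropy of a Bernoulli with probability `sigmoid(logits)`, written in a numerically stable form); treat it as an assumption about the helper rather than its exact source.

import numpy as np

def softplus(x):
    # numerically stable log(1 + exp(x))
    return np.maximum(x, 0) + np.log1p(np.exp(-np.abs(x)))

def logit_bernoulli_entropy(logits):
    # H(p) with p = sigmoid(logits):
    #   H = -p*log(p) - (1-p)*log(1-p) = (1 - sigmoid(logits))*logits + softplus(-logits)
    return (1.0 - 1.0 / (1.0 + np.exp(-logits))) * logits + softplus(-logits)

logits = np.array([-3.0, 0.0, 3.0])
print(logit_bernoulli_entropy(logits))   # peaks at logits = 0, where p = 0.5 and H = ln 2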
Example No. 6
def learn(args,
          env,
          policy_func,
          dataset,
          optim_batch_size=128,
          adam_epsilon=1e-5,
          optim_stepsize=3e-4):

    # ============================== INIT FROM ARGS ==================================
    max_iters = args.BC_max_iter
    pretrained = args.pretrained
    ckpt_dir = args.checkpoint_dir
    log_dir = args.log_dir
    task_name = args.task_name

    val_per_iter = int(max_iters / 10)
    pi = policy_func(args, "pi", env)  # Construct network for new policy
    oldpi = policy_func(args, "oldpi", env)
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    if not pretrained:
        writer = U.FileWriter(log_dir)
        ep_stats = stats(["Loss"])
    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters + 1))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size,
                                                      'train')
        loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if not pretrained:
            ep_stats.add_all_summary(writer, [loss], iter_so_far)
        if iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            loss, g = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Validation:")
            logger.log("Loss: %f" % loss)
            if not pretrained:
                U.save_state(os.path.join(ckpt_dir, task_name),
                             counter=iter_so_far)
    if pretrained:
        savedir_fname = tempfile.TemporaryDirectory().name
        U.save_state(savedir_fname, max_to_keep=args.max_to_keep)
        return savedir_fname
Example No. 7
def learn(env,
          policy_func,
          dataset,
          pretrained,
          optim_batch_size=128,
          max_iters=1e4,
          adam_epsilon=1e-5,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          log_dir=None,
          task_name=None):
    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))  # mean squared difference between expert actions and policy actions
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])
    # inputs: observation, action, and stochastic flag (bool); outputs: the loss (mean squared difference between expert and policy actions) and its gradient

    if not pretrained:
        writer = U.FileWriter(log_dir)
        ep_stats = stats(["Loss"])
    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size,
                                                      'train')
        loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if not pretrained:
            ep_stats.add_all_summary(writer, [loss], iter_so_far)
        if iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            loss, g = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Validation:")
            logger.log("Loss: %f" % loss)
            if not pretrained:
                U.save_state(os.path.join(ckpt_dir, task_name),
                             counter=iter_so_far)
    if pretrained:
        savedir_fname = tempfile.TemporaryDirectory().name
        U.save_state(savedir_fname, var_list=pi.get_variables())
        return savedir_fname
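
The core of the behavior-cloning loop above is mean-squared-error regression from expert observations to expert actions. A self-contained NumPy sketch with a linear policy and plain gradient descent; the synthetic data, linear model, and step size are assumptions made for illustration.

import numpy as np

rng = np.random.RandomState(0)
ob_expert = rng.randn(1000, 4)              # expert observations
W_true = rng.randn(4, 2)
ac_expert = ob_expert @ W_true              # expert actions (noiseless, for clarity)

W = np.zeros((4, 2))                        # policy parameters
stepsize = 1e-1
for it in range(500):
    ac_pi = ob_expert @ W                   # policy actions
    err = ac_pi - ac_expert
    loss = np.mean(np.square(err))          # same form as tf.reduce_mean(tf.square(ac - pi.ac))
    grad = 2.0 * ob_expert.T @ err / err.size
    W -= stepsize * grad
    if it % 100 == 0:
        print(it, loss)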
Example No. 8
def test_function():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    z = 3 * x + 2 * y
    lin = function([x, y], z, givens={y: 0})

    with single_threaded_session():
        initialize()

        assert lin(2) == 6
        assert lin(x=3) == 9
        assert lin(2, 2) == 10
        assert lin(x=2, y=3) == 12
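
For readers unfamiliar with `function`, here is a minimal TF1 sketch of a wrapper with the same call semantics: positional or keyword placeholder feeds, with `givens` supplying defaults. This is an illustrative re-implementation, not the actual baselines helper.

import tensorflow as tf

def make_function(inputs, output, givens=None):
    """Return f(*args, **kwargs) that feeds `inputs` and evaluates `output`."""
    givens = dict(givens or {})

    def call(*args, **kwargs):
        feed = dict(givens)                    # defaults from givens first
        feed.update(zip(inputs, args))         # positional arguments
        for name, value in kwargs.items():     # keyword arguments by placeholder name
            ph = [p for p in inputs if p.op.name == name]
            assert len(ph) == 1, "ambiguous or unknown placeholder %r" % name
            feed[ph[0]] = value
        return tf.get_default_session().run(output, feed_dict=feed)

    return call

tf.reset_default_graph()
x = tf.placeholder(tf.int32, (), name="x")
y = tf.placeholder(tf.int32, (), name="y")
z = 3 * x + 2 * y
lin = make_function([x, y], z, givens={y: 0})

with tf.Session():
    assert lin(2) == 6
    assert lin(x=3) == 9
    assert lin(2, 2) == 10
    assert lin(x=2, y=3) == 12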
Example No. 9
def test_multikwargs():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    z = 3 * x + 2 * x2

    lin = function([x, x2], z, givens={x2: 0})
    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(2, 2) == 10
        expt_caught = False
        try:
            lin(x=2)
        except AssertionError:
            expt_caught = True
        assert expt_caught
Example No. 10
    def __init__(self, epsilon=1e-2, shape=()):

        self._sum = tf.get_variable(dtype=tf.float64,
                                    shape=shape,
                                    initializer=tf.constant_initializer(0.0),
                                    name="runningsum",
                                    trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq",
            trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count",
            trainable=False
        )  # _count starts at epsilon (0.01); an empty shape () means it is a scalar

        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        #print ("this is self.mean", self.mean)
        self.std = tf.sqrt(
            tf.maximum(
                tf.to_float(self._sumsq / self._count) - tf.square(self.mean),
                1e-2))  # flooring the variance at 0.01 keeps the whole std above 0.1
        #print ("this is self.std", self.std)

        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape,
                                  dtype=tf.float64,
                                  name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        self.incfiltparams = U.function(
            [newsum, newsumsq, newcount], [],
            updates=[
                tf.assign_add(self._sum, newsum),
                tf.assign_add(self._sumsq, newsumsq),
                tf.assign_add(self._count, newcount)
            ]
        )  # tf.assign_add(a, b) adds the second value onto the first (the variable)
Example No. 11
    def __init__(self, epsilon=1e-2, shape=()):

        self._sum = tf.get_variable(dtype=tf.float64,
                                    shape=shape,
                                    initializer=tf.constant_initializer(0.0),
                                    name="runningsum",
                                    trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq",
            trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count",
            trainable=False)
        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        self.std = tf.sqrt(
            tf.maximum(
                tf.to_float(self._sumsq / self._count) - tf.square(self.mean),
                1e-2))

        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape,
                                  dtype=tf.float64,
                                  name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        self.incfiltparams = U.function(
            [newsum, newsumsq, newcount], [],
            updates=[
                tf.assign_add(self._sum, newsum),
                tf.assign_add(self._sumsq, newsumsq),
                tf.assign_add(self._count, newcount)
            ])
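
A NumPy sketch of the bookkeeping behind `incfiltparams`: the caller accumulates per-batch sum, sum of squares, and count (optionally summed across MPI workers), and mean/std are recovered exactly as in the graph above, with the same 1e-2 floor under the variance. The `update` method here is a hypothetical stand-in for whatever code calls `incfiltparams`.

import numpy as np

class RunningMeanStdNp:
    def __init__(self, epsilon=1e-2, shape=()):
        self._sum = np.zeros(shape, dtype=np.float64)
        self._sumsq = np.full(shape, epsilon, dtype=np.float64)
        self._count = epsilon

    def update(self, x):
        # x has shape (batch, *shape); mirrors incfiltparams(newsum, newsumsq, newcount)
        self._sum += x.sum(axis=0)
        self._sumsq += np.square(x).sum(axis=0)
        self._count += x.shape[0]

    @property
    def mean(self):
        return self._sum / self._count

    @property
    def std(self):
        return np.sqrt(np.maximum(self._sumsq / self._count - np.square(self.mean), 1e-2))

rms = RunningMeanStdNp(shape=(3,))
rms.update(np.random.randn(1000, 3) * 2.0 + 5.0)
print(rms.mean, rms.std)   # approaches [5, 5, 5] and [2, 2, 2]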
Example No. 12
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        #stochastic = tf.placeholder(dtype=tf.bool, shape=())
        stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])
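
In the `gaussian_fixed_var` branch, `pdparam` is the mean head concatenated with a state-independent `logstd`. A NumPy sketch of how the resulting diagonal Gaussian samples (stochastic) or returns its mode (deterministic), which is what `U.switch(stochastic, self.pd.sample(), self.pd.mode())` selects between; the toy parameter values are assumptions.

import numpy as np

def split_pdparam(pdparam):
    # pdparam = [mean, logstd] concatenated along the last axis
    mean, logstd = np.split(pdparam, 2, axis=-1)
    return mean, logstd

def sample(pdparam, rng=np.random):
    mean, logstd = split_pdparam(pdparam)
    return mean + np.exp(logstd) * rng.standard_normal(mean.shape)

def mode(pdparam):
    mean, _ = split_pdparam(pdparam)
    return mean

pdparam = np.array([[0.5, -0.2, np.log(0.1), np.log(0.1)]])  # 2-D action space
print(mode(pdparam))     # deterministic action: the mean
print(sample(pdparam))   # stochastic action: mean + std * noise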
Example No. 13
    def _init(self, ob_space, ac_space):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_proba_dist_type(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        obscaled = ob / 255.0

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            logits = U.dense(x,
                             pdtype.param_shape()[0], "logits",
                             U.normc_initializer(0.01))
            self.pd = pdtype.proba_distribution_from_flat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = self.pd.sample()  # XXX
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Example No. 14
    def __init__(self,
                 env,
                 hidden_size,
                 discriminatorStepSize=3e-4,
                 entcoeff=0.001,
                 scope="adversary"):

        global old_gen_loss, old_exp_loss
        print("Init Wasserstein discriminator")
        self.scope = scope
        self.observation_shape = env.observation_space.shape
        self.actions_shape = env.action_space.shape
        self.input_shape = tuple([
            o + a for o, a in zip(self.observation_shape, self.actions_shape)
        ])

        self.num_actions = env.action_space.n if isinstance(
            env.action_space, Discrete) else env.action_space.shape[0]

        self.hidden_size = hidden_size
        self.discriminatorStepSize = discriminatorStepSize

        # PLACEHOLDERS
        self.generator_obs_ph = tf.placeholder(tf.float32, (None, ) +
                                               self.observation_shape,
                                               name="observations_ph")
        self.generator_acs_ph = tf.placeholder(tf.float32,
                                               (None, ) + self.actions_shape,
                                               name="actions_ph")
        self.expert_obs_ph = tf.placeholder(tf.float32,
                                            (None, ) + self.observation_shape,
                                            name="expert_observations_ph")
        self.expert_acs_ph = tf.placeholder(tf.float32,
                                            (None, ) + self.actions_shape,
                                            name="expert_actions_ph")
        # Build graph
        gen_logits = self.build_graph(self.generator_obs_ph,
                                      self.generator_acs_ph,
                                      reuse=False)
        exp_logits = self.build_graph(self.expert_obs_ph,
                                      self.expert_acs_ph,
                                      reuse=True)
        # Build accuracy
        generator_acc = tf.reduce_mean(
            tf.to_float(tf.nn.sigmoid(gen_logits) < 0.5))
        expert_acc = tf.reduce_mean(
            tf.to_float(tf.nn.sigmoid(exp_logits) > 0.5))

        # regression losses to control progress:
        old_gen_loss = regression_loss(gen_logits)
        old_exp_loss = regression_loss(exp_logits)

        # NR1. Use Wasserstein loss
        discriminator_loss = tf.contrib.gan.losses.wargs.wasserstein_discriminator_loss(
            exp_logits, gen_logits)
        # --- not sure about this loss function, but it doesn't take part in calculations:
        generator_loss = -tf.reduce_mean(gen_logits)

        # Build entropy loss
        logits = tf.concat([gen_logits, exp_logits], 0)
        entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
        entropy_loss = -entcoeff * entropy
        # Loss + Accuracy terms
        self.losses = [
            generator_loss, discriminator_loss, old_gen_loss, old_exp_loss,
            entropy, entropy_loss, generator_acc, expert_acc,
            discriminator_loss + entropy_loss
        ]
        self.loss_name = [
            "gen_loss", "disc_loss", "old_gen_loss", "old_exp_loss", "entropy",
            "entropy_loss", "generator_acc", "expert_acc", "total_loss"
        ]

        self.total_loss = discriminator_loss + entropy_loss
        # Build Reward for policy
        self.reward_op = -tf.log(1 - tf.nn.sigmoid(gen_logits) + 1e-8)

        # NR2. Use RMSPropOptimizer
        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=discriminatorStepSize).minimize(
                self.total_loss, var_list=self.get_trainable_variables())

        # NR3. Clip weights in range [-.01, .01]
        clip_ops = []
        for var in self.get_trainable_variables():
            clip_bounds = [-.01, .01]
            clip_ops.append(
                tf.assign(
                    var, tf.clip_by_value(var, clip_bounds[0],
                                          clip_bounds[1])))
        self.clip_disc_weights = tf.group(*clip_ops)

        self.dict = [
            self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph,
            self.expert_acs_ph
        ]

        # ================================ FUNCTIONS =====================================
        self.disc_train_op = U.function(self.dict, self.optimizer)

        self.losses = U.function(self.dict, self.losses)

        self.get_expert_logits = U.function(
            [self.expert_obs_ph, self.expert_acs_ph], exp_logits)

        self.get_logits = U.function(self.dict, [exp_logits] + [gen_logits])

        self.clip = U.function(self.dict, self.clip_disc_weights)
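
A NumPy sketch of the two WGAN-specific ingredients used above: the critic objective, which essentially reduces to mean(D(generator)) - mean(D(expert)), and the [-0.01, 0.01] weight clipping that crudely enforces the Lipschitz constraint. The toy critic outputs below are assumptions for illustration.

import numpy as np

def critic_loss(gen_logits, exp_logits):
    # Wasserstein critic objective: push expert scores up, generator scores down
    return np.mean(gen_logits) - np.mean(exp_logits)

def clip_weights(params, bound=0.01):
    # weight clipping as in the original WGAN recipe
    return [np.clip(w, -bound, bound) for w in params]

rng = np.random.RandomState(0)
gen_logits = rng.randn(64) - 1.0     # toy critic scores on generator trajectories
exp_logits = rng.randn(64) + 1.0     # toy critic scores on expert trajectories
print("critic loss:", critic_loss(gen_logits, exp_logits))

params = clip_weights([rng.randn(4, 4), rng.randn(4)])
print("max |w| after clipping:", max(np.abs(w).max() for w in params))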
Example No. 15
def learn(env,
          policy_func,
          dataset,
          pretrained,
          optim_batch_size=128,
          max_iters=1e3,
          adam_epsilon=1e-6,
          optim_stepsize=2e-4,
          ckpt_dir=None,
          log_dir=None,
          task_name=None,
          high_level=False):
    val_per_iter = int(max_iters / 100)
    ob_space = env.observation_space
    ac_space = env.action_space
    start_time = time.time()
    if not high_level:

        pi_low = policy_func("pi_low", ob_space, ac_space.spaces[1])

        # placeholder
        # ob_low = U.get_placeholder_cached(name="ob")
        ob_low = pi_low.ob
        ac_low = pi_low.pdtype.sample_placeholder([None])
        # stochastic_low = U.get_placeholder_cached(name="stochastic")
        stochastic_low = pi_low.stochastic
        loss_low = tf.reduce_mean(tf.square(ac_low - pi_low.ac))
        var_list_low = pi_low.get_trainable_variables()
        adam_low = MpiAdam(var_list_low, epsilon=adam_epsilon)
        lossandgrad_low = U.function([ob_low, ac_low, stochastic_low],
                                     [loss_low] +
                                     [U.flatgrad(loss_low, var_list_low)])

        if not pretrained:
            writer = U.FileWriter(log_dir)
            ep_stats_low = stats(["Loss_low"])
        U.initialize()
        adam_low.sync()
        logger.log("Pretraining with Behavior Cloning Low...")
        for iter_so_far in tqdm(range(int(max_iters))):

            ob_expert, ac_expert = dataset.get_next_batch(
                optim_batch_size, 'train', high_level)
            loss, g = lossandgrad_low(ob_expert, ac_expert, True)
            adam_low.update(g, optim_stepsize)
            if not pretrained:
                ep_stats_low.add_all_summary(writer, [loss], iter_so_far)
            if iter_so_far % val_per_iter == 0:
                ob_expert, ac_expert = dataset.get_next_batch(
                    -1, 'val', high_level)
                loss, g = lossandgrad_low(ob_expert, ac_expert, False)
                logger.log("Validation:")
                logger.log("Loss: %f" % loss)
                if not pretrained:
                    U.save_state(os.path.join(ckpt_dir, task_name),
                                 counter=iter_so_far)

        if pretrained:
            savedir_fname = tempfile.TemporaryDirectory().name
            U.save_state(savedir_fname, var_list=pi_low.get_variables())
            return savedir_fname

    else:
        pi_high = policy_func("pi_high", ob_space,
                              ac_space.spaces[0])  # high -> action_label
        # ob_high = U.get_placeholder_cached(name="ob")
        ob_high = pi_high.ob
        ac_high = pi_high.pdtype.sample_placeholder([None, 1])
        onehot_labels = tf.one_hot(indices=tf.cast(ac_high, tf.int32), depth=3)
        # stochastic_high = U.get_placeholder_cached(name="stochastic")
        stochastic_high = pi_high.stochastic
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=pi_high.logits, labels=onehot_labels)
        loss_high = tf.reduce_mean(cross_entropy)
        var_list_high = pi_high.get_trainable_variables()
        adam_high = MpiAdam(var_list_high, epsilon=adam_epsilon)
        lossandgrad_high = U.function([ob_high, ac_high, stochastic_high],
                                      [loss_high] +
                                      [U.flatgrad(loss_high, var_list_high)])

        # train high level policy
        if not pretrained:
            writer = U.FileWriter(log_dir)
            # ep_stats_low = stats(["Loss_low"])
            ep_stats_high = stats(["loss_high"])
        U.initialize()
        adam_high.sync()
        logger.log("Pretraining with Behavior Cloning High...")
        for iter_so_far in tqdm(range(int(max_iters))):

            ob_expert, ac_expert = dataset.get_next_batch(
                optim_batch_size, 'train', high_level)
            loss, g = lossandgrad_high(ob_expert, ac_expert, True)
            adam_high.update(g, optim_stepsize)
            if not pretrained:
                ep_stats_high.add_all_summary(writer, [loss], iter_so_far)
            if iter_so_far % val_per_iter == 0:
                ob_expert, ac_expert = dataset.get_next_batch(
                    -1, 'val', high_level)
                loss, g = lossandgrad_high(ob_expert, ac_expert, False)
                logger.log("Validation:")
                logger.log("Loss: %f" % loss)
                if not pretrained:
                    U.save_state(os.path.join(ckpt_dir, task_name),
                                 counter=iter_so_far)
        if pretrained:
            savedir_fname = tempfile.TemporaryDirectory().name
            U.save_state(savedir_fname, var_list=pi_high.get_variables())
            return savedir_fname

    print("--- %s seconds ---" % (time.time() - start_time))
Example No. 16
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                               -20.0, 20.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # last_out = obz
        # for i in range(num_hid_layers):
        #     last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))
        ### add conv net instead of using dense
        self.msize = 64  # change to 64 later
        self.ssize = 64
        self.isize = 11
        self.available_action_size = 524
        minimap = obz[:, 0:5 * self.msize * self.msize]
        screen = obz[:,
                     5 * self.msize * self.msize:5 * self.msize * self.msize +
                     10 * self.ssize * self.ssize]
        info = obz[:, (5 * self.msize * self.msize +
                       10 * self.ssize * self.ssize):(
                           5 * self.msize * self.msize +
                           10 * self.ssize * self.ssize + self.isize)]
        available_action = obz[:, (5 * self.msize * self.msize +
                                   10 * self.ssize * self.ssize +
                                   self.isize):(5 * self.msize * self.msize +
                                                10 * self.ssize * self.ssize +
                                                self.isize +
                                                self.available_action_size)]

        conv1_minimap = tf.layers.conv2d(inputs=tf.reshape(
            minimap, [-1, self.msize, self.msize, 5]),
                                         filters=10,
                                         kernel_size=5,
                                         strides=1,
                                         padding='same',
                                         activation=tf.nn.leaky_relu,
                                         name="polmconv1")  # -> (64, 64, 10)
        pool1_minimap = tf.layers.max_pooling2d(
            conv1_minimap, pool_size=4, strides=4,
            name="polmpool1")  # -> (16, 16, 10)
        conv2_minimap = tf.layers.conv2d(pool1_minimap,
                                         10,
                                         5,
                                         1,
                                         'same',
                                         activation=tf.nn.relu,
                                         name="polmconv2")  # -> (16, 16, 10)
        pool2_minimap = tf.layers.max_pooling2d(
            conv2_minimap, 2, 2, name="polmpool2")  # -> (8, 8, 10)
        flat_minimap = tf.reshape(pool2_minimap,
                                  [-1, 8 * 8 * 10])  # -> (8*8*10, )
        # dense_minimap = tf.layers.dense(inputs=flat_minimap, units=1024, activation=tf.nn.relu)
        # # dropout_mininmap = tf.layers.dropout(
        # #     inputs=dense_minimap, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
        # minimap_output = tf.layers.dense(dense_minimap, 64)

        conv1_screen = tf.layers.conv2d(
            inputs=tf.reshape(screen,
                              [-1, self.ssize, self.ssize, 10]),  # (64,64,10)
            filters=20,
            kernel_size=5,
            strides=1,
            padding='same',
            activation=tf.nn.leaky_relu,
            name="polsconv1")  # -> (64, 64, 20)
        pool1_screen = tf.layers.max_pooling2d(
            conv1_screen, pool_size=4, strides=4,
            name="polspool1")  # -> (16, 16, 20)
        conv2_screen = tf.layers.conv2d(pool1_screen,
                                        20,
                                        5,
                                        1,
                                        'same',
                                        activation=tf.nn.relu,
                                        name="polsconv2")  # -> (16, 16, 20)
        pool2_screen = tf.layers.max_pooling2d(
            conv2_screen, 2, 2, name="polspool2")  # -> (8, 8, 20)
        flat_screen = tf.reshape(pool2_screen,
                                 [-1, 8 * 8 * 20])  # -> (8*8*20, )
        # dense_screen = tf.layers.dense(inputs=flat_screen, units=1024, activation=tf.nn.relu)
        # # dropout_screen = tf.layers.dropout(
        # #     inputs=dense_screen, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
        # screen_output = tf.layers.dense(dense_screen, 64, tf.nn.relu)

        info_fc = tf.layers.dense(inputs=layers.flatten(info),
                                  units=4,
                                  activation=tf.tanh,
                                  name="poldense1")

        aa_fc = tf.layers.dense(inputs=layers.flatten(available_action),
                                units=16,
                                activation=tf.tanh,
                                name="poldense2")

        last_out = tf.concat([flat_minimap, flat_screen, info_fc, aa_fc],
                             axis=1,
                             name="polconcat")
        # last_out = tf.layers.dense(inputs=last_out,units=600,name="poldense3")
        # last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc1", weight_init=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        #stochastic = tf.placeholder(dtype=tf.bool, shape=())
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])
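
The long index arithmetic above slices one flat observation vector into minimap, screen, info, and available-action segments. A NumPy sketch that computes the same offsets once (using the constants from the code) and checks that the slices tile the whole vector:

import numpy as np

msize, ssize, isize, aa_size = 64, 64, 11, 524
sizes = {
    "minimap": 5 * msize * msize,        # 20480
    "screen": 10 * ssize * ssize,        # 40960
    "info": isize,                       # 11
    "available_action": aa_size,         # 524
}

offsets, start = {}, 0
for name, size in sizes.items():
    offsets[name] = (start, start + size)
    start += size

obs = np.arange(start, dtype=np.float32)[None, :]   # one flat observation of full length
minimap = obs[:, slice(*offsets["minimap"])].reshape(-1, msize, msize, 5)
screen = obs[:, slice(*offsets["screen"])].reshape(-1, ssize, ssize, 10)
info = obs[:, slice(*offsets["info"])]
available_action = obs[:, slice(*offsets["available_action"])]

assert sum(sizes.values()) == obs.shape[1]          # the slices cover the observation exactly
print(minimap.shape, screen.shape, info.shape, available_action.shape)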
Example No. 17
def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        pretrained,
        pretrained_weight,
        *,
        g_step,
        d_step,
        episodes_per_batch,  # what to train on
        dropout_keep_prob,
        sequence_size,  #rnn parameters
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        d_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        save_per_iter=100,
        ckpt_dir=None,
        log_dir=None,
        load_model_path=None,
        task_name=None):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight != None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n(
        [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     discriminator,
                                     episodes_per_batch,
                                     stochastic=True,
                                     seq_length=sequence_size)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if a pretrained weight is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())
    # if a model path is provided
    if load_model_path is not None:
        U.load_state(load_model_path)

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before udpate
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        traj_gen, traj_len_gen = seg["ep_trajs"], seg["ep_lens"]
        #traj_expert, traj_len_expert = expert_dataset.get_next_traj_batch()
        batch_size = len(traj_gen) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for traj_batch, traj_len_batch in dataset.iterbatches(
            (traj_gen, traj_len_gen),
                include_final_partial_batch=False,
                batch_size=batch_size):
            traj_expert, traj_len_expert = expert_dataset.get_next_traj_batch(
                len(traj_batch))
            # update running mean/std for discriminator
            ob_batch, _ = traj2trans(traj_batch, traj_len_batch,
                                     ob_space.shape[0])
            ob_expert, _ = traj2trans(traj_expert, traj_len_expert,
                                      ob_space.shape[0])
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(traj_batch,
                                                      traj_len_batch,
                                                      traj_expert,
                                                      traj_len_expert,
                                                      dropout_keep_prob)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0),
                                         iters_so_far)
            ep_stats.add_all_summary(writer, [
                np.mean(true_rewbuffer),
                np.mean(rewbuffer),
                np.mean(lenbuffer)
            ], iters_so_far)
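
The `cg(fisher_vector_product, g, cg_iters=...)` call above solves the linear system F x = g using only (damped) Fisher-vector products. A self-contained NumPy sketch of a standard conjugate-gradient solver of that kind, with an explicit positive-definite matrix standing in for the implicit Fisher matrix:

import numpy as np

def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b where A is only available through the product f_Ax(v) = A @ v."""
    x = np.zeros_like(b)
    r = b.copy()                 # residual b - A x (x starts at 0)
    p = b.copy()                 # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = f_Ax(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        newrdotr = r.dot(r)
        p = r + (newrdotr / rdotr) * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x

rng = np.random.RandomState(0)
M = rng.randn(8, 8)
A = M @ M.T + 1e-1 * np.eye(8)   # symmetric positive definite, like a damped Fisher matrix
g = rng.randn(8)
x = cg(lambda v: A @ v, g, cg_iters=50)
print(np.max(np.abs(A @ x - g)))  # close to zero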
Example No. 18
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        last_action = U.get_placeholder(shape=(None, 524), dtype=tf.float32, name="last_action_one_hot")
        self.msize = 64 # change to 64 later
        self.ssize = 64 
        self.isize = 11
        self.available_action_size = 524

        available_action = ob[:, (5*self.msize*self.msize+10*self.ssize*self.ssize+self.isize):(5*self.msize*self.msize+10*self.ssize*self.ssize+self.isize+self.available_action_size)]
        # ob = ob[:,:-(self.available_action_size)]

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -20.0, 20.0)
        obz = (ob - self.ob_rms.mean) / self.ob_rms.std

        minimap = obz[:, 0:5*self.msize*self.msize]
        # minimap /= 2
        screen = obz[:, 5*self.msize*self.msize: 5*self.msize*self.msize+ 10*self.ssize*self.ssize]
        # screen /= 2
        info = obz[:, (5*self.msize*self.msize+10*self.ssize*self.ssize):(5*self.msize*self.msize+10*self.ssize*self.ssize+self.isize)]
        # info /= 2


        # get value prediction (critic)
        mconv1 = tf.layers.conv2d(
            inputs=tf.reshape(minimap, [-1,self.msize,self.msize,5]),
            filters=32,
            kernel_size=[5, 5],
            padding="same",
            kernel_initializer=U.normc_initializer(0.01),
            activation=tf.nn.leaky_relu)
        mpool1 = tf.layers.max_pooling2d(inputs=mconv1, pool_size=[2, 2], strides=2)
        mconv2 = tf.layers.conv2d(
            inputs=mpool1,
            filters=64,
            kernel_size=[5, 5],
            padding="same",
            kernel_initializer=U.normc_initializer(0.01),
            activation=tf.nn.leaky_relu,
            name="vffcmconv2")
        mpool2 = tf.layers.max_pooling2d(inputs=mconv2, pool_size=[2, 2], strides=2)
        mpool2_flat = tf.reshape(mpool2, [-1, 16 * 16 * 64])

        sconv1 = tf.layers.conv2d(
            inputs=tf.reshape(screen, [-1,self.ssize, self.ssize,10]),
            filters=48,
            kernel_size=[5, 5],
            padding="same",
            kernel_initializer=U.normc_initializer(0.01),
            activation=tf.nn.leaky_relu)
        spool1 = tf.layers.max_pooling2d(inputs=sconv1, pool_size=[2, 2], strides=2)
        sconv2 = tf.layers.conv2d(
            inputs=spool1,
            filters=80,
            kernel_size=[5, 5],
            padding="same",
            kernel_initializer=U.normc_initializer(0.01),
            activation=tf.nn.leaky_relu)
        spool2 = tf.layers.max_pooling2d(inputs=sconv2, pool_size=[2, 2], strides=2)
        spool2_flat = tf.reshape(spool2, [-1, 16 * 16 * 80])

        info_fc = tf.layers.dense(inputs=layers.flatten(info),
                   units=8,
                   activation=tf.tanh)
        
        aa_fc = tf.layers.dense(inputs=layers.flatten(available_action),
                   units=32,
                   activation=tf.tanh)

        HIDDEN_SIZE = 128
        l1_action = tf.layers.dense(layers.flatten(last_action), 256, tf.nn.relu)
        input_to_rnn = tf.reshape(l1_action, [-1, 16, 16])
        action_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=HIDDEN_SIZE, 
            forget_bias=1.0, state_is_tuple=True)
        inputs_rnn = tf.unstack(input_to_rnn, num=16, axis=1)
        rnn_outputs,rnn_state= tf.contrib.rnn.static_rnn(action_lstm_cell,
            inputs_rnn, dtype=tf.float32)
        l2_action = tf.layers.dense(rnn_state[-1], 
            128, tf.nn.tanh)          # hidden layer
        last_acs_ph_lstm = tf.layers.dense(l2_action, 
            32, tf.nn.tanh)

        last_out = tf.concat([mpool2_flat, spool2_flat, info_fc, aa_fc, last_acs_ph_lstm], 
            axis=1)
        vf_last_out = tf.nn.tanh(U.dense(last_out, 1024, 'vf_last_out',
            weight_init=U.normc_initializer(1.0)))
        # vf_last_out_2 = tf.nn.tanh(U.dense(vf_last_out, 64, 'vf_last_out_2',
        #     weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(vf_last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0]

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pol_last_out = U.dense(last_out, (pdtype.param_shape()[0])*5, "polfinaldense", U.normc_initializer(0.01))
            pdparam = U.dense(pol_last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC
        #stochastic = tf.placeholder(dtype=tf.bool, shape=())
        stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(available_action), self.pd.mode(available_action))
        self.ac = ac
        self._act = U.function([stochastic, ob, last_action], [ac, self.vpred])
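
# A minimal, self-contained sketch (not part of the policy above) checking the
# flatten sizes used for mpool2_flat and spool2_flat: with msize = ssize = 64,
# the "same"-padded 5x5 convs keep the spatial size and each 2x2 / stride-2
# max pool halves it, so two pooling stages give 64 -> 32 -> 16 and the
# reshapes to 16*16*64 and 16*16*80 are consistent. Pure-Python arithmetic,
# no TensorFlow assumed.
def _flat_size(side, n_pools, channels):
    # "same" convolutions preserve the spatial side; each pool halves it.
    for _ in range(n_pools):
        side //= 2
    return side * side * channels

assert _flat_size(64, 2, 64) == 16 * 16 * 64   # minimap branch (mpool2_flat)
assert _flat_size(64, 2, 80) == 16 * 16 * 80   # screen branch (spool2_flat)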
Ejemplo n.º 19
0
    def __init__(self,
                 hidden_size,
                 entcoeff=0.001,
                 lr_rate=1e-3,
                 scope="adversary"):
        self.scope = scope
        # self.observation_shape = env.observation_space.shape
        # self.actions_shape = env.action_space.shape

        # print('~~~~~~~~~~', self.observation_shape, self.actions_space)
        self.msize = 64  # minimap resolution
        self.ssize = 64
        self.isize = 11
        self.available_action_size = 524
        from gym import spaces
        self.ob_space = spaces.Box(
            low=-1000,
            high=10000,
            shape=(5 * self.msize * self.msize + 10 * self.ssize * self.ssize +
                   self.isize + self.available_action_size, ))
        self.ac_space = spaces.Discrete(self.available_action_size)
        self.observation_shape = self.ob_space.shape
        self.actions_shape = self.ac_space.shape
        self.hidden_size = hidden_size

        self.build_ph()

        # Build graph
        generator_logits = self.build_graph(self.generator_obs_ph,
                                            self.generator_acs_ph,
                                            self.generator_last_action_ph,
                                            reuse=False)
        expert_logits = self.build_graph(self.expert_obs_ph,
                                         self.expert_acs_ph,
                                         self.expert_last_action_ph,
                                         reuse=True)
        # Build accuracy
        generator_acc = tf.reduce_mean(
            tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
        self.generator_acc = generator_acc
        expert_acc = tf.reduce_mean(
            tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
        self.expert_acc = expert_acc
        # Build regression loss
        # let x = logits, z = targets.
        # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=generator_logits, labels=tf.zeros_like(generator_logits))
        generator_loss = tf.reduce_mean(generator_loss)
        expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=expert_logits, labels=tf.ones_like(expert_logits))
        expert_loss = tf.reduce_mean(expert_loss)
        # Build entropy loss
        logits = tf.concat([generator_logits, expert_logits], 0)
        entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
        entropy_loss = -entcoeff * entropy
        # Loss + Accuracy terms
        self.losses = [
            generator_loss, expert_loss, entropy, entropy_loss, generator_acc,
            expert_acc
        ]
        self.loss_name = [
            "generator_loss", "expert_loss", "entropy", "entropy_loss",
            "generator_acc", "expert_acc"
        ]
        self.total_loss = generator_loss + expert_loss + entropy_loss
        # Build Reward for policy
        # Scale the reward up: the network is large and a small reward signal may vanish during training.
        # Add generator_loss so that low-confidence logits (e.g. 0.4 vs 0.1) are not treated as effectively identical.
        self.reward_op = 100 * (
            -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) +
            generator_loss)
        var_list = self.get_trainable_variables()
        self.lossandgrad = U.function([
            self.generator_obs_ph, self.generator_acs_ph,
            self.generator_last_action_ph, self.expert_obs_ph,
            self.expert_acs_ph, self.expert_last_action_ph
        ], self.losses + [U.flatgrad(self.total_loss, var_list)])
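
# A hedged numpy sketch of the reward shaping built in self.reward_op above:
#   reward = 100 * (-log(1 - sigmoid(logits) + 1e-8) + generator_loss)
# It only illustrates that the reward grows monotonically as the discriminator
# scores a generator sample as more expert-like; the generator_loss value used
# below is an arbitrary stand-in, not taken from the code above.
import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def _gail_reward(logits, generator_loss):
    return 100.0 * (-np.log(1.0 - _sigmoid(logits) + 1e-8) + generator_loss)

_logits = np.array([-2.0, 0.0, 2.0])              # discriminator logits for generator samples
print(_gail_reward(_logits, generator_loss=0.7))  # increases with the logits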
Ejemplo n.º 20
0
def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        pretrained,
        pretrained_weight,
        *,
        g_step,
        d_step,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.001,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        d_stepsize=1.5e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,
        save_per_iter=100,
        ckpt_dir=None,
        log_dir=None,
        load_model_path=None,
        task_name=None,
        timesteps_per_actorbatch=16,
        clip_param=1e-5,
        adam_epsilon=4e-4,
        optim_epochs=1,
        optim_stepsize=4e-4,
        optim_batchsize=16,
        schedule='linear'):
    nworkers = MPI.COMM_WORLD.Get_size()
    print("##### nworkers: ", nworkers)
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    # ob_space = np.array([5*64*64 + 10*64*64 + 11 + 524]) # env.observation_space
    # ac_space = np.array([1]) #env.action_space
    from gym import spaces
    ob_space = spaces.Box(low=-1000,
                          high=10000,
                          shape=(5 * 64 * 64 + 10 * 64 * 64 + 11 + 524, ))
    ac_space = spaces.Discrete(524)
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=(None, ob_space[0]))
    ac = pi.pdtype.sample_placeholder([None])
    # prevac = pi.pdtype.sample_placeholder([None])
    prevac_placeholder = U.get_placeholder_cached(name="last_action_one_hot")

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    # ent = pi.pd.entropy_usual() # see how it works, the value is the same
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    # entbonus = entcoeff * meanent
    # entcoeff = entcoeff * lrmult + 1e-5
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, prevac_placeholder, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    g_adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function(
        [ob, ac, prevac_placeholder, atarg, ret, lrmult], losses)

    # all_var_list = pi.get_trainable_variables()
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    # vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    g_adam.sync()
    d_adam.sync()
    # vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     discriminator,
                                     timesteps_per_batch,
                                     expert_dataset,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=100)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # # if provide pretrained weight
    # if pretrained_weight is not None:
    #     U.load_state(pretrained_weight, var_list=pi.get_variables())
    # # if provided model path
    # if load_model_path is not None:
    #     U.load_state(load_model_path)

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / (max_timesteps + 1e7),
                0.1)  # floor the multiplier at 0.1 instead of letting it decay to 0
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        # def fisher_vector_product(p):
        #     return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p
        # # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        meanlosses = []
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, prevac, atarg, tdlamret = seg["ob"], seg["ac"], seg[
                "prevac"], seg["adv"], seg["tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before udpate
            # print("before standardize atarg value: ", atarg)
            if atarg.std() != 0:
                atarg = (atarg - atarg.mean()) / atarg.std(
                )  # standardized advantage function estimate
            else:
                with open("debug.txt", "a+") as f:
                    print("atarg.std() is equal to 0", atarg, file=f)
            # print("atarg value: ", atarg)

            # convert prevac to one hot
            one_hot_prevac = []
            if type(prevac) is np.ndarray:
                depth = prevac.size
                one_hot_prevac = np.zeros((depth, 524))
                one_hot_prevac[np.arange(depth), prevac] = 1
            else:
                one_hot_prevac = np.zeros(524)
                one_hot_prevac[prevac] = 1
                one_hot_prevac = [one_hot_prevac]
            prevac = one_hot_prevac

            d = Dataset(dict(ob=ob,
                             ac=ac,
                             prevac=prevac,
                             atarg=atarg,
                             vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]
            # print("optim_batchsize: ", optim_batchsize)

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            logger.log(fmt_row(13, loss_names))
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch['prevac'],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)
                    g_adam.update(g, optim_stepsize * cur_lrmult)  # allmean(g)

                    x_newlosses = compute_losses(batch["ob"], batch["ac"],
                                                 batch["prevac"],
                                                 batch["atarg"],
                                                 batch["vtarg"], cur_lrmult)
                    meanlosses = [x_newlosses]
                    losses.append(x_newlosses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))
                # meanlosses = losses

        # # logger.log("Evaluating losses...")
        # losses = []
        # for batch in d.iterate_once(optim_batchsize):
        #     newlosses = compute_losses(batch["ob"], batch["ac"], batch["prevac"],
        #         batch["atarg"], batch["vtarg"], cur_lrmult)
        #     losses.append(newlosses)
        # # # meanlosses,_,_ = mpi_moments(losses, axis=0) # it will be useful for multithreading
        meanlosses = np.mean(losses, axis=0)
        # logger.log(fmt_row(13, meanlosses))
        g_losses = meanlosses
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        global UP_TO_STEP
        ob_expert, ac_expert, prevac_expert = expert_dataset.get_next_batch(
            len(ob), UP_TO_STEP)
        batch_size = len(ob) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch, prevac_batch in dataset.iterbatches(
            (ob, ac, prevac),
                include_final_partial_batch=False,
                batch_size=batch_size):
            # print("###### len(ob_batch): ", len(ob_batch))
            ob_expert, ac_expert, prevac_expert = expert_dataset.get_next_batch(
                len(ob_batch), UP_TO_STEP)
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))

            depth = len(ac_batch)
            one_hot_ac_batch = np.zeros((depth, 524))
            one_hot_ac_batch[np.arange(depth), ac_batch] = 1

            # depth = len(prevac_batch)
            # one_hot_prevac_batch = np.zeros((depth, 524))
            # one_hot_prevac_batch[np.arange(depth), prevac_batch] = 1

            depth = len(ac_expert)
            one_hot_ac_expert = np.zeros((depth, 524))
            one_hot_ac_expert[np.arange(depth), ac_expert] = 1

            depth = len(prevac_expert)
            one_hot_prevac_expert = np.zeros((depth, 524))
            one_hot_prevac_expert[np.arange(depth), prevac_expert] = 1

            *newlosses, g = discriminator.lossandgrad(ob_batch,
                                                      one_hot_ac_batch,
                                                      prevac_batch, ob_expert,
                                                      one_hot_ac_expert,
                                                      one_hot_prevac_expert)
            global LAST_EXPERT_ACC, LAST_EXPERT_LOSS
            LAST_EXPERT_ACC = newlosses[5]
            LAST_EXPERT_LOSS = newlosses[1]
            d_adam.update(g, d_stepsize)  # allmean(g)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0),
                                         iters_so_far)
            ep_stats.add_all_summary(writer, [
                np.mean(true_rewbuffer),
                np.mean(rewbuffer),
                np.mean(lenbuffer)
            ], iters_so_far)

        global ITER_SOFAR_GLOBAL
        ITER_SOFAR_GLOBAL = iters_so_far

        # log ac picked
        with open('ac.txt', 'a+') as fh:
            print(ac, file=fh)
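
# A small sketch of the one-hot encoding the loop above builds for ac / prevac
# batches (depth x 524 matrices indexed by action id). Shown only as an
# illustration that the index-assignment form is equivalent to an np.eye
# lookup; NUM_ACTIONS = 524 matches the action-id space used in this file.
import numpy as np

NUM_ACTIONS = 524

def _to_one_hot(indices, num_classes=NUM_ACTIONS):
    indices = np.asarray(indices, dtype=np.int64)
    one_hot = np.zeros((indices.size, num_classes))
    one_hot[np.arange(indices.size), indices] = 1.0
    return one_hot

_idx = np.array([3, 0, 523])
assert np.array_equal(_to_one_hot(_idx), np.eye(NUM_ACTIONS)[_idx])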
Ejemplo n.º 21
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        assert isinstance(
            ob_space,
            gym.spaces.Box)  # raise if the observation space is not a Box
        #print ("mlp_policy/20lines")  # executed twice
        #print ("ac_space.shape[0]", ac_space.shape[0])  # prints 3
        self.pdtype = pdtype = make_pdtype(
            ac_space
        )  # returns DiagGaussianPdType(ac_space.shape[0]), which wraps pdclass()
        sequence_length = None

        ob = U.get_placeholder(
            name="ob",
            dtype=tf.float32,
            shape=[sequence_length] + list(ob_space.shape)
        )  #return tf.placeholder(dtype=dtype, shape=shape, name=name)
        #print ("obspace.shape:::", list(ob_space.shape)) shu chu shi [11]
        with tf.variable_scope("obfilter"):
            #print("gail-tf/gailtf/baselines/ppo1/mlp_policy.py/28lines:")
            self.ob_rms = RunningMeanStd(
                shape=ob_space.shape)  # running mean/std filter for the observations

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)  # ob is still a placeholder at this point
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(
                            1.0)))  # stacked fully connected layers for the value function
        self.vpred = U.dense(
            last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0)
        )[:,
          0]  # value head: a single scalar per sample, no action output here

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            print("gaussian_fixed_var is used")
            mean = U.dense(last_out,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            #print ("gaussian_fixed_var is not used") mei you bei yong dao
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(
            pdparam
        )  # with the default pdtype above this returns a DiagGaussianPd
        # pd exposes kl, entropy, sample, and related methods
        self.state_in = []
        self.state_out = []

        # change for BC
        #stochastic = tf.placeholder(dtype=tf.bool, shape=())
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])
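
# A hedged numpy sketch of the gaussian_fixed_var branch above: the expression
# "mean * 0.0 + logstd" is only a broadcasting trick that tiles the single
# (1, d) logstd row across the batch so it can be concatenated with the
# per-sample means into one pdparam of width 2*d. Shapes below are arbitrary.
import numpy as np

_batch, _d = 4, 3
_mean = np.random.randn(_batch, _d)    # per-sample action means
_logstd = np.zeros((1, _d))            # one shared log-std row (a tf.Variable above)
_pdparam = np.concatenate([_mean, _mean * 0.0 + _logstd], axis=1)
assert _pdparam.shape == (_batch, 2 * _d)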
Ejemplo n.º 22
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=100,
        ckpt_dir=None,
        task="train",
        sample_stochastic=True,
        load_model_path=None,
        task_name=None,
        max_sample_traj=1500):
    print("max_timrsteps", max_timesteps)
    print("max_episodes", max_episodes)
    print("max_iters", max_iters)
    print("max_seconds", max_seconds)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration : r_t(\theta)*A_t
    surr2 = U.clip(ratio, 1.0 - clip_param,
                   1.0 + clip_param) * atarg  # clipped term of the update rule (CLIP)
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP) objective
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)
    traj_gen = traj_episode_generator(pi,
                                      env,
                                      timesteps_per_batch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if task == 'sample_trajectory':
        # not elegant, i know :(
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                # update step
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                # apply gradient g with Adam
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        print("... EpisodesSoFar ", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        print("... TimestepsSoFar ", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        print("... TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
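
# A minimal numpy sketch of the clipped surrogate (pol_surr) built above:
# ratio = exp(logp_new - logp_old) = pnew / pold, and PPO takes the pessimistic
# minimum of the unclipped and clipped ratio-weighted advantages, negated so it
# can be minimized. The inputs below are illustrative numbers only.
import numpy as np

def _ppo_clip_loss(logp_new, logp_old, adv, clip_param=0.2):
    ratio = np.exp(logp_new - logp_old)                        # pnew / pold
    surr1 = ratio * adv                                        # CPI surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))                  # L^CLIP, negated

print(_ppo_clip_loss(np.array([-0.9, -1.1, -0.3]),
                     np.array([-1.0, -1.0, -1.0]),
                     np.array([1.0, -0.5, 2.0])))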