Example 1
    def update(self, batch, update_actor=True):
        """Updates parameters of TD3 actor and critic given samples from the batch.

    Args:
       batch: A list of timesteps from environment.
       update_actor: a boolean variable, whether to perform a policy update.
    """
        obs = contrib_eager_python_tfe.Variable(
            np.stack(batch.obs).astype('float32'))
        action = contrib_eager_python_tfe.Variable(
            np.stack(batch.action).astype('float32'))
        next_obs = contrib_eager_python_tfe.Variable(
            np.stack(batch.next_obs).astype('float32'))
        mask = contrib_eager_python_tfe.Variable(
            np.stack(batch.mask).astype('float32'))

        if self.get_reward is not None:
            reward = self.get_reward(obs, action, next_obs)
        else:
            reward = contrib_eager_python_tfe.Variable(
                np.stack(batch.reward).astype('float32'))

        if self.use_td3:
            self._update_critic_td3(obs, action, next_obs, reward, mask)
        else:
            self._update_critic_ddpg(obs, action, next_obs, reward, mask)

        if self.critic_step.numpy() % self.policy_update_freq == 0:
            if update_actor:
                self._update_actor(obs, mask)
                soft_update(self.actor.variables, self.actor_target.variables,
                            self.tau)
            soft_update(self.critic.variables, self.critic_target.variables,
                        self.tau)
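soft_update is called above but not defined in these snippets; a minimal sketch of the Polyak-averaging helper the calls appear to assume (the name and signature come from the calls themselves, the body is an assumption):

def soft_update(variables, target_variables, tau=1.0):
    # Assumed behaviour: with the default tau=1.0 this copies the online weights
    # into the target network; with a small tau it performs Polyak averaging.
    for var, target_var in zip(variables, target_variables):
        target_var.assign((1 - tau) * target_var + tau * var)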
Example 2
    def __init__(self,
                 input_dim,
                 subsampling_rate,
                 lambd=10.0,
                 gail_loss='airl'):
        """Initializes actor, critic, target networks and optimizers.

    Args:
       input_dim: size of the observation space.
       subsampling_rate: subsampling rate that was used for expert trajectories.
       lambd: gradient penalty coefficient for wgan.
       gail_loss: gail loss to use.
    """

        self.subsampling_rate = subsampling_rate
        self.lambd = lambd
        self.gail_loss = gail_loss

        with tf.variable_scope('discriminator'):
            self.disc_step = contrib_eager_python_tfe.Variable(0,
                                                               dtype=tf.int64,
                                                               name='step')
            self.discriminator = Discriminator(input_dim)
            self.discriminator_optimizer = tf.train.AdamOptimizer()
            self.discriminator_optimizer._create_slots(
                self.discriminator.variables)  # pylint: disable=protected-access
Example 3
def scatter_update(ref: tfe.Variable, indices, updates):
    # Work around the bool dtype: scatter on an int32 copy of ref, then cast
    # the result back to bool.
    _ref = tfe.Variable(tf.cast(ref, tf.int32), trainable=False, name='hoge')
    del ref
    _updates = tf.cast(updates, tf.int32)
    x = tf.scatter_update(_ref, indices, _updates)
    update = tf.cast(x, tf.bool)
    return update
Example 4
def main(_):
  """Run td3/ddpg evaluation."""
  contrib_eager_python_tfe.enable_eager_execution()

  if FLAGS.use_gpu:
    tf.device('/device:GPU:0').__enter__()

  tf.gfile.MakeDirs(FLAGS.log_dir)
  summary_writer = contrib_summary.create_file_writer(
      FLAGS.log_dir, flush_millis=10000)

  env = gym.make(FLAGS.env)
  if FLAGS.wrap_for_absorbing:
    env = lfd_envs.AbsorbingWrapper(env)

  obs_shape = env.observation_space.shape
  act_shape = env.action_space.shape

  with tf.variable_scope('actor'):
    actor = Actor(obs_shape[0], act_shape[0])

  random_reward, _ = do_rollout(
      env, actor, None, num_trajectories=10, sample_random=True)

  reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')
  saver = contrib_eager_python_tfe.Saver(actor.variables + [reward_scale])

  last_checkpoint = tf.train.latest_checkpoint(FLAGS.load_dir)
  with summary_writer.as_default():
    while True:
      last_checkpoint = wait_for_next_checkpoint(FLAGS.load_dir,
                                                 last_checkpoint)

      total_numsteps = int(last_checkpoint.split('-')[-1])

      saver.restore(last_checkpoint)

      average_reward, average_length = do_rollout(
          env, actor, None, noise_scale=0.0, num_trajectories=FLAGS.num_trials)

      logging.info(
          'Evaluation: average episode length %d, average episode reward %f',
          average_length, average_reward)

      print('Evaluation: average episode length {}, average episode reward {}'.
            format(average_length, average_reward))

      with contrib_summary.always_record_summaries():
        if reward_scale.numpy() != 1.0:
          contrib_summary.scalar(
              'reward/scaled', (average_reward - random_reward) /
              (reward_scale.numpy() - random_reward),
              step=total_numsteps)
        contrib_summary.scalar('reward', average_reward, step=total_numsteps)
        contrib_summary.scalar('length', average_length, step=total_numsteps)
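Note on the 'reward/scaled' summary above: the expression (average_reward - random_reward) / (reward_scale - random_reward) normalizes returns so that a random policy scores roughly 0 and a policy matching reward_scale scores 1; in the training script later in this collection, reward_scale is assigned the expert return.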
Example 5
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False):
    global final_loss, W_flat
    tf.set_random_seed(seed)
    np.random.seed(seed)

    images = tf.constant(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.gpu()
    data = images

    if cuda:
        device = '/gpu:0'
    else:
        device = ''

    device_ctx = tf.device(device)
    device_ctx.__enter__()

    visible_size = 28 * 28
    hidden_size = 196
    initial_val = tf.zeros([visible_size * hidden_size])
    if W_flat is None:
        W_flat = tfe.Variable(initial_val, name='W_flat')
    W_flat.assign(initial_val)

    def loss_fn(w_flat):
        w = tf.reshape(w_flat, [visible_size, hidden_size])
        x = tf.matmul(data, w)
        x = tf.sigmoid(x)
        x = tf.matmul(x, w, transpose_b=True)
        x = tf.sigmoid(x)
        return tf.reduce_mean(tf.square(x - data))

    value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn)

    def opfunc(x):  # returns (value, gradient)
        value, grads = value_and_gradients_fn(x)
        return value, grads[0]

    # initialize weights
    W_flat.assign(u.ng_init(visible_size, hidden_size).flatten())

    state = Struct()
    config = Struct()
    config.maxIter = iters
    config.verbose = True
    x, f_hist, currentFuncEval = lbfgs(opfunc, W_flat, config, state, verbose)

    if verbose:
        u.summarize_time()

    return final_loss
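Struct, lbfgs and the u helpers are external to the snippet above; Struct is presumably just a bare attribute container, along the lines of this hypothetical stand-in:

class Struct(object):
    # Hypothetical stand-in: an empty namespace object whose attributes
    # (config.maxIter, config.verbose, ...) are set ad hoc by the caller.
    pass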
Example 6
def main(_):
    tf.enable_eager_execution()

    envs = [
        'HalfCheetah-v1', 'Hopper-v1', 'Ant-v1', 'Walker2d-v1', 'Reacher-v1'
    ]
    for ienv, env in enumerate(envs):
        print('Processing environment %d of %d: %s' %
              (ienv + 1, len(envs), env))
        h5_filename = os.path.join(FLAGS.src_data_dir, '%s.h5' % env)
        trajectories = h5py.File(h5_filename, 'r')

        if (set(trajectories.keys()) != set(
            ['a_B_T_Da', 'len_B', 'obs_B_T_Do', 'r_B_T'])):
            raise ValueError('Unexpected key set in file %s' % h5_filename)

        replay_buffer = ReplayBuffer()

        if env.find('Reacher') > -1:
            max_len = 50
        else:
            max_len = 1000

        for i in range(50):
            print('  Processing trajectory %d of 50 (len = %d)' %
                  (i + 1, trajectories['len_B'][i]))
            for j in range(trajectories['len_B'][i]):
                mask = 1
                if j + 1 == trajectories['len_B'][i]:
                    if trajectories['len_B'][i] == max_len:
                        mask = 1
                    else:
                        mask = 0
                replay_buffer.push_back(
                    trajectories['obs_B_T_Do'][i][j],
                    trajectories['a_B_T_Da'][i][j],
                    trajectories['obs_B_T_Do'][i][(j + 1) %
                                                  trajectories['len_B'][i]],
                    [trajectories['r_B_T'][i][j]], [mask],
                    j == trajectories['len_B'][i] - 1)

        replay_buffer_var = contrib_eager_python_tfe.Variable(
            '', name='expert_replay_buffer')
        saver = contrib_eager_python_tfe.Saver([replay_buffer_var])
        odir = os.path.join(FLAGS.dst_data_dir, env)
        print('Saving results to checkpoint in directory: %s' % odir)
        tf.gfile.MakeDirs(odir)
        replay_buffer_var.assign(pickle.dumps(replay_buffer))
        saver.save(os.path.join(odir, 'expert_replay_buffer'))
Example 7
    def __init__(self, model, learning_rate, training_iters, batch_size):
        self.model = model
        self.learning_rate = tfe.Variable(learning_rate)
        self.training_iters = training_iters
        self.batch_size = batch_size
        self.checkpoint = None
        self.lr_step = 0

        def get_lr():
            if self.lr_step > 0 and self.lr_step % training_iters == 0:
                self.learning_rate.assign_sub(self.learning_rate * 0.005)
            self.lr_step += 1
            return self.learning_rate

        self.optimizer = tf.train.AdamOptimizer(get_lr)
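A self-contained sketch of the pattern above (assuming TF 1.x eager execution and the contrib tfe alias used throughout these examples; all names here are illustrative, not from the snippet): because the optimizer receives a callable rather than a number, the current value of the tfe.Variable learning rate is looked up on every apply_gradients call, so in-place decay takes effect without rebuilding the optimizer.

import tensorflow as tf
from tensorflow.contrib.eager.python import tfe

tf.enable_eager_execution()

learning_rate = tfe.Variable(0.1)
w = tfe.Variable(5.0)
# Passing a callable: the optimizer re-evaluates it on each apply_gradients call.
optimizer = tf.train.GradientDescentOptimizer(lambda: learning_rate)

for step in range(3):
    with tf.GradientTape() as tape:
        loss = tf.square(w)
    grads = tape.gradient(loss, [w])
    optimizer.apply_gradients(zip(grads, [w]))
    # Decaying the variable here is picked up by the optimizer on the next step.
    learning_rate.assign_sub(learning_rate * 0.5)
    print(step, w.numpy(), learning_rate.numpy())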
Example 8
    def __init__(self, emoji_img, **kwargs):
        super(EmojiCNN, self).__init__()

        # parameters
        self.batch_size = kwargs.get("batch_size", 128)
        self.output_size = kwargs.get("output_size", 4)

        # model
        self.img_emb = tfe.Variable(emoji_img, name="imgs", trainable=False)
        self.conv_1 = Conv2D(32, (5, 5), activation="relu")
        self.max_pool_1 = MaxPool2D((2, 2))
        self.conv_2 = Conv2D(32, (5, 5), activation="relu")
        self.max_pool_2 = MaxPool2D((4, 4))
        self.conv_3 = Conv2D(32, (5, 5), activation="relu")
        self.max_pool_3 = MaxPool2D((4, 4))
        self.flatten = Flatten()
        self.output_layer = Dense(self.output_size, activation="softmax")
Example 9
def main(_):
    tf.enable_eager_execution()

    if not FLAGS.data_path:
        raise ValueError("Must specify --data-path")
    corpus = Datasets(FLAGS.data_path)
    train_data = _divide_into_batches(corpus.train, FLAGS.batch_size)
    eval_data = _divide_into_batches(corpus.valid, 10)

    have_gpu = tfe.num_gpus() > 0
    use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu

    with tf.device("/device:GPU:0" if have_gpu else None):
        # Make learning_rate a Variable so it can be included in the checkpoint
        # and we can resume training with the last saved learning_rate.
        learning_rate = tfe.Variable(20.0, name="learning_rate")
        model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
                         FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
                         use_cudnn_rnn)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        checkpoint = tfe.Checkpoint(
            learning_rate=learning_rate,
            model=model,
            # GradientDescentOptimizer has no state to checkpoint, but noting it
            # here lets us swap in an optimizer that does.
            optimizer=optimizer)
        # Restore existing variables now (learning_rate), and restore new variables
        # on creation if a checkpoint exists.
        checkpoint.restore(tf.train.latest_checkpoint(FLAGS.logdir))
        sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())

        best_loss = None
        for _ in range(FLAGS.epoch):
            train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
            eval_loss = evaluate(model, eval_data)
            if not best_loss or eval_loss < best_loss:
                if FLAGS.logdir:
                    checkpoint.save(os.path.join(FLAGS.logdir, "ckpt"))
                best_loss = eval_loss
            else:
                learning_rate.assign(learning_rate / 4.0)
                sys.stderr.write(
                    "eval_loss did not reduce in this epoch, "
                    "changing learning rate to %f for the next epoch\n" %
                    learning_rate.numpy())
Example 10
def main(_):
    tfe.enable_eager_execution()

    if not FLAGS.data_path:
        raise ValueError("Must specify --data_path")
    corpus = Corpus(FLAGS.data_path)
    # TODO(ashankar): Remove _batchify and _get_batch and use the Datasets API
    # instead.
    train_data = _batchify(corpus.train, FLAGS.batch_size)
    eval_data = _batchify(corpus.valid, 10)

    have_gpu = tfe.num_gpus() > 0
    use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu

    with tfe.restore_variables_on_create(
            tf.train.latest_checkpoint(FLAGS.logdir)):
        with tf.device("/device:GPU:0" if have_gpu else None):
            # Make learning_rate a Variable so it can be included in the checkpoint
            # and we can resume training with the last saved learning_rate.
            learning_rate = tfe.Variable(20.0, name="learning_rate")
            sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
            model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
                             FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
                             use_cudnn_rnn)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)

            best_loss = None
            for _ in range(FLAGS.epoch):
                train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
                eval_loss = evaluate(model, eval_data)
                if not best_loss or eval_loss < best_loss:
                    if FLAGS.logdir:
                        tfe.Saver(model.trainable_weights +
                                  [learning_rate]).save(
                                      os.path.join(FLAGS.logdir, "ckpt"))
                    best_loss = eval_loss
                else:
                    learning_rate.assign(learning_rate / 4.0)
                    sys.stderr.write(
                        "eval_loss did not reduce in this epoch, "
                        "changing learning rate to %f for the next epoch\n" %
                        learning_rate.numpy())
Example 11
  def __init__(self,
               input_dim,
               obs_dim,
               ac_dim,
               goal_dim,
               subsampling_rate,
               lambd=10.0,
               gail_loss='airl',
               use_s_p=False,
               only_s=False):
    """Initializes actor, critic, target networks and optimizers.

    Args:
       input_dim: size of the observation space.
       subsampling_rate: subsampling rate that was used for expert trajectories.
       lambd: gradient penalty coefficient for wgan.
       gail_loss: gail loss to use.
       use_s_p: if (s, s', g) is used instead of (s, a, g)
    """

    self.subsampling_rate = subsampling_rate
    self.lambd = lambd
    self.gail_loss = gail_loss
    self.use_s_p = use_s_p
    self.only_s = only_s

    with tf.variable_scope('discriminator'):
      self.disc_step = tfe.Variable(0, dtype=tf.int64, name='step')
      self.discriminator = Discriminator(input_dim)
      self.discriminator_optimizer = tf.train.AdamOptimizer()
      self.discriminator_optimizer._create_slots(self.discriminator.variables)  # pylint: disable=protected-access

    obs = self.obs = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)
    expert_obs = self.expert_obs = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)

    goal = self.goal = tf.placeholder(shape=(None, goal_dim), dtype=tf.float32)
    expert_goal = self.expert_goal = tf.placeholder(shape=(None, goal_dim), dtype=tf.float32)

    # expert_mask = tfe.Variable(np.stack(expert_batch.mask).astype('float32'))

    # Since expert trajectories were resampled but no absorbing state,
    # statistics of the states changes, we need to adjust weights accordingly.
    # expert_mask = tf.maximum(0, -expert_mask)
    # expert_weight = expert_mask / self.subsampling_rate + (1 - expert_mask)

    action = self.action = tf.placeholder(shape=(None, ac_dim), dtype=tf.float32)
    expert_action = self.expert_action = tf.placeholder(shape=(None, ac_dim), dtype=tf.float32)

    if self.only_s:
      inputs = tf.concat([obs, goal], -1)
      expert_inputs = tf.concat([expert_obs, expert_goal], -1)
    else:
      inputs = tf.concat([obs, goal, action], -1)
      expert_inputs = tf.concat([expert_obs, expert_goal, expert_action], -1)

    # Avoid using tensorflow random functions since it's impossible to get
    # the state of the random number generator used by TensorFlow.
    alpha = self.alpha = tf.placeholder(shape=(None, 1), dtype=tf.float32)
    # alpha = tfe.Variable(alpha.astype('float32'))
    inter = alpha * inputs + (1 - alpha) * expert_inputs

    # with tf.GradientTape() as tape:
    output = self.discriminator(inputs)
    expert_output = self.discriminator(expert_inputs)

    with tf.contrib.summary.record_summaries_every_n_global_steps(
            100, self.disc_step):
      gan_loss = tfgan_losses.modified_discriminator_loss(
        expert_output,
        output,
        label_smoothing=0.0,
        #    real_weights=expert_weight
      )
      tf.contrib.summary.scalar(
        'discriminator/expert_output',
        tf.reduce_mean(expert_output),
        step=self.disc_step)
      tf.contrib.summary.scalar(
        'discriminator/policy_output',
        tf.reduce_mean(output),
        step=self.disc_step)

      # with tf.GradientTape() as tape2:
      #   tape2.watch(inter)
      output = self.discriminator(inter)
      grad = tf.gradients(output, [inter])[0]

      grad_penalty = tf.reduce_mean(tf.pow(tf.norm(grad, axis=-1) - 1, 2))

      loss = gan_loss + self.lambd * grad_penalty

    with tf.contrib.summary.record_summaries_every_n_global_steps(
            100, self.disc_step):
      tf.contrib.summary.scalar(
        'discriminator/grad_penalty', grad_penalty, step=self.disc_step)

    with tf.contrib.summary.record_summaries_every_n_global_steps(
            100, self.disc_step):
      tf.contrib.summary.scalar(
        'discriminator/loss', gan_loss, step=self.disc_step)

    grads = tf.gradients(loss, self.discriminator.variables)

    self.update_ops = self.discriminator_optimizer.apply_gradients(
      zip(grads, self.discriminator.variables), global_step=self.disc_step)

    self.airl_rew = self.discriminator(inputs)
    self.gail_rew = -tf.log(1 - tf.nn.sigmoid(self.discriminator(inputs)) + 1e-8)
    self.normalized_rew = tf.nn.sigmoid(self.discriminator(inputs))
    self.negative_rew = tf.log(tf.nn.sigmoid(self.discriminator(inputs)) + 1e-8)
Example 12
    def set_images(self, content_path, style_path):
        self.content = self.read_image(content_path)
        self.style = self.read_image(
            style_path, output_size=self.content.get_shape().as_list()[1:3])
        self.output = tfe.Variable(
            tf.random_normal(self.content.get_shape(), dtype=tf.float32))
Example 13
import numpy as np
import tensorflow as tf
from tensorflow.contrib.eager.python import tfe


def get_session():
    cfg = tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    # cfg.gpu_options.per_process_gpu_memory_fraction = 0.1
    return tf.Session(config=cfg)


get_session()

tfe.enable_eager_execution()
tfe.executing_eagerly()  # => True

if __name__ == '__main__':
    x = np.zeros((2, 4))
    x[0, 2:] = 1
    x[0, 0:2] = 0.5
    var_x = tfe.Variable(x)

    a = tf.not_equal(var_x, 0.5)

    y = np.zeros((2, 4))
    y[0, 0] = 1
    y[0, 1] = 2
    y[0, 2] = 3
    y[0, 3] = 4
    indices = tf.where(a)
    print(var_x)
    print(a)

    y_a = tf.gather_nd(y, indices)
    print(indices)
    print(y_a)
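    # With these inputs `a` is [[False, False, True, True], [True, True, True, True]],
    # so `indices` has six rows and y_a evaluates to [3., 4., 0., 0., 0., 0.].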
Example 14
    if y_pred is not None:
        plt.plot(X.numpy(), y_pred.numpy(), c='r', linewidth=5)

    plt.show()
    del fig


if __name__ == '__main__':

    num_samples, data_W, data_b = 1000, 3, 2

    # Load training dataset.
    X, y = load_data(n=num_samples, W=data_W, b=data_b)

    # Model variables.
    W = tfe.Variable(tf.zeros(shape=()), name="weights")
    b = tfe.Variable(tf.zeros(shape=()), name="biases")

    epochs = 500
    learning_rate = 1e-2

    for epoch in range(epochs):
        dW, db = grad(X, y, W, b)

        # Update W & b.
        W.assign_sub(learning_rate * dW)
        b.assign_sub(learning_rate * db)
        loss = loss_func(prediction(X, W, b), y)

        print(('\rEpoch {:,}\tLoss {:.3f}\tW = {:.2f}'
               '\tb={:.2f}').format(epoch + 1, loss.numpy(), W.numpy(),
                                    b.numpy()))
Example 15
def main(_):
    ref = tfe.Variable([False, False, False], trainable=False, name='hoge')
    indices = tf.range(3)
    updates = tf.constant([True, True, True])

    print(scatter_update(ref, indices, updates))
Example 16
tfe.enable_eager_execution()


@tfe.custom_gradient
def my_matmul(x, y):
    result = x @ y

    def grad(dr):
        return [dr @ tf.transpose(y), tf.transpose(x) @ dr]

    return result, grad


lr = 0.25
n = 2
x = tfe.Variable(tf.ones((n, n)), name="x")
y = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)


def loss_fn(x):
    return tf.reduce_sum(my_matmul(x, y))


loss_grads_fn = tfe.value_and_gradients_function(loss_fn)

for step in range(5):
    loss, grads = loss_grads_fn(x)
    print("loss =", loss.numpy())
    x.assign_sub(lr * grads[0])

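# The loss is linear in x with a constant gradient [[3., 7.], [3., 7.]], so each
# update lowers it by 29: the printed values are 20, -9, -38, -67 and finally -96.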
assert loss.numpy() == -96
Example 17
import tensorflow as tf

from tensorflow.contrib.eager.python import tfe

tf.enable_eager_execution()

x_data = tf.random_normal([1000])
x_noise = tf.random_normal([1000])

y_label = 3 * x_data + x_noise

w = tfe.Variable(5.)
b = tfe.Variable(10.)


def mse(label, predict):
    loss = tf.losses.mean_squared_error(label, predict)
    return loss


optimizer = tf.train.GradientDescentOptimizer(0.003)

for step in range(3000):
    with tf.GradientTape(persistent=True) as tape:
        y_predict = w * x_data + b
        l = mse(y_label, y_predict)
        w_grad, b_grad = tape.gradient(l, [w, b])
    # Apply the gradients so w and b are actually updated each step.
    optimizer.apply_gradients(zip([w_grad, b_grad], [w, b]))
Example 18
def main():
    np.random.seed(1)
    tf.set_random_seed(2)

    dtype = np.float32
    lambda_ = 3e-3
    lr = 0.2
    dsize = 2

    def t(mat):
        return tf.transpose(mat)

    def regularized_inverse(mat):
        n = int(mat.shape[0])
        return tf.linalg.inv(mat + lambda_ * tf.eye(n, dtype=dtype))

    train_images = np.asarray([[0, 1], [2, 3]])
    X = tf.constant(train_images[:, :dsize].astype(dtype))

    W1_0 = np.asarray([[0., 1], [2, 3]]).astype(dtype) / 10
    W2_0 = np.asarray([[4., 5], [6, 7]]).astype(dtype) / 10
    W1 = tfe.Variable(W1_0, name='W1')
    W2 = tfe.Variable(W2_0, name='W2')

    forward = []
    backward = []
    forward_inv = []
    backward_inv = []

    @tfe.custom_gradient
    def capturing_matmul(W, A):
        forward.append(A)

        def grad(B):
            backward.append(B)
            return [B @ tf.transpose(A), tf.transpose(W) @ B]

        return W @ A, grad

    @tfe.custom_gradient
    def kfac_matmul(W, A):
        def grad(B):
            kfac_A = forward_inv.pop() @ A
            kfac_B = backward_inv.pop() @ B
            return [kfac_B @ tf.transpose(kfac_A), tf.transpose(W) @ B]

        return W @ A, grad

    matmul = tf.matmul

    def loss_fn(synthetic=False):
        x = tf.nn.sigmoid(matmul(W1, X))
        x = tf.nn.sigmoid(matmul(W2, x))
        if synthetic:
            noise = tf.random_normal(X.shape)
            target = tf.constant((x + noise).numpy())
        else:
            target = X
        err = target - x
        loss = tf.reduce_sum(err * err) / 2 / dsize
        return loss

    loss_and_grads = tfe.implicit_value_and_gradients(loss_fn)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
    for step in range(10):
        del backward[:]
        del forward[:]
        del forward_inv[:]
        del backward_inv[:]

        matmul = capturing_matmul
        loss, grads_and_vars = loss_and_grads(True)
        backward.reverse()

        for i in range(len(backward)):
            backward[i] = backward[i] * dsize

        def cov(X):
            return X @ t(X) / dsize

        def invcov(X):
            return regularized_inverse(cov(X))

        for i in range(2):
            forward_inv.append(invcov(forward[i]))
            backward_inv.append(invcov(backward[i]))

        matmul = kfac_matmul
        loss, grads_and_vars = loss_and_grads()
        print("Step %3d loss %10.9f" % (step, loss.numpy()))
        optimizer.apply_gradients(grads_and_vars)

    target = 1.251444697  # with proper random sampling
    assert abs(loss.numpy() - target) < 1e-9, abs(loss.numpy() - target)
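The kfac_matmul gradient above applies the Kronecker-factored (K-FAC) preconditioner: the raw layer gradient G = B A^T is replaced by (B B^T / n + lambda I)^{-1} G (A A^T / n + lambda I)^{-1}, where A are the activations and B the backpropagated errors captured on the preceding pass with capturing_matmul.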
Example 19
    def _update_critic_td3(self, obs, action, next_obs, reward, mask):
        """Updates parameters of td3 critic given samples from the batch.

    Args:
       obs: A tfe.Variable with a batch of observations.
       action: A tfe.Variable with a batch of actions.
       next_obs: A tfe.Variable with a batch of next observations.
       reward: A tfe.Variable with a batch of rewards.
       mask: A tfe.Variable with a batch of masks.
    """
        # Avoid using tensorflow random functions since it's impossible to get
        # the state of the random number generator used by TensorFlow.
        target_action_noise = np.random.normal(
            size=action.get_shape(), scale=self.policy_noise).astype('float32')
        target_action_noise = contrib_eager_python_tfe.Variable(
            target_action_noise)

        target_action_noise = tf.clip_by_value(target_action_noise,
                                               -self.policy_noise_clip,
                                               self.policy_noise_clip)

        noisy_action_targets = self.actor_target(
            next_obs) + target_action_noise

        clipped_noisy_action_targets = tf.clip_by_value(
            noisy_action_targets, -1, 1)

        if self.use_absorbing_state:
            # Starting from the goal state we can execute only non-actions.
            a_mask = tf.maximum(0, mask)
            q_next1, q_next2 = self.critic_target(
                next_obs, clipped_noisy_action_targets * a_mask)
            q_next = tf.reduce_min(tf.concat([q_next1, q_next2], -1),
                                   -1,
                                   keepdims=True)
            q_target = reward + self.discount * q_next
        else:
            q_next1, q_next2 = self.critic_target(
                next_obs, clipped_noisy_action_targets)
            q_next = tf.reduce_min(tf.concat([q_next1, q_next2], -1),
                                   -1,
                                   keepdims=True)
            q_target = reward + self.discount * mask * q_next

        with tf.GradientTape() as tape:
            q_pred1, q_pred2 = self.critic(obs, action)
            critic_loss = tf.losses.mean_squared_error(
                q_target, q_pred1) + tf.losses.mean_squared_error(
                    q_target, q_pred2)

        grads = tape.gradient(critic_loss, self.critic.variables)
        self.critic_optimizer.apply_gradients(zip(grads,
                                                  self.critic.variables),
                                              global_step=self.critic_step)

        if self.use_absorbing_state:
            with contrib_summary.record_summaries_every_n_global_steps(
                    100, self.critic_step):
                a_mask = tf.maximum(0, -mask)
                if tf.reduce_sum(a_mask).numpy() > 0:
                    contrib_summary.scalar('critic/absorbing_reward',
                                           tf.reduce_sum(reward * a_mask) /
                                           tf.reduce_sum(a_mask),
                                           step=self.critic_step)

        with contrib_summary.record_summaries_every_n_global_steps(
                100, self.critic_step):
            contrib_summary.scalar('critic/loss',
                                   critic_loss,
                                   step=self.critic_step)
Example 20
def main(_):
  """Run td3/ddpg training."""
  contrib_eager_python_tfe.enable_eager_execution()

  if FLAGS.use_gpu:
    tf.device('/device:GPU:0').__enter__()

  tf.gfile.MakeDirs(FLAGS.log_dir)
  summary_writer = contrib_summary.create_file_writer(
      FLAGS.log_dir, flush_millis=10000)

  tf.set_random_seed(FLAGS.seed)
  np.random.seed(FLAGS.seed)
  random.seed(FLAGS.seed)

  env = gym.make(FLAGS.env)
  env.seed(FLAGS.seed)
  if FLAGS.learn_absorbing:
    env = lfd_envs.AbsorbingWrapper(env)

  if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']:
    rand_actions = int(1e4)
  else:
    rand_actions = int(1e3)

  obs_shape = env.observation_space.shape
  act_shape = env.action_space.shape

  subsampling_rate = env._max_episode_steps // FLAGS.trajectory_size  # pylint: disable=protected-access
  lfd = gail.GAIL(
      obs_shape[0] + act_shape[0],
      subsampling_rate=subsampling_rate,
      gail_loss=FLAGS.gail_loss)

  if FLAGS.algo == 'td3':
    model = ddpg_td3.DDPG(
        obs_shape[0],
        act_shape[0],
        use_td3=True,
        policy_update_freq=2,
        actor_lr=FLAGS.actor_lr,
        get_reward=lfd.get_reward,
        use_absorbing_state=FLAGS.learn_absorbing)
  else:
    model = ddpg_td3.DDPG(
        obs_shape[0],
        act_shape[0],
        use_td3=False,
        policy_update_freq=1,
        actor_lr=FLAGS.actor_lr,
        get_reward=lfd.get_reward,
        use_absorbing_state=FLAGS.learn_absorbing)

  random_reward, _ = do_rollout(
      env, model.actor, None, num_trajectories=10, sample_random=True)

  replay_buffer_var = contrib_eager_python_tfe.Variable(
      '', name='replay_buffer')
  expert_replay_buffer_var = contrib_eager_python_tfe.Variable(
      '', name='expert_replay_buffer')

  # Save and restore the random states of gym/numpy/python so that preemption
  # does not affect the results; runs stay deterministic (on CPU) and
  # reproducible.
  gym_random_state_var = contrib_eager_python_tfe.Variable(
      '', name='gym_random_state')
  np_random_state_var = contrib_eager_python_tfe.Variable(
      '', name='np_random_state')
  py_random_state_var = contrib_eager_python_tfe.Variable(
      '', name='py_random_state')

  reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')

  saver = contrib_eager_python_tfe.Saver(
      model.variables + lfd.variables +
      [replay_buffer_var, expert_replay_buffer_var, reward_scale] +
      [gym_random_state_var, np_random_state_var, py_random_state_var])

  tf.gfile.MakeDirs(FLAGS.save_dir)

  eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables +
                                              [reward_scale])
  tf.gfile.MakeDirs(FLAGS.eval_save_dir)

  last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir)
  if last_checkpoint is None:
    expert_saver = contrib_eager_python_tfe.Saver([expert_replay_buffer_var])
    last_checkpoint = os.path.join(FLAGS.expert_dir, 'expert_replay_buffer')
    expert_saver.restore(last_checkpoint)
    expert_replay_buffer = pickle.loads(expert_replay_buffer_var.numpy())
    expert_reward = expert_replay_buffer.get_average_reward()

    logging.info('Expert reward %f', expert_reward)
    print('Expert reward {}'.format(expert_reward))

    reward_scale.assign(expert_reward)
    expert_replay_buffer.subsample_trajectories(FLAGS.num_expert_trajectories)
    if FLAGS.learn_absorbing:
      expert_replay_buffer.add_absorbing_states(env)

    # Subsample after adding absorbing states, because otherwise we can lose
    # final states.

    print('Original dataset size {}'.format(len(expert_replay_buffer)))
    expert_replay_buffer.subsample_transitions(subsampling_rate)
    print('Subsampled dataset size {}'.format(len(expert_replay_buffer)))
    replay_buffer = ReplayBuffer()
    total_numsteps = 0
    prev_save_timestep = 0
    prev_eval_save_timestep = 0
  else:
    saver.restore(last_checkpoint)
    replay_buffer = pickle.loads(zlib.decompress(replay_buffer_var.numpy()))
    expert_replay_buffer = pickle.loads(
        zlib.decompress(expert_replay_buffer_var.numpy()))
    total_numsteps = int(last_checkpoint.split('-')[-1])
    prev_save_timestep = total_numsteps
    prev_eval_save_timestep = total_numsteps
    env.unwrapped.np_random.set_state(
        pickle.loads(gym_random_state_var.numpy()))
    np.random.set_state(pickle.loads(np_random_state_var.numpy()))
    random.setstate(pickle.loads(py_random_state_var.numpy()))

  with summary_writer.as_default():
    while total_numsteps < FLAGS.training_steps:
      # Decay helps to make the model more stable.
      # TODO(agrawalk): Use tf.train.exponential_decay
      model.actor_lr.assign(
          model.initial_actor_lr * pow(0.5, total_numsteps // 100000))
      logging.info('Learning rate %f', model.actor_lr.numpy())
      rollout_reward, rollout_timesteps = do_rollout(
          env,
          model.actor,
          replay_buffer,
          noise_scale=FLAGS.exploration_noise,
          rand_actions=rand_actions,
          sample_random=(model.actor_step.numpy() == 0),
          add_absorbing_state=FLAGS.learn_absorbing)
      total_numsteps += rollout_timesteps

      logging.info('Training: total timesteps %d, episode reward %f',
                   total_numsteps, rollout_reward)

      print('Training: total timesteps {}, episode reward {}'.format(
          total_numsteps, rollout_reward))

      with contrib_summary.always_record_summaries():
        contrib_summary.scalar(
            'reward/scaled', (rollout_reward - random_reward) /
            (reward_scale.numpy() - random_reward),
            step=total_numsteps)
        contrib_summary.scalar('reward', rollout_reward, step=total_numsteps)
        contrib_summary.scalar('length', rollout_timesteps, step=total_numsteps)

      if len(replay_buffer) >= FLAGS.min_samples_to_start:
        for _ in range(rollout_timesteps):
          time_step = replay_buffer.sample(batch_size=FLAGS.batch_size)
          batch = TimeStep(*zip(*time_step))

          time_step = expert_replay_buffer.sample(batch_size=FLAGS.batch_size)
          expert_batch = TimeStep(*zip(*time_step))

          lfd.update(batch, expert_batch)

        for _ in range(FLAGS.updates_per_step * rollout_timesteps):
          time_step = replay_buffer.sample(batch_size=FLAGS.batch_size)
          batch = TimeStep(*zip(*time_step))
          model.update(
              batch,
              update_actor=model.critic_step.numpy() >=
              FLAGS.policy_updates_delay)

        if total_numsteps - prev_save_timestep >= FLAGS.save_interval:
          replay_buffer_var.assign(zlib.compress(pickle.dumps(replay_buffer)))
          expert_replay_buffer_var.assign(
              zlib.compress(pickle.dumps(expert_replay_buffer)))
          gym_random_state_var.assign(
              pickle.dumps(env.unwrapped.np_random.get_state()))
          np_random_state_var.assign(pickle.dumps(np.random.get_state()))
          py_random_state_var.assign(pickle.dumps(random.getstate()))
          saver.save(
              os.path.join(FLAGS.save_dir, 'checkpoint'),
              global_step=total_numsteps)
          prev_save_timestep = total_numsteps

        if total_numsteps - prev_eval_save_timestep >= FLAGS.eval_save_interval:
          eval_saver.save(
              os.path.join(FLAGS.eval_save_dir, 'checkpoint'),
              global_step=total_numsteps)
          prev_eval_save_timestep = total_numsteps
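A minimal, self-contained sketch of the checkpointing trick used above for non-tensor state (assuming the same contrib_eager_python_tfe alias; the variable name and path are illustrative): arbitrary Python objects are pickled, optionally zlib-compressed, into a string-typed variable so that tfe.Saver can store and restore them alongside the model weights.

import pickle
import zlib

import tensorflow as tf
from tensorflow.contrib.eager.python import tfe as contrib_eager_python_tfe

contrib_eager_python_tfe.enable_eager_execution()

# A string-typed variable acts as a byte container for pickled Python state.
state_var = contrib_eager_python_tfe.Variable('', name='some_python_state')
state_var.assign(zlib.compress(pickle.dumps({'step': 3})))

saver = contrib_eager_python_tfe.Saver([state_var])
# saver.save('/tmp/python_state')  # hypothetical path
restored = pickle.loads(zlib.decompress(state_var.numpy()))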
Example 21
    def __init__(self,
                 input_dim,
                 action_dim,
                 discount=0.99,
                 tau=0.005,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 use_td3=True,
                 policy_noise=0.2,
                 policy_noise_clip=0.5,
                 policy_update_freq=2,
                 get_reward=None,
                 use_absorbing_state=False):
        """Initializes actor, critic, target networks and optimizers.

    The class handles absorbing state properly. Absorbing state corresponds to
    a state which a policy gets in after reaching a goal state and stays there
    forever. For most RL problems, we can just assign 0 to all reward after
    the goal. But for GAIL, we need to have an actual absorbing state.

    Args:
       input_dim: size of the observation space.
       action_dim: size of the action space.
       discount: reward discount.
       tau: target networks update coefficient.
       actor_lr: actor learning rate.
       critic_lr: critic learning rate.
       use_td3: whether to use standard ddpg or td3.
       policy_noise: std of gaussian added to critic action input.
       policy_noise_clip: clip added gaussian noise.
       policy_update_freq: perform policy update once per n steps.
       get_reward: a function that given (s,a,s') returns a reward.
       use_absorbing_state: whether to use an absorbing state or not.
    """
        self.discount = discount
        self.tau = tau

        self.use_td3 = use_td3
        self.policy_noise = policy_noise
        self.policy_noise_clip = policy_noise_clip
        self.policy_update_freq = policy_update_freq
        self.get_reward = get_reward
        self.use_absorbing_state = use_absorbing_state

        with tf.variable_scope('actor'):
            self.actor = Actor(input_dim, action_dim)
            with tf.variable_scope('target'):
                self.actor_target = Actor(input_dim, action_dim)

            self.initial_actor_lr = actor_lr
            self.actor_lr = contrib_eager_python_tfe.Variable(actor_lr,
                                                              name='lr')
            self.actor_step = contrib_eager_python_tfe.Variable(0,
                                                                dtype=tf.int64,
                                                                name='step')
            self.actor_optimizer = tf.train.AdamOptimizer(
                learning_rate=self.actor_lr)
            self.actor_optimizer._create_slots(self.actor.variables)  # pylint: disable=protected-access

        soft_update(self.actor.variables, self.actor_target.variables)

        with tf.variable_scope('critic'):
            if self.use_td3:
                self.critic = CriticTD3(input_dim + action_dim)
                with tf.variable_scope('target'):
                    self.critic_target = CriticTD3(input_dim + action_dim)
            else:
                self.critic = CriticDDPG(input_dim + action_dim)
                with tf.variable_scope('target'):
                    self.critic_target = CriticDDPG(input_dim + action_dim)

            self.critic_step = contrib_eager_python_tfe.Variable(
                0, dtype=tf.int64, name='step')
            self.critic_optimizer = tf.train.AdamOptimizer(
                learning_rate=critic_lr)
            self.critic_optimizer._create_slots(self.critic.variables)  # pylint: disable=protected-access

        soft_update(self.critic.variables, self.critic_target.variables)
Example 22
def do_rollout(env,
               actor,
               replay_buffer,
               noise_scale=0.1,
               num_trajectories=1,
               rand_actions=0,
               sample_random=False,
               add_absorbing_state=False):
  """Do N rollout.

  Args:
      env: environment to train on.
      actor: policy to take actions.
      replay_buffer: replay buffer to collect samples.
      noise_scale: std of gaussian noise added to a policy output.
      num_trajectories: number of trajectories to collect.
      rand_actions: number of random actions before using policy.
      sample_random: whether to sample a random trajectory or not.
      add_absorbing_state: whether to add an absorbing state.
  Returns:
    An episode reward and a number of episode steps.
  """
  total_reward = 0
  total_timesteps = 0

  for _ in range(num_trajectories):
    obs = env.reset()
    episode_timesteps = 0
    while True:
      if (replay_buffer is not None and
          len(replay_buffer) < rand_actions) or sample_random:
        action = env.action_space.sample()
      else:
        tfe_obs = contrib_eager_python_tfe.Variable([obs.astype('float32')])
        action = actor(tfe_obs).numpy()[0]
        if noise_scale > 0:
          action += np.random.normal(size=action.shape) * noise_scale
        action = action.clip(-1, 1)

      next_obs, reward, done, _ = env.step(action)
      # Extremely important, otherwise Q function is not stationary!
      # Taken from: https://github.com/sfujim/TD3/blob/master/main.py#L123
      if not done or episode_timesteps + 1 == env._max_episode_steps:  # pylint: disable=protected-access
        done_mask = Mask.NOT_DONE.value
      else:
        done_mask = Mask.DONE.value

      total_reward += reward
      episode_timesteps += 1
      total_timesteps += 1

      if replay_buffer is not None:
        if (add_absorbing_state and done and
            episode_timesteps < env._max_episode_steps):  # pylint: disable=protected-access
          next_obs = env.get_absorbing_state()
        replay_buffer.push_back(obs, action, next_obs, [reward], [done_mask],
                                done)

      if done:
        break

      obs = next_obs

    # Add an absorbing state that is extremely important for GAIL.
    if add_absorbing_state and (replay_buffer is not None and
                                episode_timesteps < env._max_episode_steps):  # pylint: disable=protected-access
      action = np.zeros(env.action_space.shape)
      absorbing_state = env.get_absorbing_state()

      # done=False is set to the absorbing state because it corresponds to
      # a state where gym environments stopped an episode.
      replay_buffer.push_back(absorbing_state, action, absorbing_state, [0.0],
                              [Mask.ABSORBING.value], False)
  return total_reward / num_trajectories, total_timesteps // num_trajectories
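Mask is used above but defined elsewhere; given how the values are consumed (the mask multiplies the bootstrapped Q target in the critic update, and tf.maximum(0, -mask) isolates absorbing transitions in the GAIL update), a plausible sketch is:

import enum


class Mask(enum.Enum):
    # Assumed values: 1 keeps the bootstrap term, 0 zeroes it at true terminals,
    # and a negative value lets tf.maximum(0, -mask) pick out absorbing states.
    NOT_DONE = 1.0
    DONE = 0.0
    ABSORBING = -1.0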
Example 23
import tensorflow as tf
from tensorflow.contrib.eager.python import tfe
tf.enable_eager_execution()

x = tfe.Variable(initial_value=tf.random_uniform([1], -1., 1.), name='x')


def loss(input):
    return tf.sigmoid(input)


grad_vars = tfe.implicit_gradients(loss)
opt = tf.train.GradientDescentOptimizer(learning_rate=1)

for i in range(1000):
    for j in range(50):
        opt.apply_gradients(grad_vars(x))

    if i % 50 == 0:
        loss_val = loss(x)
        print(i, "Optimal Value : ", loss_val.numpy(), "Val (X) : ", x.numpy())
Example 24
    def update(self, batch, expert_batch):
        """Updates the WGAN potential function or GAN discriminator.

    Args:
       batch: A batch from training policy.
       expert_batch: A batch from the expert.
    """
        obs = contrib_eager_python_tfe.Variable(
            np.stack(batch.obs).astype('float32'))
        expert_obs = contrib_eager_python_tfe.Variable(
            np.stack(expert_batch.obs).astype('float32'))

        expert_mask = contrib_eager_python_tfe.Variable(
            np.stack(expert_batch.mask).astype('float32'))

        # The expert transitions were subsampled but the absorbing states were
        # all kept, so absorbing states are over-represented; down-weight them
        # by the subsampling rate.
        expert_mask = tf.maximum(0, -expert_mask)
        expert_weight = expert_mask / self.subsampling_rate + (1 - expert_mask)

        action = contrib_eager_python_tfe.Variable(
            np.stack(batch.action).astype('float32'))
        expert_action = contrib_eager_python_tfe.Variable(
            np.stack(expert_batch.action).astype('float32'))

        inputs = tf.concat([obs, action], -1)
        expert_inputs = tf.concat([expert_obs, expert_action], -1)

        # Avoid using tensorflow random functions since it's impossible to get
        # the state of the random number generator used by TensorFlow.
        alpha = np.random.uniform(size=(inputs.get_shape()[0], 1))
        alpha = contrib_eager_python_tfe.Variable(alpha.astype('float32'))
        inter = alpha * inputs + (1 - alpha) * expert_inputs

        with tf.GradientTape() as tape:
            output = self.discriminator(inputs)
            expert_output = self.discriminator(expert_inputs)

            with contrib_summary.record_summaries_every_n_global_steps(
                    100, self.disc_step):
                gan_loss = contrib_gan_python_losses_python_losses_impl.modified_discriminator_loss(
                    expert_output,
                    output,
                    label_smoothing=0.0,
                    real_weights=expert_weight)
                contrib_summary.scalar('discriminator/expert_output',
                                       tf.reduce_mean(expert_output),
                                       step=self.disc_step)
                contrib_summary.scalar('discriminator/policy_output',
                                       tf.reduce_mean(output),
                                       step=self.disc_step)

            with tf.GradientTape() as tape2:
                tape2.watch(inter)
                output = self.discriminator(inter)
                grad = tape2.gradient(output, [inter])[0]

            grad_penalty = tf.reduce_mean(tf.pow(
                tf.norm(grad, axis=-1) - 1, 2))

            loss = gan_loss + self.lambd * grad_penalty

        with contrib_summary.record_summaries_every_n_global_steps(
                100, self.disc_step):
            contrib_summary.scalar('discriminator/grad_penalty',
                                   grad_penalty,
                                   step=self.disc_step)

        with contrib_summary.record_summaries_every_n_global_steps(
                100, self.disc_step):
            contrib_summary.scalar('discriminator/loss',
                                   gan_loss,
                                   step=self.disc_step)

        grads = tape.gradient(loss, self.discriminator.variables)

        self.discriminator_optimizer.apply_gradients(
            zip(grads, self.discriminator.variables),
            global_step=self.disc_step)
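The penalty computed above is the standard WGAN-GP term lambda * E[(||grad_x_hat D(x_hat)||_2 - 1)^2], evaluated at the random interpolates x_hat = alpha * x_policy + (1 - alpha) * x_expert and added to the GAN discriminator loss.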
Example 25
def main(_):
    """Run td3/ddpg training."""
    tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    if FLAGS.expert_dir.find(FLAGS.env) == -1:
        raise ValueError('Expert directory must contain the environment name')

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    expert_replay_buffer_var = tfe.Variable('', name='expert_replay_buffer')

    saver = tfe.Saver([expert_replay_buffer_var])
    tf.gfile.MakeDirs(FLAGS.save_dir)

    with tf.variable_scope('actor'):
        actor = Actor(obs_shape[0], act_shape[0])
    expert_saver = tfe.Saver(actor.variables)

    best_checkpoint = None
    best_reward = float('-inf')

    checkpoint_state = tf.train.get_checkpoint_state(FLAGS.expert_dir)

    for checkpoint in checkpoint_state.all_model_checkpoint_paths:
        expert_saver.restore(checkpoint)
        expert_reward, _ = do_rollout(env,
                                      actor,
                                      replay_buffer=None,
                                      noise_scale=0.0,
                                      num_trajectories=10)

        if expert_reward > best_reward:
            best_reward = expert_reward
            best_checkpoint = checkpoint

    expert_saver.restore(best_checkpoint)

    expert_replay_buffer = ReplayBuffer()
    expert_reward, _ = do_rollout(
        env,
        actor,
        replay_buffer=expert_replay_buffer,
        noise_scale=0.0,
        num_trajectories=FLAGS.num_expert_trajectories)

    logging.info('Expert reward %f', expert_reward)
    print('Expert reward {}'.format(expert_reward))

    expert_replay_buffer_var.assign(pickle.dumps(expert_replay_buffer))
    saver.save(os.path.join(FLAGS.save_dir, 'expert_replay_buffer'))
Example 26
def main(_):
    """Run td3/ddpg training."""
    contrib_eager_python_tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    tf.gfile.MakeDirs(FLAGS.log_dir)
    summary_writer = contrib_summary.create_file_writer(FLAGS.log_dir,
                                                        flush_millis=10000)

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)

    if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']:
        rand_actions = int(1e4)
    else:
        rand_actions = int(1e3)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    if FLAGS.algo == 'td3':
        model = ddpg_td3.DDPG(obs_shape[0],
                              act_shape[0],
                              use_td3=True,
                              policy_update_freq=2,
                              actor_lr=1e-3)
    else:
        model = ddpg_td3.DDPG(obs_shape[0],
                              act_shape[0],
                              use_td3=False,
                              policy_update_freq=1,
                              actor_lr=1e-4)

    replay_buffer_var = contrib_eager_python_tfe.Variable('',
                                                          name='replay_buffer')
    gym_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='gym_random_state')
    np_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='np_random_state')
    py_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='py_random_state')

    saver = contrib_eager_python_tfe.Saver(
        model.variables + [replay_buffer_var] +
        [gym_random_state_var, np_random_state_var, py_random_state_var])
    tf.gfile.MakeDirs(FLAGS.save_dir)

    reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')
    eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables +
                                                [reward_scale])
    tf.gfile.MakeDirs(FLAGS.eval_save_dir)

    last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir)
    if last_checkpoint is None:
        replay_buffer = ReplayBuffer()
        total_numsteps = 0
        prev_save_timestep = 0
        prev_eval_save_timestep = 0
    else:
        saver.restore(last_checkpoint)
        replay_buffer = pickle.loads(zlib.decompress(
            replay_buffer_var.numpy()))
        total_numsteps = int(last_checkpoint.split('-')[-1])
        assert len(replay_buffer) == total_numsteps
        prev_save_timestep = total_numsteps
        prev_eval_save_timestep = total_numsteps
        env.unwrapped.np_random.set_state(
            pickle.loads(gym_random_state_var.numpy()))
        np.random.set_state(pickle.loads(np_random_state_var.numpy()))
        random.setstate(pickle.loads(py_random_state_var.numpy()))

    with summary_writer.as_default():
        while total_numsteps < FLAGS.training_steps:
            rollout_reward, rollout_timesteps = do_rollout(
                env,
                model.actor,
                replay_buffer,
                noise_scale=FLAGS.exploration_noise,
                rand_actions=rand_actions)
            total_numsteps += rollout_timesteps

            logging.info('Training: total timesteps %d, episode reward %f',
                         total_numsteps, rollout_reward)

            print('Training: total timesteps {}, episode reward {}'.format(
                total_numsteps, rollout_reward))

            with contrib_summary.always_record_summaries():
                contrib_summary.scalar('reward',
                                       rollout_reward,
                                       step=total_numsteps)
                contrib_summary.scalar('length',
                                       rollout_timesteps,
                                       step=total_numsteps)

            if len(replay_buffer) >= FLAGS.min_samples_to_start:
                for _ in range(rollout_timesteps):
                    time_step = replay_buffer.sample(
                        batch_size=FLAGS.batch_size)
                    batch = TimeStep(*zip(*time_step))
                    model.update(batch)

                if total_numsteps - prev_save_timestep >= FLAGS.save_interval:
                    replay_buffer_var.assign(
                        zlib.compress(pickle.dumps(replay_buffer)))
                    gym_random_state_var.assign(
                        pickle.dumps(env.unwrapped.np_random.get_state()))
                    np_random_state_var.assign(
                        pickle.dumps(np.random.get_state()))
                    py_random_state_var.assign(pickle.dumps(random.getstate()))

                    saver.save(os.path.join(FLAGS.save_dir, 'checkpoint'),
                               global_step=total_numsteps)
                    prev_save_timestep = total_numsteps

                if total_numsteps - prev_eval_save_timestep >= FLAGS.eval_save_interval:
                    eval_saver.save(os.path.join(FLAGS.eval_save_dir,
                                                 'checkpoint'),
                                    global_step=total_numsteps)
                    prev_eval_save_timestep = total_numsteps
Example 27
    plt.figure(dpi=300, figsize=(20, 20))
    plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0)
    plt.imshow(a)
    #plt.show()
    plt.savefig('temp.png')


Y, X = np.mgrid[-1.3:1.3:0.001, -2:1:0.001]
Z = X + 1j * Y

num_gpus = tfe.num_gpus()

if num_gpus > 0:
    with tf.device('gpu:0'):
        xs = tf.constant(Z.astype(np.complex64))
        zs = tfe.Variable(xs)
        ns = tfe.Variable(tf.zeros_like(xs, tf.float32))
else:
    with tf.device('/cpu:0'):
        xs = tf.constant(Z.astype(np.complex64))
        zs = tfe.Variable(xs)
        ns = tfe.Variable(tf.zeros_like(xs, tf.float32))

# Operation to update the zs and the iteration count.
#
# Note: We keep computing zs after they diverge! This
#       is very wasteful! There are better, if a little
#       less simple, ways to do this.


def compute(zs, ns):