# Assumed imports for the snippets below (TF 1.x, where AutoGraph lived in
# tf.contrib; the exact module path is an assumption based on that era's API):
import tensorflow as tf
from tensorflow.contrib import autograph as ag

def nested_fill(m, n):
    mat = []
    ag.set_element_type(mat, tf.int32)
    for _ in range(m):
        l = []
        ag.set_element_type(l, tf.int32)
        for j in range(n):
            l.append(j)
        mat.append(ag.stack(l, strict=False))
    return ag.stack(mat, strict=False)
def element_update():
    l = []
    l.append(1)
    l.append(2)
    l.append(3)
    ag.set_element_type(l, tf.int32)
    l[1] = 5
    return ag.stack(l, strict=False)
def read_write_loop(n):
    l = []
    l.append(1)
    l.append(1)
    ag.set_element_type(l, tf.int32)
    for i in range(2, n):
        l.append(l[i - 1] + l[i - 2])
        l[i - 2] = -l[i - 2]
    return ag.stack(l, strict=False)
def simple_empty(n):
    l = []
    l.append(1)
    l.append(2)
    l.append(3)
    l.append(4)
    ag.set_element_type(l, tf.int32, ())
    s = 0
    for _ in range(n):
        s += l.pop()
    return ag.stack(l, strict=False), s
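The extra () passed to set_element_type in simple_empty is worth noting: besides the dtype, it records the element shape, which the staged pop() needs in order to return a tensor of known shape. A minimal sketch in the same style (the function below is illustrative, not one of the original examples):

def pop_last_two(n):
    l = []
    # Declare scalar int32 elements up front; the explicit () shape is what
    # lets pop() be staged on the converted list.
    ag.set_element_type(l, tf.int32, ())
    for i in range(n):
        l.append(i)
    a = l.pop()
    b = l.pop()
    return ag.stack(l, strict=False), a + b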
Example #5
def graph_train_model(policy_network, cart_pole_env, optimizer, iterations):
    """Trains the policy network for a given number of iterations."""
    i = tf.constant(0)
    mean_steps_per_iteration = []
    ag.set_element_type(mean_steps_per_iteration, tf.int32)

    while i < iterations:
        steps_per_game = policy_network.train(cart_pole_env,
                                              optimizer,
                                              discount_rate=0.95,
                                              num_games=20,
                                              max_steps_per_game=200)
        mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game))
        i += 1

    return ag.stack(mean_steps_per_iteration)
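A hedged sketch of how graph_train_model might be staged and executed under TF 1.x contrib AutoGraph. PolicyNetwork and CartPoleEnv stand in for whatever model class and environment wrapper the surrounding project defines; those names, the learning rate, and the iteration count are assumptions, not part of this excerpt:

tf_train_model = ag.to_graph(graph_train_model)

with tf.Graph().as_default():
    policy_network = PolicyNetwork()   # hypothetical model class
    cart_pole_env = CartPoleEnv()      # hypothetical environment wrapper
    optimizer = tf.train.AdamOptimizer(0.05)
    mean_steps = tf_train_model(policy_network, cart_pole_env, optimizer,
                                tf.constant(10))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(mean_steps))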
Example #7
    def train(self, cart_pole_env, optimizer, discount_rate, num_games,
              max_steps_per_game):
        var_list = tf.trainable_variables()
        grad_list = [
            tf.TensorArray(tf.float32, 0, dynamic_size=True) for _ in var_list
        ]

        step_counts = []
        discounted_rewards = []
        ag.set_element_type(discounted_rewards, tf.float32)
        ag.set_element_type(step_counts, tf.int32)

        # Note: cart_pole_env is a shared Python object. Because calls to its
        # methods are made through py_func, TensorFlow cannot detect their data
        # dependencies. Hence we must manually synchronize access to it and set
        # the control dependencies so that calls to reset(), step(), etc. are
        # made in the correct order.
        sync_counter = tf.constant(0)

        for _ in tf.range(num_games):
            with tf.control_dependencies([sync_counter]):
                obs = cart_pole_env.reset()
                with tf.control_dependencies([obs]):
                    sync_counter += 1

                game_rewards = []
                ag.set_element_type(game_rewards, tf.float32)

                for step in tf.range(max_steps_per_game):
                    logits, actions = self(obs)  # pylint:disable=not-callable
                    logits = tf.reshape(logits, ())
                    actions = tf.reshape(actions, ())

                    labels = 1.0 - tf.cast(actions, tf.float32)
                    loss = tf.nn.sigmoid_cross_entropy_with_logits(
                        labels=labels, logits=logits)
                    grads = tf.gradients(loss, var_list)

                    for i in range(len(grads)):
                        grad_list[i].append(grads[i])

                    with tf.control_dependencies([sync_counter]):
                        obs, reward, done = cart_pole_env.step(actions)
                        with tf.control_dependencies([obs]):
                            sync_counter += 1
                        obs = tf.reshape(obs, (1, 4))

                    game_rewards.append(reward)
                    if reward < 0.1 or done:
                        step_counts.append(step + 1)
                        break

                discounted_rewards = graph_append_discounted_rewards(
                    discounted_rewards, game_rewards, discount_rate)

        discounted_rewards = ag.stack(discounted_rewards)
        discounted_rewards.set_shape((None, ))
        mean, variance = tf.nn.moments(discounted_rewards, [0])
        normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance)

        for i in range(len(grad_list)):
            g = ag.stack(grad_list[i])

            # Append singleton dimensions to r (at most two) so that it
            # broadcasts against g in the multiplication below.
            r = normalized_rewards
            if r.shape.ndims < g.shape.ndims:
                r = tf.expand_dims(r, -1)
            if r.shape.ndims < g.shape.ndims:
                r = tf.expand_dims(r, -1)

            grad_list[i] = tf.reduce_mean(g * r, axis=0)

        optimizer.apply_gradients(zip(grad_list, var_list),
                                  global_step=tf.train.get_global_step())

        return ag.stack(step_counts)
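train() relies on graph_append_discounted_rewards, which is not included in this excerpt. A minimal sketch of what such a helper could look like, assuming it computes per-step discounted returns for one game and appends them to the running list; the body below is an assumption, not the original implementation:

def graph_append_discounted_rewards(destination, game_rewards, discount_rate):
    """Hypothetical sketch: appends one game's discounted returns to destination."""
    rewards = ag.stack(game_rewards)
    num_steps = tf.shape(rewards)[0]
    running_return = tf.constant(0.0)
    returns = []
    ag.set_element_type(returns, tf.float32, ())
    # Walk the rewards from last step to first, accumulating the
    # discount_rate-weighted return; the list comes out in reverse order.
    for i in tf.range(num_steps - 1, -1, -1):
        running_return = rewards[i] + discount_rate * running_return
        returns.append(running_return)
    returns = tf.reverse(ag.stack(returns), axis=[0])
    for i in tf.range(num_steps):
        destination.append(returns[i])
    return destination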
def simple_fill(n):
    l = []
    ag.set_element_type(l, tf.int32)
    for i in range(n):
        l.append(i)
    return ag.stack(l, strict=False)
def type_not_annotated(n):
    l = []
    # TODO(mdan): Here, we ought to infer the dtype and shape when i is staged.
    for i in range(n):
        l.append(i)
    return ag.stack(l, strict=False)
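For completeness, a hedged sketch (assuming TF 1.x contrib AutoGraph) of how these list examples are meant to be driven: each function is converted with ag.to_graph and the resulting graph function is run in a session. The harness below is illustrative only:

tf_simple_fill = ag.to_graph(simple_fill)

with tf.Graph().as_default():
    filled = tf_simple_fill(tf.constant(5))
    with tf.Session() as sess:
        print(sess.run(filled))  # Expected: [0 1 2 3 4]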