Example #1
    def __init__(self, observations, env_spec):
        with tf.name_scope('fully_conv_model'):
            spatial_streams = {
                name: spatial_stream(observations[name], spec)
                for name, spec in env_spec.observation_spec.items()
                if spec.is_spatial
            }

            fc = Concatenate()(
                [Flatten()(x) for x in spatial_streams.values()])
            fc = Dense(
                256,
                activation='relu',
                name='fc',
                kernel_initializer=tf.keras.initializers.Orthogonal())(fc)

            with tf.name_scope('policy'):
                self.policy = {}
                for name, spec in env_spec.action_spec.items():
                    with tf.name_scope(name):
                        if spec.obs_space:
                            logits = Conv2D(
                                1,
                                1,
                                activation='linear',
                                data_format='channels_first',
                                kernel_initializer=tf.keras.initializers.
                                Orthogonal(gain=0.1))(
                                    spatial_streams[spec.obs_space])
                            logits = Flatten()(logits)
                        else:
                            logits = Dense(
                                np.prod(spec.sizes),
                                activation='linear',
                                kernel_initializer=tf.keras.initializers.
                                Orthogonal(gain=0.1))(fc)

                        if name == 'function_id':
                            logits = tf.where(
                                observations['available_actions'] > 0,
                                logits,
                                -1000 * tf.ones_like(logits),
                                name='mask_unavailable_functions')

                        self.policy[name] = tfp.distributions.Categorical(
                            logits=logits)

            with tf.name_scope('actions'):
                self.actions = {
                    name: dist.sample(name=name + '_sample')
                    for name, dist in self.policy.items()
                }

            with tf.name_scope('value'):
                self.value = value_output(fc)
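The function_id head above keeps unavailable actions out of the sampled distribution by overwriting their logits with a large negative constant, so their probability is effectively zero. A minimal standalone sketch of that masking pattern (TF 1.x graph mode; the shapes and placeholder names are illustrative, not taken from the original model):

import tensorflow as tf
import tensorflow_probability as tfp

logits = tf.placeholder(tf.float32, [None, 8], name='function_logits')
available = tf.placeholder(tf.float32, [None, 8], name='available_actions')  # 1.0 = available

masked = tf.where(available > 0,
                  logits,
                  -1000 * tf.ones_like(logits),
                  name='mask_unavailable_functions')
action = tfp.distributions.Categorical(logits=masked).sample()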
Example #2
def _variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)
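A minimal usage sketch for the helper above (TF 1.x graph mode; the variable and log directory are placeholders chosen for illustration):

import tensorflow as tf

w = tf.Variable(tf.random_normal([128, 64]), name='weights')
_variable_summaries(w)              # attaches mean/stddev/min/max/histogram under 'summaries/'
merged = tf.summary.merge_all()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('/tmp/tb_logs', sess.graph)
    writer.add_summary(sess.run(merged), global_step=0)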
Example #3
 def _summary():
     with tf.name_scope('ActorCriticLoss'):
         tf.summary.scalar("values", tf.reduce_mean(value))
         tf.summary.scalar("returns", tf.reduce_mean(returns))
         tf.summary.scalar("advantages", tf.reduce_mean(advantages))
         tf.summary.scalar("explained_variance_of_return_by_value",
                           common.explained_variance(value, returns))
Example #4
    def _ph_op(self):
        with tf.name_scope("init_ph"):
            x, y, y_feature = self._input_shapes

            # x driving series
            self.x = tf.placeholder(dtype=tf.float32,
                                    shape=(None, ) + x,
                                    name='x')
            # future values of driving series
            self.y = tf.placeholder(dtype=tf.float32,
                                    shape=(None, ) + y,
                                    name='y')
            # future values of the ancillary series
            self.y_features = tf.placeholder(dtype=tf.float32,
                                             shape=(None, ) + y_feature,
                                             name='y_features')

            self.mu = tf.placeholder_with_default(0., shape=(), name='mu')
            self.std = tf.placeholder_with_default(1., shape=(), name='std')

            self.keep_prob = tf.placeholder_with_default(1.,
                                                         shape=(),
                                                         name='keep_prob')
            self.is_training = tf.placeholder_with_default(True,
                                                           shape=(),
                                                           name='is_training')
            self.gen_len = tf.placeholder_with_default(1,
                                                       shape=(),
                                                       name='gen_len')
            self.flag = tf.placeholder(shape=(), dtype=tf.bool)
Example #5
    def preprocess_observations(self):
        def one_hot_encode(x, scale):
            x = tf.squeeze(x, axis=1)
            x = tf.cast(x, tf.int32)
            return tf.one_hot(x, scale, axis=1)

        def preprocess_observation(input_obs, spec):
            if spec.is_spatial:
                features = Lambda(
                    lambda x: tf.split(x, x.get_shape()[1], axis=1))(input_obs)

                for f in spec.features:
                    if f.type == FeatureType.CATEGORICAL:
                        features[f.index] = Lambda(
                            lambda x: one_hot_encode(x, f.scale))(
                                features[f.index])
                    else:
                        features[f.index] = Lambda(lambda x: x / f.scale)(
                            features[f.index])

                return features
            else:
                return input_obs

        with tf.name_scope('preprocess_observations'):
            return {
                name: preprocess_observation(self.input_observations[name],
                                             spec)
                for name, spec in self.env_spec.observation_spec.items()
            }
Example #6
    def _train_op(self):
        with tf.name_scope("train_op"):
            d_opt = tf.train.GradientDescentOptimizer(self.d_lr)
            var_list = tf.trainable_variables(self.scope + "/discriminator")

            gvs, d_norm = clip_grads(self.d_loss, var_list)
            self.d_train = d_opt.minimize(self.d_loss,
                                          var_list=var_list,
                                          global_step=self._global_step)

            g_opt = tf.train.AdamOptimizer(self.g_lr)
            var_list = tf.trainable_variables(self.scope + "/generator")

            gvs, g_norm = clip_grads(self.g_loss, var_list)
            self.g_train = g_opt.minimize(self.g_loss,
                                          var_list=var_list,
                                          global_step=self._global_step)

            # g_train = g_opt.apply_gradients(gvs, global_step=self._global_step)

            self.train = tf.cond(self.flag, lambda: self.g_train,
                                 lambda: self.d_train)

            self._summary_dict.update({
                "distance":
                self._gen_norm(self.x_fake, self.y),
                "g_norm":
                g_norm,
                "d_norm":
                d_norm,
                "g_loss":
                self.g_loss,
                "d_loss":
                self.d_loss
            })
Example #7
    def _build_select_slate_op(self):
        p_no_click = self._prob_no_click_ph
        p = self._doc_affinity_scores_ph
        q = self._net_outputs.q_values[0]
        with tf.name_scope('select_slate'):
            self._output_slate = self._select_slate_fn(self._slate_size,
                                                       p_no_click, p, q)

        self._output_slate = tf.Print(
            self._output_slate,
            [tf.constant('cp 1'), self._output_slate, p, q],
            summarize=10000)
        self._output_slate = tf.reshape(self._output_slate,
                                        (self._slate_size, ))

        self._action_counts = tf.get_variable(
            'action_counts',
            shape=[self._num_candidates],
            initializer=tf.zeros_initializer())
        output_slate = tf.reshape(self._output_slate, [-1])
        output_one_hot = tf.one_hot(output_slate, self._num_candidates)
        update_ops = []
        for i in range(self._slate_size):
            update_ops.append(
                tf.assign_add(self._action_counts, output_one_hot[i]))
        self._select_action_update_op = tf.group(*update_ops)
Example #8
 def _summary_op(self):
     with tf.name_scope("summary_op"):
         # self._summary_list += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
         metrics = regr_metrics(y=self.y, y_hat=self.y_hat)
         metrics = {k: tf.reduce_mean(v) for k, v in metrics.items()}
         self._summary_dict.update(metrics)
         self.summary = summary_op(t_dict=self._summary_dict)
Example #9
 def _build_networks(self):
     with tf.name_scope('networks'):
         self._replay_net_outputs = self._network_adapter(
             self._replay.states, 'Online')
         self._replay_next_target_net_outputs = self._network_adapter(
             self._replay.states, 'Target')
         self._net_outputs = self._network_adapter(self.state_ph, 'Online')
         self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0]
Example #10
 def _build_networks(self):
     with tf.name_scope('networks'):
         self._replay_net_outputs = self._network_adapter(
             self._replay.states, 'Online')
         self._replay_next_target_net_outputs = self._network_adapter(
             self._replay.states, 'Target')
         self._net_outputs = self._network_adapter(self.state_ph, 'Online')
         self._build_select_slate_op()
Example #11
    def value_loss(self):
        with tf.name_scope('value_loss'):
            loss = tf.losses.mean_squared_error(
                self.model.value, self.input_returns) * self.value_factor

        tf.summary.scalar('value_loss', loss, family='losses')

        return loss
Example #12
    def _loss_op(self):
        with tf.name_scope("loss_op"):
            self.d_loss = tf.reduce_mean(self._fake_d) - tf.reduce_mean(
                self._true_d)
            self.g_loss = -tf.reduce_mean(self._fake_d)

            # reg = self._reg(tf.shape(self.x)[0], self.d, self.x, self.x_fake)
            # self.d_loss += reg
            self.loss = [self.d_loss, self.g_loss]
Example #13
    def _train_op(self):
        with tf.name_scope("train_op"):
            opt = train_fn(global_step=self._global_step)
            gvs, norm = clip_grads(self.loss, self.vars)

            # self.train = opt.apply_gradients(gvs, global_step=self._global_step)
            self.train = opt.minimize(self.loss,
                                      var_list=self.vars,
                                      global_step=self._global_step)
            self._summary_dict.update({"norm": norm})
Example #14
    def prediction_loss(self, truths, palette):
        def spatial_loss(truth_features, predicted_features, space_desc):
            feature_losses = []
            for truth, prediction, spec in zip(truth_features,
                                               predicted_features,
                                               space_desc.features):
                if spec.type == FeatureType.CATEGORICAL:
                    truth = tf.transpose(truth, (0, 2, 3, 1))
                    prediction = tf.transpose(prediction, (0, 2, 3, 1))
                    feature_losses.append(
                        tf.losses.softmax_cross_entropy(truth, prediction))

                    summary_image = tf.argmax(
                        tf.concat([truth, prediction], 2), 3)
                    summary_image = tf.gather(
                        palette[space_desc.index][spec.index], summary_image)
                    tf.summary.image(spec.name, summary_image)
                else:
                    feature_losses.append(
                        tf.losses.mean_squared_error(truth, prediction))

                    summary_image = tf.concat([truth, prediction], 3)
                    tf.summary.image(spec.name,
                                     tf.transpose(summary_image, (0, 2, 3, 1)))

                tf.summary.scalar(spec.name, feature_losses[-1])

            return tf.reduce_mean(tf.stack(feature_losses))

        with tf.name_scope('prediction_loss'):
            spatial_losses = []
            for s in self.env_spec.spaces:
                with tf.name_scope(s.name):
                    loss = spatial_loss(truths[s.index],
                                        self.out_pred[s.index], s)
                    spatial_losses.append(loss)
                    tf.summary.scalar('loss', loss)

            loss = tf.reduce_mean(tf.stack(spatial_losses))
            tf.summary.scalar('loss', loss)

        return loss
Example #15
def selu(x):
    """
    SELU activation
    https://arxiv.org/abs/1706.02515
    :param x: input tensor
    :return: the SELU activation applied element-wise to x
    """
    with tf.name_scope('elu') as scope:
        alpha = 1.6732632423543772848170429916717
        scale = 1.0507009873554804934193349852946
        return scale * tf.where(x >= 0.0, x, alpha * tf.nn.elu(x))
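For reference, the closed form is selu(x) = scale * x for x >= 0 and scale * alpha * (exp(x) - 1) for x < 0; since tf.nn.elu(x) already equals x for non-negative inputs, the tf.where above only changes the negative branch. A quick NumPy cross-check (standalone, for illustration only):

import numpy as np

alpha = 1.6732632423543772848170429916717
scale = 1.0507009873554804934193349852946

def selu_np(x):
    # piecewise form of the activation computed by selu() above
    return scale * np.where(x >= 0.0, x, alpha * (np.exp(x) - 1.0))

print(selu_np(np.array([-2.0, -0.5, 0.0, 0.5, 2.0])))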
Example #16
    def init_subagents(self,
                       model_fns,
                       obs_specs,
                       act_specs,
                       policy_clses,
                       n_subagents=0,
                       subagent_variable_scopes=[]):
        assert n_subagents == len(model_fns) == len(obs_specs) == len(
            policy_clses
        ) == len(act_specs) == len(
            subagent_variable_scopes
        ), "n_subagents must equal the number of model_fns, obs_specs, act_specs, policy_clses and subagent_variable_scopes"

        self.subagents = {}
        for model_fn, obs_spec, act_spec, policy_cls, subagent_variable_scope in zip(
                model_fns, obs_specs, act_specs, policy_clses,
                subagent_variable_scopes):
            subagent = Subagent()
            subagent_dir = self.subagent_dirs[subagent_variable_scope]

            print(LOGGING_MSG_HEADER, 'resetting tf graph for subagent: ',
                  subagent_variable_scope)
            tf.reset_default_graph()
            subagent.sess_mgr = SessionManager(
                base_path=subagent_dir,
                training_enabled=False,
                model_variable_scope=subagent_variable_scope)
            subagent.sess = subagent.sess_mgr.sess
            subagent.variable_scope = subagent_variable_scope

            with subagent.sess.graph.as_default():
                with tf.name_scope(
                        subagent.sess_mgr.main_tf_vs.original_name_scope):
                    subagent.model = model_fn(obs_spec, act_spec)
                    subagent.value = subagent.model.outputs[-1]
                    subagent.policy = policy_cls(act_spec,
                                                 subagent.model.outputs[:-1])
                    print(LOGGING_MSG_HEADER, subagent.variable_scope,
                          ' model setup successful')

                    subagent.sess_mgr.restore_or_init()
                    print(LOGGING_MSG_HEADER, subagent.variable_scope,
                          ' model restore successful')

            self.subagents[subagent_variable_scope] = subagent

        self.subagents_idx_key_dict = {}
        for idx, subagent_variable_scope in enumerate(self.subagents.keys()):
            self.subagents_idx_key_dict[idx] = subagent_variable_scope

        print(LOGGING_MSG_HEADER + "{} subagents are available: {}".format(
            self.n_subagents, self.subagents_idx_key_dict))
        print("type their respective index to select them")
Example #17
    def __init__(self,
                 observations,
                 env_spec,
                 dense_layer_size=(512, ),
                 activation='elu',
                 output_predictions_fn=None):

        with tf.name_scope('model'):
            with tf.name_scope('input'):
                spatial_features = [
                    input_block(observations[name], name, spec)
                    for name, spec in env_spec.observation_spec.items()
                    if spec.is_spatial
                ]

            with tf.name_scope('core'):
                spatial_features = [Flatten()(f) for f in spatial_features]
                spatial_features = Concatenate(
                    name='concatenate_features')(spatial_features)

                dense = spatial_features
                for i, size in enumerate(dense_layer_size):
                    op = Dense(size,
                               activation=activation,
                               name='state_dense_' + str(i))
                    dense = op(dense)

                    tf.summary.histogram(
                        'state_dense_' + str(i) + '_kernel_weights',
                        op.weights[0])

                tf.summary.scalar('dense_zero_fraction',
                                  tf.nn.zero_fraction(dense))
                tf.summary.histogram('dense_input', spatial_features)
                tf.summary.histogram('dense_output', dense)

            with tf.name_scope('value'):
                self.value = value_output(dense)

            with tf.name_scope('policy'):
                self.policy = policy_output(dense,
                                            observations['available_actions'],
                                            env_spec.action_spec)

            with tf.name_scope('actions'):
                self.actions = sample_policy(self.policy)

            if output_predictions_fn:
                with tf.name_scope('prediction'):
                    self.prediction = [
                        output_predictions_fn(dense, s)
                        for s in env_spec.observation_spec.values()
                        if s.is_spatial
                    ]
Example #18
def state_rewards(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  weight_index=None,
                  state_indices=None,
                  weight_vector=1.0,
                  offset_vector=0.0,
                  summarize=False):
  """Returns the rewards that are linear mapping of next_states.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
        of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    weight_index: (integer) Index of contexts lists that specify weighting.
    state_indices: (a list of Numpy integer array) Indices of states dimensions
      to be mapped.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    offset_vector: (a number or a list or Numpy array) The offset vector.
    summarize: (boolean) enable summary ops.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del states, actions, rewards  # unused args
  stats = {}
  record_tensor(next_states, state_indices, stats)
  next_states = index_states(next_states, state_indices)
  weight = tf.constant(
      weight_vector, dtype=next_states.dtype, shape=next_states[0].shape)
  weights = tf.expand_dims(weight, 0)
  offset = tf.constant(
      offset_vector, dtype=next_states.dtype, shape=next_states[0].shape)
  offsets = tf.expand_dims(offset, 0)
  if weight_index is not None:
    weights *= contexts[weight_index]
  rewards = tf.to_float(tf.reduce_sum(weights * (next_states+offsets), axis=1))
  if summarize:
    with tf.name_scope('RewardFn/'):
      summarize_stats(stats)
  return rewards, tf.ones_like(rewards)
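The reward above is simply a weighted (and offset) sum over the selected next-state dimensions. A small NumPy sketch of that linear mapping with made-up shapes; it mirrors the final tf.reduce_sum line, not the helper plumbing (record_tensor, index_states):

import numpy as np

next_states = np.random.randn(4, 3)          # [batch_size, num_state_dims]
weight_vector = np.array([1.0, 0.5, 0.0])    # broadcastable to next_states
offset_vector = 0.0

weights = weight_vector[None, :]             # [1, num_state_dims]
rewards = np.sum(weights * (next_states + offset_vector), axis=1)   # [batch_size]
discounts = np.ones_like(rewards)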
Example #19
    def _network_adapter(self, states, scope):
        self._validate_states(states)

        with tf.name_scope('network'):
            q_value_list = []
            for slate in self._all_possible_slates:
                user = tf.squeeze(states[:, 0, :, :], axis=2)
                docs = []
                for i in slate:
                    docs.append(tf.squeeze(states[:, i + 1, :, :], axis=2))
                q_value_list.append(
                    self.network(user, tf.concat(docs, axis=1), scope))
            q_values = tf.concat(q_value_list, axis=1)

        return dqn_agent.DQNNetworkType(q_values)
Example #20
def preprocess_spatial_observation(input_obs, spec, categorical_embedding_dims=16, non_categorical_scaling='log'):
    with tf.name_scope('preprocess_spatial_obs'):
        features = Lambda(lambda x: tf.split(x, x.get_shape()[1], axis=1))(input_obs)

        for f in spec.features:
            if f.is_categorical:
                features[f.index] = Lambda(lambda x: tf.squeeze(x, axis=1))(features[f.index])
                features[f.index] = Embedding(f.scale, categorical_embedding_dims)(features[f.index])
                features[f.index] = Permute((3, 1, 2))(features[f.index])
            else:
                if non_categorical_scaling == 'log':
                    features[f.index] = Lambda(lambda x: tf.log(x + 1e-10))(features[f.index])
                elif non_categorical_scaling == 'normalize':
                    features[f.index] = Lambda(lambda x: x / f.scale)(features[f.index])

    return features
Example #21
    def _loss_op(self):
        with tf.name_scope("loss_op"):

            weights = tf.ones_like(self.y, name='weights')
            self.loss = sequence_loss(self.y_hat,
                                      self.y,
                                      weights=weights,
                                      loss_fn=which_loss(self._config.loss))
            self._summary_dict.update({"loss": self.loss})

            if hasattr(self, '_reg'):
                reg = tf.reduce_sum(self._reg)
                self.loss += reg
                self._summary_dict.update({"loss": self.loss, "reg": reg})
            else:
                self._summary_dict.update({"loss": self.loss})
Example #22
def tanh_similarity(states,
                    actions,
                    rewards,
                    next_states,
                    contexts,
                    mse_scale=1.0,
                    state_scales=1.0,
                    goal_scales=1.0,
                    summarize=False):
  """Returns the similarity between next_states and contexts using tanh and mse.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
        of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    mse_scale: A float, to scale mse before tanh.
    state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
      must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for contexts. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    summarize: (boolean) enable summary ops.


  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del states, actions, rewards  # Unused
  mse = tf.reduce_mean(tf.squared_difference(next_states * state_scales,
                                             contexts[0] * goal_scales), -1)
  tanh = tf.tanh(mse_scale * mse)
  if summarize:
    with tf.name_scope('RewardFn/'):
      tf.summary.scalar('mean_mse', tf.reduce_mean(mse))
      tf.summary.histogram('mse', mse)
      tf.summary.scalar('mean_tanh', tf.reduce_mean(tanh))
      tf.summary.histogram('tanh', tanh)
  rewards = tf.to_float(1 - tanh)
  return rewards, tf.ones_like(rewards)
Example #23
    def entropy_loss(self):
        with tf.name_scope('entropy_loss'):
            entropies = [
                dist.entropy() for name, dist in self.model.policy.items()
            ]
            entropy = tf.reduce_mean(tf.add_n(entropies))
            entropy_loss = -entropy * self.entropy_factor

        entropy_masked = tf.stack(entropies, axis=-1) * tf.gather(
            self.function_args_mask, self.input_actions['function_id'])
        entropy_masked = tf.reduce_mean(tf.reduce_sum(entropy_masked, axis=-1))
        tf.summary.scalar('policy_entropy', entropy, family='entropy')
        tf.summary.scalar('policy_entropy_masked',
                          entropy_masked,
                          family='entropy')
        tf.summary.scalar('entropy_loss', entropy_loss, family='losses')

        return entropy_loss
Example #24
    def _network_adapter(self, states, scope):
        self._validate_states(states)

        with tf.name_scope('network'):
            # Since we decompose the slate optimization into an item-level
            # optimization problem, the observation space is the user state
            # observation plus all documents' observations. In the Dopamine DQN agent
            # implementation, there is one head for each possible action value, which
            # is designed for computing the argmax operation in the action space.
            # In our implementation, we generate one output for each document.
            q_value_list = []
            for i in range(self._num_candidates):
                user = tf.squeeze(states[:, 0, :, :], axis=2)
                doc = tf.squeeze(states[:, i + 1, :, :], axis=2)
                q_value_list.append(self.network(user, doc, scope))
            q_values = tf.concat(q_value_list, axis=1)

        return dqn_agent.DQNNetworkType(q_values)
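The comment above describes emitting one Q-value per candidate document rather than one head per slate. A shape-only sketch of that adapter with dummy sizes; score() is a stand-in for self.network, and every name here is illustrative:

import tensorflow as tf

num_candidates, obs_dim = 5, 7
# states: [batch, 1 + num_candidates, obs_dim, 1]; row 0 is the user, the rest are documents
states = tf.placeholder(tf.float32, [None, 1 + num_candidates, obs_dim, 1])

def score(user, doc):
    # stand-in scoring network returning [batch, 1]
    return tf.reduce_sum(user * doc, axis=1, keepdims=True)

user = tf.squeeze(states[:, 0, :, :], axis=2)                  # [batch, obs_dim]
q_values = tf.concat(
    [score(user, tf.squeeze(states[:, i + 1, :, :], axis=2))
     for i in range(num_candidates)],
    axis=1)                                                    # [batch, num_candidates]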
Example #25
    def policy_loss(self):
        with tf.name_scope('policy_loss'):
            log_probs = [
                dist.log_prob(self.input_actions[name])
                for name, dist in self.model.policy.items()
            ]
            log_probs = tf.stack(log_probs, axis=-1)
            log_probs = log_probs * tf.gather(
                self.function_args_mask, self.input_actions['function_id'])

            advantage = self.input_returns - self.model.value

            policy_loss = -tf.reduce_mean(
                tf.reduce_sum(log_probs, axis=-1) *
                tf.stop_gradient(advantage)) * self.policy_factor

        tf.summary.scalar('policy_loss', policy_loss, family='losses')

        return policy_loss
Example #26
  def _attend(self, query, key, value, key_class_id):
    """Transformer attention function."""
    with tf.name_scope('attend'):
      q_shape = tf.shape(query)
      v_shape = tf.shape(value)

      n_q = q_shape[0]
      h_q = q_shape[1]
      w_q = q_shape[2]
      d = q_shape[3]

      n_v = v_shape[0]
      h_v = v_shape[1]
      w_v = v_shape[2]
      c = v_shape[3]

      q = tf.reshape(query, [-1, d])  # [n_q*Hq*Wq, d]
      k = tf.reshape(key, [-1, d])

      # [n_v*Hv*Wv, d] x [Nq*Hq*Wq, d]  --> [n_v*Hv*Wv, Nq*Hq*Wq]
      logits = tf.matmul(k, q, transpose_b=True)
      d_scale = tf.rsqrt(tf.cast(d, logits.dtype))

      # logits: [n_v, Hv*Wv, n_q*Hq*Wq]
      logits = tf.reshape(d_scale * logits, [n_v, h_v * w_v, -1])

      # attn: [n_v, Hv*Wv, n_q*Hq*Wq]
      attn = self.get_support_set_softmax(logits, key_class_id)

      # aggregate:
      v = tf.reshape(value, [n_v, h_v * w_v, c])

      # [n_v, Hv*Wv, n_q*Hq*Wq] x [n_v, Hv*Wv, c]  --> [n_v, n_q*Hq*Wq, c]
      v_agg = tf.einsum('ijk,ijl->ikl', attn, v)
      v_agg = tf.reshape(v_agg, [n_v, n_q, h_q, w_q, c])
      v_agg.set_shape([None, None, None, None, value.shape[-1]])

      return v_agg  # [N_c, n_q, Hq, Wq, c]
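The einsum above contracts the support spatial axis: attention weights of shape [n_v, Hv*Wv, n_q*Hq*Wq] against values of shape [n_v, Hv*Wv, c] yield aggregated features of shape [n_v, n_q*Hq*Wq, c]. A NumPy check of that shape contract with arbitrary small sizes:

import numpy as np

n_v, hw_v, nq_hw_q, c = 3, 6, 10, 4
attn = np.random.rand(n_v, hw_v, nq_hw_q)
v = np.random.rand(n_v, hw_v, c)

v_agg = np.einsum('ijk,ijl->ikl', attn, v)   # sums over the Hv*Wv axis
print(v_agg.shape)                           # (3, 10, 4) == (n_v, n_q*Hq*Wq, c)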
Example #27
  def create_sampling_ops(self, use_staging):
    """Creates the ops necessary to sample from the replay buffer.

    Creates the transition dictionary containing the sampling tensors.

    Args:
      use_staging: bool, when True it would use a staging area to prefetch
        the next sampling batch.
    """
    with tf.name_scope('sample_replay'):
      with tf.device('/cpu:*'):
        transition_type = self.memory.get_transition_elements()
        transition_tensors = tf.py_func(
            self.memory.sample_transition_batch, [],
            [return_entry.type for return_entry in transition_type],
            name='replay_sample_py_func')
        self._set_transition_shape(transition_tensors, transition_type)
        if use_staging:
          transition_tensors = self._set_up_staging(transition_tensors)
          self._set_transition_shape(transition_tensors, transition_type)

        # Unpack sample transition into member variables.
        self.unpack_transition(transition_tensors, transition_type)
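The core pattern above is wrapping a plain-Python sampler in tf.py_func and then pinning the output shapes by hand, since py_func tensors carry no static shape information. A stripped-down sketch of that pattern (TF 1.x; the sampler below is a placeholder, not the real replay memory):

import numpy as np
import tensorflow as tf

def sample_batch():
    # stand-in for memory.sample_transition_batch()
    states = np.random.rand(32, 84).astype(np.float32)
    actions = np.random.randint(0, 4, size=32).astype(np.int32)
    return states, actions

with tf.name_scope('sample_replay'):
    states, actions = tf.py_func(sample_batch, [], [tf.float32, tf.int32],
                                 name='replay_sample_py_func')
    states.set_shape([32, 84])    # py_func outputs have unknown shape; set it explicitly
    actions.set_shape([32])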
Example #28
    def _loss_op(self):
        with tf.name_scope("loss_op"):
            # labels = tf.distributions.Uniform(low=0.7, high=1.2).sample(tf.shape(self._true_d))
            labels = tf.ones_like(self._true_d)
            d_loss_true = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self._true_d,
                    labels=labels,
                ))

            # labels = tf.distributions.Uniform(low=0., high=0.3).sample(tf.shape(self._fake_d))
            labels = tf.zeros_like(self._fake_d)
            d_loss_fake = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=self._fake_d,
                                                        labels=labels))

            self.d_loss = d_loss_true + d_loss_fake

            self.g_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=self._fake_d,
                                                        labels=tf.ones_like(
                                                            self._fake_d)))
            self.loss = [self.d_loss, self.g_loss]
Example #29
def diff_distance(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  state_scales=1.0,
                  goal_scales=1.0,
                  reward_scales=1.0,
                  weight_index=None,
                  weight_vector=None,
                  summarize=False,
                  termination_epsilon=1e-4,
                  state_indices=None,
                  goal_indices=None,
                  norm='L2',
                  epsilon=1e-10):
  """Returns the difference in euclidean distance between states/next_states and contexts.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
        of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
      must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
      must be broadcastable to number of reward dimensions.
    weight_index: (integer) The context list index that specifies weight.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    summarize: (boolean) enable summary ops.
    termination_epsilon: terminate if dist is less than this quantity.
    state_indices: (a list of integers) list of state indices to select.
    goal_indices: (a list of integers) list of goal indices to select.
    norm: L1 or L2.
    epsilon: small offset to ensure non-negative/zero distance.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del actions, rewards  # Unused
  stats = {}
  record_tensor(next_states, state_indices, stats, 'next_states')
  next_states = index_states(next_states, state_indices)
  states = index_states(states, state_indices)
  goals = index_states(contexts[0], goal_indices)
  next_sq_dists = tf.squared_difference(next_states * state_scales,
                                        goals * goal_scales)
  sq_dists = tf.squared_difference(states * state_scales,
                                   goals * goal_scales)
  record_tensor(sq_dists, None, stats, 'sq_dists')
  if weight_vector is not None:
    next_sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
    sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
  if weight_index is not None:
    next_sq_dists *= contexts[weight_index]
    sq_dists *= contexts[weight_index]
  if norm == 'L1':
    next_dist = tf.sqrt(next_sq_dists + epsilon)
    dist = tf.sqrt(sq_dists + epsilon)
    next_dist = tf.reduce_sum(next_dist, -1)
    dist = tf.reduce_sum(dist, -1)
  elif norm == 'L2':
    next_dist = tf.reduce_sum(next_sq_dists, -1)
    next_dist = tf.sqrt(next_dist + epsilon)  # tf.gradients fails when tf.sqrt(-0.0)
    dist = tf.reduce_sum(sq_dists, -1)
    dist = tf.sqrt(dist + epsilon)  # tf.gradients fails when tf.sqrt(-0.0)
  else:
    raise NotImplementedError(norm)
  discounts = next_dist > termination_epsilon
  if summarize:
    with tf.name_scope('RewardFn/'):
      tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
      tf.summary.histogram('dist', dist)
      summarize_stats(stats)
  diff = dist - next_dist
  diff *= reward_scales
  return tf.to_float(diff), tf.to_float(discounts)
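Besides the reward itself (the decrease in distance to the goal), the second return value doubles as a termination signal: the discount drops to 0 once the next state is within termination_epsilon of the goal. A compact NumPy sketch of the L2 branch, with the scale/weight options omitted:

import numpy as np

epsilon, termination_epsilon = 1e-10, 1e-4
states = np.random.randn(4, 3)
next_states = np.random.randn(4, 3)
goals = np.random.randn(4, 3)

dist = np.sqrt(np.sum((states - goals) ** 2, axis=-1) + epsilon)
next_dist = np.sqrt(np.sum((next_states - goals) ** 2, axis=-1) + epsilon)

rewards = dist - next_dist                                # positive when we moved closer
discounts = (next_dist > termination_epsilon).astype(np.float32)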
Example #30
    def __init__(self,
                 num_actions,
                 observation_size,
                 stack_size,
                 use_staging=True,
                 replay_capacity=1000000,
                 batch_size=32,
                 update_horizon=1,
                 gamma=1.0,
                 wrapped_memory=None):
        """Initializes a graph wrapper for the python replay memory.

    Args:
      num_actions: int, number of possible actions.
      observation_size: int, size of an input frame.
      stack_size: int, number of frames to use in state stack.
      use_staging: bool, when True it would use a staging area to prefetch the
        next sampling batch.
      replay_capacity: int, number of transitions to keep in memory.
      batch_size: int.
      update_horizon: int, length of update ('n' in n-step update).
      gamma: float, the discount factor.
      wrapped_memory: The 'inner' memory data structure. Defaults to None, which
        creates the standard DQN replay memory.

    Raises:
      ValueError: If update_horizon is not positive.
      ValueError: If discount factor is not in [0, 1].
    """
        if replay_capacity < update_horizon + 1:
            raise ValueError(
                'Update horizon (%i) should be significantly smaller '
                'than replay capacity (%i).' %
                (update_horizon, replay_capacity))
        if not update_horizon >= 1:
            raise ValueError('Update horizon must be positive.')
        if not 0.0 <= gamma <= 1.0:
            raise ValueError('Discount factor (gamma) must be in [0, 1].')

        # Allow subclasses to create self.memory.
        if wrapped_memory is not None:
            self.memory = wrapped_memory
        else:
            self.memory = OutOfGraphReplayMemory(num_actions, observation_size,
                                                 stack_size, replay_capacity,
                                                 batch_size, update_horizon,
                                                 gamma)

        with tf.name_scope('replay'):
            with tf.name_scope('add_placeholders'):
                self.add_obs_ph = tf.placeholder(tf.uint8, [observation_size],
                                                 name='add_obs_ph')
                self.add_action_ph = tf.placeholder(tf.int32, [],
                                                    name='add_action_ph')
                self.add_reward_ph = tf.placeholder(tf.float32, [],
                                                    name='add_reward_ph')
                self.add_terminal_ph = tf.placeholder(tf.uint8, [],
                                                      name='add_terminal_ph')
                self.add_legal_actions_ph = tf.placeholder(
                    tf.float32, [num_actions], name='add_legal_actions_ph')

            add_transition_ph = [
                self.add_obs_ph, self.add_action_ph, self.add_reward_ph,
                self.add_terminal_ph, self.add_legal_actions_ph
            ]

            with tf.device('/cpu:*'):
                self.add_transition_op = tf.py_func(self.memory.add,
                                                    add_transition_ph, [],
                                                    name='replay_add_py_func')

                self.transition = tf.py_func(
                    self.memory.sample_transition_batch, [], [
                        tf.uint8, tf.int32, tf.float32, tf.uint8, tf.uint8,
                        tf.int32, tf.float32
                    ],
                    name='replay_sample_py_func')

                if use_staging:
                    # To hide the py_func latency use a staging area to pre-fetch the next
                    # batch of transitions.
                    (states, actions, rewards, next_states, terminals, indices,
                     next_legal_actions) = self.transition
                    # StagingArea requires all the shapes to be defined.
                    states.set_shape(
                        [batch_size, observation_size, stack_size])
                    actions.set_shape([batch_size])
                    rewards.set_shape([batch_size])
                    next_states.set_shape(
                        [batch_size, observation_size, stack_size])
                    terminals.set_shape([batch_size])
                    indices.set_shape([batch_size])
                    next_legal_actions.set_shape([batch_size, num_actions])

                    # Create the staging area in CPU.
                    prefetch_area = tf.contrib.staging.StagingArea([
                        tf.uint8, tf.int32, tf.float32, tf.uint8, tf.uint8,
                        tf.int32, tf.float32
                    ])

                    self.prefetch_batch = prefetch_area.put(
                        (states, actions, rewards, next_states, terminals,
                         indices, next_legal_actions))
                else:
                    self.prefetch_batch = tf.no_op()

            if use_staging:
                # Get the sample_transition_batch in GPU. This would do the copy from
                # CPU to GPU.
                self.transition = prefetch_area.get()

            (self.states, self.actions, self.rewards, self.next_states,
             self.terminals, self.indices,
             self.next_legal_actions) = self.transition

            # Since these are py_func tensors, no information about their shape is
            # present. Setting the shape only for the necessary tensors
            self.states.set_shape([None, observation_size, stack_size])
            self.next_states.set_shape([None, observation_size, stack_size])
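The prefetching above hinges on tf.contrib.staging.StagingArea: put() is run together with each training step so the next batch is staged while the current one is consumed, and get() hands back the batch staged on the previous step. A minimal sketch of that put/get rhythm (TF 1.x with tf.contrib; the tensors and shapes below are stand-ins):

import tensorflow as tf

states_in = tf.random_uniform([32, 84])                         # stand-in for sampled states
actions_in = tf.random_uniform([32], maxval=4, dtype=tf.int32)  # stand-in for sampled actions

area = tf.contrib.staging.StagingArea([tf.float32, tf.int32])
prefetch_batch = area.put((states_in, actions_in))              # stage the *next* batch
states, actions = area.get()                                    # fetch the previously staged batch

with tf.Session() as sess:
    sess.run(prefetch_batch)                                    # prime the staging area once
    for _ in range(3):
        sess.run([states, actions, prefetch_batch])             # consume one batch, stage another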