Code Example #1
File: ppo.py Project: VladAlexandruIlie/SC2EtherBot
    def loss_fn(self):
        adv = tf.placeholder(tf.float32, [None], name="advantages")
        returns = tf.placeholder(tf.float32, [None], name="returns")
        logli_old = tf.placeholder(tf.float32, [None], name="logli_old")
        value_old = tf.placeholder(tf.float32, [None], name="value_old")

        ratio = tf.exp(self.policy.logli - logli_old)
        clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio,
                                         1 + self.clip_ratio)

        value_err = (self.value - returns)**2
        if self.clip_value > 0.0:
            clipped_value = tf.clip_by_value(self.value,
                                             value_old - self.clip_value,
                                             value_old + self.clip_value)
            clipped_value_err = (clipped_value - returns)**2
            value_err = tf.maximum(value_err, clipped_value_err)

        policy_loss = -tf.reduce_mean(
            tf.minimum(adv * ratio, adv * clipped_ratio))
        value_loss = tf.reduce_mean(value_err) * self.value_coef
        entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
        # we want to reduce policy and value errors, and maximize entropy
        # but since optimizer is minimizing the signs are opposite
        full_loss = policy_loss + value_loss - entropy_loss

        return full_loss, [policy_loss, value_loss,
                           entropy_loss], [adv, returns, logli_old, value_old]
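For context, a minimal sketch of how the loss and placeholders returned by a loss_fn like the one above might be consumed during an update step; the agent handle, the Adam optimizer, the session, and the *_batch arrays are illustrative assumptions, not part of the project:

# Hypothetical wiring, assuming `agent` exposes the loss_fn above, `sess` is an
# active tf.Session, and the batch arrays were computed from collected rollouts.
full_loss, loss_terms, loss_inputs = agent.loss_fn()
train_op = tf.train.AdamOptimizer(1e-4).minimize(full_loss)

feed = dict(zip(loss_inputs, [adv_batch, returns_batch,
                              logli_old_batch, value_old_batch]))
_, policy_l, value_l, entropy_l = sess.run([train_op] + loss_terms,
                                           feed_dict=feed)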
Code Example #2
File: model.py Project: suhridbuddha/ntsa
    def _ph_op(self):
        with tf.name_scope("init_ph"):
            x, y, y_feature = self._input_shapes

            # x driving series
            self.x = tf.placeholder(dtype=tf.float32,
                                    shape=(None, ) + x,
                                    name='x')
            # future values of driving series
            self.y = tf.placeholder(dtype=tf.float32,
                                    shape=(None, ) + y,
                                    name='y')
            # future values of the ancillary series
            self.y_features = tf.placeholder(dtype=tf.float32,
                                             shape=(None, ) + y_feature,
                                             name='y_features')

            self.mu = tf.placeholder_with_default(0., shape=(), name='mu')
            self.std = tf.placeholder_with_default(1., shape=(), name='std')

            self.keep_prob = tf.placeholder_with_default(1.,
                                                         shape=(),
                                                         name='keep_prob')
            self.is_training = tf.placeholder_with_default(True,
                                                           shape=(),
                                                           name='is_training')
            self.gen_len = tf.placeholder_with_default(1,
                                                       shape=(),
                                                       name='gen_len')
            self.flag = tf.placeholder(shape=(), dtype=tf.bool)
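A quick note on tf.placeholder_with_default, used for mu, std, keep_prob, is_training and gen_len above: the default value is used whenever no feed is supplied, and a feed_dict entry overrides it. A minimal standalone sketch (plain TF 1.x, not taken from the ntsa project):

import tensorflow as tf

keep_prob = tf.placeholder_with_default(1.0, shape=(), name='keep_prob')
scaled = keep_prob * 10.0

with tf.Session() as sess:
    print(sess.run(scaled))                              # 10.0, default is used
    print(sess.run(scaled, feed_dict={keep_prob: 0.5}))  # 5.0, feed overrides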
Code Example #3
File: a2c.py Project: 2To3rdPwr/reaver-pysc2
    def loss_fn(self):
        adv = tf.placeholder(tf.float32, [None], name="advantages")
        returns = tf.placeholder(tf.float32, [None], name="returns")

        policy_loss = -tf.reduce_mean(self.policy.logli * adv)
        value_loss = tf.reduce_mean(
            (self.value - returns)**2) * self.value_coef
        entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
        # we want to reduce policy and value errors, and maximize entropy
        # but since optimizer is minimizing the signs are opposite
        full_loss = policy_loss + value_loss - entropy_loss

        try:
            with open("loss_fn.txt", "x+") as f:
                f.write("out\n")
                f.write("full_loss: {0} type: {1}\n".format(
                    type(full_loss), full_loss.dtype))
                f.write("policy_loss: {0} type: {1}\n".format(
                    type(policy_loss), policy_loss.dtype))
                f.write("value_loss: {0} type: {1}\n".format(
                    type(value_loss), value_loss.dtype))
                f.write("entropy_loss: {0} type: {1}\n".format(
                    type(entropy_loss), entropy_loss.dtype))
                f.write("adv: {0} type: {1}\n".format(type(adv), adv.dtype))
                f.write("returns: {0} type: {1}\n".format(
                    type(returns), returns.dtype))
                f.close()
        except FileExistsError:
            print("")

        return full_loss, [policy_loss, value_loss,
                           entropy_loss], [adv, returns]
Code Example #4
  def __init__(self,
               sess,
               observation_space,
               action_space,
               optimizer_name='',
               select_slate_fn=None,
               compute_target_fn=None,
               stack_size=1,
               eval_mode=False,
               **kwargs):
    """Initializes SlateDecompQAgent.

    Args:
      sess: a Tensorflow session.
      observation_space: A gym.spaces object that specifies the format of
        observations.
      action_space: A gym.spaces object that specifies the format of actions.
      optimizer_name: The name of the optimizer.
      select_slate_fn: A function that selects the slate.
      compute_target_fn: A function that computes the target q value.
      stack_size: The stack size for the replay buffer.
      eval_mode: A bool for whether the agent is in training or evaluation mode.
      **kwargs: Keyword arguments to the DQNAgent.
    """
    self._response_adapter = dqn_agent.ResponseAdapter(
        observation_space.spaces['response'])
    response_names = self._response_adapter.response_names
    expected_response_names = ['click', 'watch_time']
    if not all(key in response_names for key in expected_response_names):
      raise ValueError(
          "Couldn't find all fields needed for the decomposition: %r" %
          expected_response_names)

    self._click_response_index = response_names.index('click')
    self._reward_response_index = response_names.index('watch_time')
    self._quality_response_index = response_names.index('quality')
    self._cluster_id_response_index = response_names.index('cluster_id')

    self._env_action_space = action_space
    self._num_candidates = int(action_space.nvec[0])
    abstract_agent.AbstractEpisodicRecommenderAgent.__init__(self, action_space)

    # The doc score is a [num_candidates] vector.
    self._doc_affinity_scores_ph = tf.placeholder(
        tf.float32, (self._num_candidates,), name='doc_affinity_scores_ph')
    self._prob_no_click_ph = tf.placeholder(
        tf.float32, (), name='prob_no_click_ph')

    self._select_slate_fn = select_slate_fn
    self._compute_target_fn = compute_target_fn

    dqn_agent.DQNAgentRecSim.__init__(
        self,
        sess,
        observation_space,
        num_actions=0,  # Unused.
        stack_size=1,
        optimizer_name=optimizer_name,
        eval_mode=eval_mode,
        **kwargs)
Code Example #5
    def build_graph(self):
        """Builds the neural network graph."""

        # define graph
        self.g = tf.Graph()
        with self.g.as_default():

            # create and store a new session for the graph
            self.sess = tf.Session()

            # define placeholders
            self.x = tf.placeholder(shape=[None, self.dim_input],
                                    dtype=tf.float32)
            self.y = tf.placeholder(shape=[None, self.num_classes],
                                    dtype=tf.float32)

            # define simple model
            with tf.variable_scope('last_layer'):
                self.z = tf.layers.dense(inputs=self.x, units=self.num_classes)

            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y,
                                                           logits=self.z))

            self.output_probs = tf.nn.softmax(self.z)

            # Variables of the last layer
            self.ll_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            self.ll_vars_concat = tf.concat(
                [self.ll_vars[0],
                 tf.expand_dims(self.ll_vars[1], axis=0)], 0)

            # Summary
            _variable_summaries(self.ll_vars_concat)

            # saving the weights of last layer when running bootstrap algorithm
            self.saver = tf.train.Saver(var_list=self.ll_vars)

            self.gd_opt = tf.train.GradientDescentOptimizer(self.step_size)

            # SGD optimizer for the last layer
            grads_vars_sgd = self.gd_opt.compute_gradients(self.loss)
            self.train_op = self.gd_opt.apply_gradients(grads_vars_sgd)

            for g, v in grads_vars_sgd:
                if g is not None:
                    s = list(v.name)
                    s[v.name.rindex(':')] = '_'
                    tf.summary.histogram(''.join(s) + '/grad_hist_boot_sgd', g)

            # Merge all the summaries and write them out
            self.all_summaries = tf.summary.merge_all()
            location = os.path.join(self.working_dir, 'logs')
            self.writer = tf.summary.FileWriter(location, graph=self.g)

            saver_network = tf.train.Saver(var_list=self.ll_vars)
            print('Loading the network...')
            # Restores from checkpoint
            saver_network.restore(self.sess, self.model_dir)
            print('Graph successfully loaded.')
Code Example #6
    def loss_fn(self):
        adv = tf.placeholder(tf.float32, [None], name="advantages")
        returns = tf.placeholder(tf.float32, [None], name="returns")

        policy_loss = -tf.reduce_mean(self.policy.logli * adv)
        value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
        entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
        # we want to reduce policy and value errors, and maximize entropy
        # but since optimizer is minimizing the signs are opposite
        full_loss = policy_loss + value_loss - entropy_loss

        return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns]
Code Example #7
File: a2c.py Project: stjordanis/reaver-pysc2
    def loss_fn(self):
        adv = tf.placeholder(tf.float32, [None], name="advantages")
        returns = tf.placeholder(tf.float32, [None], name="returns")

        policy_loss = -tf.reduce_mean(self.policy.logli * adv)
        value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
        entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
        # we want to reduce policy and value errors, and maximize entropy
        # but since optimizer is minimizing the signs are opposite
        full_loss = policy_loss + value_loss - entropy_loss

        return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns]
Code Example #8
File: ppo.py Project: stjordanis/reaver-pysc2
    def loss_fn(self):
        adv = tf.placeholder(tf.float32, [None], name="advantages")
        returns = tf.placeholder(tf.float32, [None], name="returns")
        logli_old = tf.placeholder(tf.float32, [None], name="logli_old")

        ratio = tf.exp(self.policy.logli - logli_old)
        clipped_ratio = tf.clip_by_value(ratio, 1-self.clip_ratio, 1+self.clip_ratio)

        policy_loss = -tf.reduce_mean(tf.minimum(adv * ratio, adv * clipped_ratio))
        # TODO clip value loss
        value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
        entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
        # we want to reduce policy and value errors, and maximize entropy
        # but since optimizer is minimizing the signs are opposite
        full_loss = policy_loss + value_loss - entropy_loss

        return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns, logli_old]
Code Example #9
    def loss_fn(self):
        """
        Sample trajectories and fit a cost function C. Form grad estimate with C
        and take a TRPO step for next policy.
        """
        adv = tf.placeholder(tf.float32, [None], name="advantages")
        returns = tf.placeholder(tf.float32, [None], name="returns")

        policy_loss = -tf.reduce_mean(self.policy.logli * adv)
        # value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
        value_loss = tf.reduce_mean(self.value - returns)

        entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
        # we want to reduce policy and value errors, and maximize entropy
        # but since optimizer is minimizing the signs are opposite
        full_loss = policy_loss + value_loss - entropy_loss

        return value_loss
Code Example #10
    def loss_fn(self, policy=None, value=None):
        adv = tf.placeholder(tf.float32, [None], name="advantages")
        returns = tf.placeholder(tf.float32, [None], name="returns")

        if not self.subenvs:

            policy_loss = -tf.reduce_mean(self.policy.logli * adv)
            value_loss = tf.reduce_mean(
                (self.value - returns)**2) * self.value_coef
            entropy_loss = tf.reduce_mean(
                self.policy.entropy) * self.entropy_coef
        else:
            assert policy is not None and value is not None, "Missing variables representing <policy> and <value>"
            policy_loss = -tf.reduce_mean(policy.logli * adv)
            value_loss = tf.reduce_mean((value - returns)**2) * self.value_coef
            entropy_loss = tf.reduce_mean(policy.entropy) * self.entropy_coef

        # we want to reduce policy and value errors, and maximize entropy
        # but since optimizer is minimizing the signs are opposite
        full_loss = policy_loss + value_loss - entropy_loss

        return full_loss, [policy_loss, value_loss,
                           entropy_loss], [adv, returns]
Code Example #11
  def _build_eval_metric(self):
    """Build a network to evaluate the metric between all prototypical states.

    For each pair of states (s, t) we return max(d(s, t), d(t, s)), since the
    approximant cannot in general guarantee symmetry.

    Returns:
      An op computing the euclidean distance between the representations of all
        pairs of states in self.eval_states_ph.
    """
    self.eval_states_ph = tf.placeholder(tf.float64, (self.num_states, 2),
                                         name='eval_states_ph')
    distances = tf.maximum(
        self.online_network(self._concat_states(self.eval_states_ph)),
        self.online_network(self._concat_states(self.eval_states_ph,
                                                transpose=True)))
    return distances
Code Example #12
File: replay_memory.py Project: vsois/hanabi-agents
    def __init__(self,
                 num_actions,
                 observation_size,
                 stack_size,
                 use_staging=True,
                 replay_capacity=1000000,
                 batch_size=32,
                 update_horizon=1,
                 gamma=1.0,
                 wrapped_memory=None):
        """Initializes a graph wrapper for the python replay memory.

    Args:
      num_actions: int, number of possible actions.
      observation_size: int, size of an input frame.
      stack_size: int, number of frames to use in state stack.
      use_staging: bool, when True it would use a staging area to prefetch the
        next sampling batch.
      replay_capacity: int, number of transitions to keep in memory.
      batch_size: int.
      update_horizon: int, length of update ('n' in n-step update).
      gamma: float, the discount factor.
      wrapped_memory: The 'inner' memory data structure. Defaults to None, which
        creates the standard DQN replay memory.

    Raises:
      ValueError: If update_horizon is not positive.
      ValueError: If discount factor is not in [0, 1].
    """
        if replay_capacity < update_horizon + 1:
            raise ValueError(
                'Update horizon (%i) should be significantly smaller '
                'than replay capacity (%i).' %
                (update_horizon, replay_capacity))
        if not update_horizon >= 1:
            raise ValueError('Update horizon must be positive.')
        if not 0.0 <= gamma <= 1.0:
            raise ValueError('Discount factor (gamma) must be in [0, 1].')

        # Allow subclasses to create self.memory.
        if wrapped_memory is not None:
            self.memory = wrapped_memory
        else:
            self.memory = OutOfGraphReplayMemory(num_actions, observation_size,
                                                 stack_size, replay_capacity,
                                                 batch_size, update_horizon,
                                                 gamma)

        with tf.name_scope('replay'):
            with tf.name_scope('add_placeholders'):
                self.add_obs_ph = tf.placeholder(tf.uint8, [observation_size],
                                                 name='add_obs_ph')
                self.add_action_ph = tf.placeholder(tf.int32, [],
                                                    name='add_action_ph')
                self.add_reward_ph = tf.placeholder(tf.float32, [],
                                                    name='add_reward_ph')
                self.add_terminal_ph = tf.placeholder(tf.uint8, [],
                                                      name='add_terminal_ph')
                self.add_legal_actions_ph = tf.placeholder(
                    tf.float32, [num_actions], name='add_legal_actions_ph')

            add_transition_ph = [
                self.add_obs_ph, self.add_action_ph, self.add_reward_ph,
                self.add_terminal_ph, self.add_legal_actions_ph
            ]

            with tf.device('/cpu:*'):
                self.add_transition_op = tf.py_func(self.memory.add,
                                                    add_transition_ph, [],
                                                    name='replay_add_py_func')

                self.transition = tf.py_func(
                    self.memory.sample_transition_batch, [], [
                        tf.uint8, tf.int32, tf.float32, tf.uint8, tf.uint8,
                        tf.int32, tf.float32
                    ],
                    name='replay_sample_py_func')

                if use_staging:
                    # To hide the py_func latency use a staging area to pre-fetch the next
                    # batch of transitions.
                    (states, actions, rewards, next_states, terminals, indices,
                     next_legal_actions) = self.transition
                    # StagingArea requires all the shapes to be defined.
                    states.set_shape(
                        [batch_size, observation_size, stack_size])
                    actions.set_shape([batch_size])
                    rewards.set_shape([batch_size])
                    next_states.set_shape(
                        [batch_size, observation_size, stack_size])
                    terminals.set_shape([batch_size])
                    indices.set_shape([batch_size])
                    next_legal_actions.set_shape([batch_size, num_actions])

                    # Create the staging area in CPU.
                    prefetch_area = tf.contrib.staging.StagingArea([
                        tf.uint8, tf.int32, tf.float32, tf.uint8, tf.uint8,
                        tf.int32, tf.float32
                    ])

                    self.prefetch_batch = prefetch_area.put(
                        (states, actions, rewards, next_states, terminals,
                         indices, next_legal_actions))
                else:
                    self.prefetch_batch = tf.no_op()

            if use_staging:
                # Get the sample_transition_batch in GPU. This would do the copy from
                # CPU to GPU.
                self.transition = prefetch_area.get()

            (self.states, self.actions, self.rewards, self.next_states,
             self.terminals, self.indices,
             self.next_legal_actions) = self.transition

            # Since these are py_func tensors, no information about their shape is
            # present. Setting the shape only for the necessary tensors
            self.states.set_shape([None, observation_size, stack_size])
            self.next_states.set_shape([None, observation_size, stack_size])
Code Example #13
    def build_graph(self):
        """Builds the neural network graph."""

        # define graph
        self.g = tf.Graph()
        with self.g.as_default():

            # create and store a new session for the graph
            self.sess = tf.Session()

            # define placeholders
            self.x = tf.placeholder(shape=[None, self.dim_input],
                                    dtype=tf.float32)
            self.y = tf.placeholder(shape=[None, self.num_classes],
                                    dtype=tf.float32)

            # linear layer(WX + b)
            with tf.variable_scope('last_layer/dense') as scope:
                weights = tf.get_variable('kernel',
                                          [self.dim_input, self.num_classes],
                                          dtype=tf.float32)
                biases = tf.get_variable('bias', [self.num_classes],
                                         dtype=tf.float32)
                wb = tf.concat([weights, tf.expand_dims(biases, axis=0)], 0)
                wb_renorm = tf.matmul(self.sigma_half_inv, wb)
                weights_renorm = wb_renorm[:self.dim_input, :]
                biases_renorm = wb_renorm[-1, :]
                self.z = tf.add(tf.matmul(self.x, weights_renorm),
                                biases_renorm,
                                name=scope.name)

            # Gaussian prior
            # prior = tf.nn.l2_loss(weights) + tf.nn.l2_loss(biases)

            # Non normalized loss, because of the preconditioning
            self.loss = self.n * tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y,
                                                           logits=self.z))

            # Bayesian loss
            self.bayesian_loss = self.loss  # + prior

            self.output_probs = tf.nn.softmax(self.z)

            # Variables of the last layer
            self.ll_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            self.ll_vars_concat = tf.concat(
                [self.ll_vars[0],
                 tf.expand_dims(self.ll_vars[1], axis=0)], 0)

            # Summary
            _variable_summaries(self.ll_vars_concat)

            # saving the weights of last layer when running SGLD/SGD/MCMC algorithm
            self.saver = tf.train.Saver(var_list=self.ll_vars,
                                        max_to_keep=self.num_samples)

            self.gd_opt = tf.train.GradientDescentOptimizer(self.step_size)
            # SGLD optimizer for the last layer
            if self.sampler in ['sgld', 'lmc']:
                grads_vars = self.gd_opt.compute_gradients(self.bayesian_loss)
                grads_vars_sgld = []

                for g, v in grads_vars:
                    if g is not None:
                        s = list(v.name)
                        s[v.name.rindex(':')] = '_'
                        # Adding Gaussian noise to the gradient
                        gaussian_noise = (np.sqrt(2. / self.step_size) *
                                          tf.random_normal(tf.shape(g)))
                        g_sgld = g + gaussian_noise
                        tf.summary.histogram(''.join(s) + '/grad_hist_mcmc', g)
                        tf.summary.histogram(
                            ''.join(s) + '/gaussian_noise_hist_mcmc',
                            gaussian_noise)
                        tf.summary.histogram(
                            ''.join(s) + '/grad_total_hist_mcmc', g_sgld)
                        grads_vars_sgld.append((g_sgld, v))

                self.train_op = self.gd_opt.apply_gradients(grads_vars_sgld)

            # SGD optimizer for the last layer
            if self.sampler == 'sgd':
                grads_vars_sgd = self.gd_opt.compute_gradients(self.loss)
                self.train_op = self.gd_opt.apply_gradients(grads_vars_sgd)

                for g, v in grads_vars_sgd:
                    if g is not None:
                        s = list(v.name)
                        s[v.name.rindex(':')] = '_'
                        tf.summary.histogram(''.join(s) + '/grad_hist_sgd', g)

            # Merge all the summaries and write them out
            self.all_summaries = tf.summary.merge_all()
            location = os.path.join(self.working_dir, 'logs')
            self.writer = tf.summary.FileWriter(location, graph=self.g)

            saver_network = tf.train.Saver(var_list=self.ll_vars)
            print('loading the network ...')
            # Restores from checkpoint
            saver_network.restore(self.sess, self.model_dir)
            print('Graph successfully loaded.')
Code Example #14
def broken(sess):
    index = tf.placeholder(tf.int32, name='index')
    slice_op = tf.range(10)[index]
    sess.run(slice_op, feed_dict={index: 11})
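The snippet above is intentionally broken: the graph builds without complaint, but feeding index 11 into a slice of tf.range(10) is out of bounds, so the failure only surfaces when the op runs. A minimal sketch of the same pattern with the run-time error handled (TF 1.x assumed; the function name is illustrative):

import tensorflow as tf

def guarded_slice(sess):
    # Same graph as above: slice a length-10 range at a fed index.
    index = tf.placeholder(tf.int32, name='index')
    slice_op = tf.range(10)[index]

    print(sess.run(slice_op, feed_dict={index: 3}))  # valid index -> 3

    try:
        # The out-of-bounds feed fails at run time, not at graph-construction time.
        sess.run(slice_op, feed_dict={index: 11})
    except tf.errors.InvalidArgumentError as e:
        print('out-of-bounds slice:', e.message)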
Code Example #15
    def __init__(self,
                 num_actions=None,
                 observation_size=None,
                 num_players=None,
                 gamma=0.99,
                 update_horizon=1,
                 min_replay_history=500,
                 update_period=4,
                 stack_size=1,
                 target_update_period=500,
                 epsilon_fn=linearly_decaying_epsilon,
                 epsilon_train=0.02,
                 epsilon_eval=0.001,
                 epsilon_decay_period=1000,
                 graph_template=dqn_template,
                 tf_device='/cpu:*',
                 use_staging=True,
                 optimizer=tf.train.RMSPropOptimizer(learning_rate=.0025,
                                                     decay=0.95,
                                                     momentum=0.0,
                                                     epsilon=1e-6,
                                                     centered=True)):
        """Initializes the agent and constructs its graph.

    Args:
      num_actions: int, number of actions the agent can take at any state.
      observation_size: int, size of observation vector.
      num_players: int, number of players playing this game.
      gamma: float, discount factor as commonly used in the RL literature.
      update_horizon: int, horizon at which updates are performed, the 'n' in
        n-step update.
      min_replay_history: int, number of stored transitions before training.
      update_period: int, period between DQN updates.
      stack_size: int, number of observations to use as state.
      target_update_period: Update period for the target network.
      epsilon_fn: Function expecting 4 parameters: (decay_period, step,
        warmup_steps, epsilon), and which returns the epsilon value used for
        exploration during training.
      epsilon_train: float, final epsilon for training.
      epsilon_eval: float, epsilon during evaluation.
      epsilon_decay_period: int, number of steps for epsilon to decay.
      graph_template: function for building the neural network graph.
      tf_device: str, Tensorflow device on which to run computations.
      use_staging: bool, when True use a staging area to prefetch the next
        sampling batch.
      optimizer: Optimizer instance used for learning.
    """

        self.partial_reload = False

        tf.logging.info('Creating %s agent with the following parameters:',
                        self.__class__.__name__)
        tf.logging.info('\t gamma: %f', gamma)
        tf.logging.info('\t update_horizon: %f', update_horizon)
        tf.logging.info('\t min_replay_history: %d', min_replay_history)
        tf.logging.info('\t update_period: %d', update_period)
        tf.logging.info('\t target_update_period: %d', target_update_period)
        tf.logging.info('\t epsilon_train: %f', epsilon_train)
        tf.logging.info('\t epsilon_eval: %f', epsilon_eval)
        tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period)
        tf.logging.info('\t tf_device: %s', tf_device)
        tf.logging.info('\t use_staging: %s', use_staging)
        tf.logging.info('\t optimizer: %s', optimizer)

        # Global variables.
        self.num_actions = num_actions
        self.observation_size = observation_size
        self.num_players = num_players
        self.gamma = gamma
        self.update_horizon = update_horizon
        self.cumulative_gamma = math.pow(gamma, update_horizon)
        self.min_replay_history = min_replay_history
        self.target_update_period = target_update_period
        self.epsilon_fn = epsilon_fn
        self.epsilon_train = epsilon_train
        self.epsilon_eval = epsilon_eval
        self.epsilon_decay_period = epsilon_decay_period
        self.update_period = update_period
        self.eval_mode = False
        self.training_steps = 0
        self.batch_staged = False
        self.optimizer = optimizer

        with tf.device(tf_device):
            # Calling online_convnet will generate a new graph as defined in
            # graph_template using whatever input is passed, but will always share
            # the same weights.
            online_convnet = tf.make_template('Online', graph_template)
            target_convnet = tf.make_template('Target', graph_template)
            # The state of the agent. The last axis is the number of past observations
            # that make up the state.
            states_shape = (1, observation_size, stack_size)
            self.state = np.zeros(states_shape)
            self.state_ph = tf.placeholder(tf.uint8,
                                           states_shape,
                                           name='state_ph')
            self.legal_actions_ph = tf.placeholder(tf.float32,
                                                   [self.num_actions],
                                                   name='legal_actions_ph')
            self._q = online_convnet(state=self.state_ph,
                                     num_actions=self.num_actions)
            self._replay = self._build_replay_memory(use_staging)
            self._replay_qs = online_convnet(self._replay.states,
                                             self.num_actions)
            self._replay_next_qt = target_convnet(self._replay.next_states,
                                                  self.num_actions)
            self._train_op = self._build_train_op()
            self._sync_qt_ops = self._build_sync_op()

            self._q_argmax = tf.argmax(self._q + self.legal_actions_ph,
                                       axis=1)[0]

        # Set up a session and initialize variables.
        self._sess = tf.Session(
            '', config=tf.ConfigProto(allow_soft_placement=True))
        self._init_op = tf.global_variables_initializer()
        self._sess.run(self._init_op)

        self._saver = tf.train.Saver(max_to_keep=3)

        # This keeps tracks of the observed transitions during play, for each
        # player.
        self.transitions = [[] for _ in range(num_players)]
Code Example #16
  def sample_distance_pairs(self, num_samples_per_cell=2, verbose=False):
    """Sample a set of points from each cell and compute all pairwise distances.

    This method also writes the resulting distances to disk.

    Args:
      num_samples_per_cell: int, number of samples to draw per cell.
      verbose: bool, whether to print verbose messages.
    """
    paired_states_ph = tf.placeholder(tf.float64, (1, 4),
                                      name='paired_states_ph')
    online_network = tf.make_template('Online', self._network_template)
    distance = online_network(paired_states_ph)
    saver = tf.train.Saver()
    if not self.add_noise:
      num_samples_per_cell = 1
    with tf.Session() as sess:
      saver.restore(sess, os.path.join(self.base_dir, 'tf_ckpt-239900'))
      total_samples = None
      for s_idx in range(self.num_states):
        s = self.inverse_index_states[s_idx]
        s = s.astype(np.float32)
        s += 0.5  # Place in center of cell.
        s = np.tile([s], (num_samples_per_cell, 1))
        if self.add_noise:
          sampled_noise = np.clip(
              np.random.normal(0, 0.1, size=(num_samples_per_cell, 2)),
              -0.3, 0.3)
          s += sampled_noise
        if total_samples is None:
          total_samples = s
        else:
          total_samples = np.concatenate([total_samples, s])
      num_total_samples = len(total_samples)
      distances = np.zeros((num_total_samples, num_total_samples))
      if verbose:
        tf.logging.info('Will compute distances for %d samples',
                        num_total_samples)
      for i in range(num_total_samples):
        s1 = total_samples[i]
        if verbose:
          tf.logging.info('Will compute distances from sample %d', i)
        for j in range(num_total_samples):
          s2 = total_samples[j]
          paired_states_1 = np.reshape(np.append(s1, s2), (1, 4))
          paired_states_2 = np.reshape(np.append(s2, s1), (1, 4))
          distance_np_1 = sess.run(
              distance, feed_dict={paired_states_ph: paired_states_1})
          distance_np_2 = sess.run(
              distance, feed_dict={paired_states_ph: paired_states_2})
          max_dist = max(distance_np_1, distance_np_2)
          distances[i, j] = max_dist
          distances[j, i] = max_dist
    sampled_distances = {
        'samples_per_cell': num_samples_per_cell,
        'samples': total_samples,
        'distances': distances,
    }
    file_path = os.path.join(self.base_dir, 'sampled_distances.pkl')
    with tf.gfile.GFile(file_path, 'w') as f:
      pickle.dump(sampled_distances, f)
Code Example #17
    def build_graph(self):
        """Builds the neural network graph."""

        # define graph
        self.g = tf.Graph()
        with self.g.as_default():

            # create and store a new session for the graph
            self.sess = tf.Session()

            # define placeholders
            self.x = tf.placeholder(shape=[None, self.dim_input],
                                    dtype=tf.float32)
            self.y = tf.placeholder(shape=[None, self.num_classes],
                                    dtype=tf.float32)

            # define simple model
            with tf.variable_scope('last_layer'):
                self.z = tf.layers.dense(inputs=self.x, units=self.num_classes)

            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y,
                                                           logits=self.z))

            self.output_probs = tf.nn.softmax(self.z)

            # Variables of the last layer
            self.ll_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            self.ll_vars_concat = tf.concat(
                [self.ll_vars[0],
                 tf.expand_dims(self.ll_vars[1], axis=0)], 0)

            # Summary
            _variable_summaries(self.ll_vars_concat)

            # add regularization that acts as a unit Gaussian prior on the last layer
            regularizer = tf.contrib.layers.l2_regularizer(1.0)

            # regularization
            prior = tf.contrib.layers.apply_regularization(
                regularizer, self.ll_vars)
            self.bayesian_loss = self.n * self.loss + prior

            # saving the weights of last layer when running SGLD/SGD/MCMC algorithm
            self.saver = tf.train.Saver(var_list=self.ll_vars,
                                        max_to_keep=self.num_samples)

            # SGLD optimizer for the last layer
            if self.sampler in ['sgld', 'lmc']:
                step = self.step_size / self.n
                gd_opt = tf.train.GradientDescentOptimizer(step)
                grads_vars = gd_opt.compute_gradients(self.bayesian_loss)
                grads_vars_sgld = []

                for g, v in grads_vars:
                    if g is not None:
                        s = list(v.name)
                        s[v.name.rindex(':')] = '_'
                        # Adding Gaussian noise to the gradient
                        gaussian_noise = (np.sqrt(2. / step) *
                                          tf.random_normal(tf.shape(g)))
                        g_sgld = g + gaussian_noise
                        tf.summary.histogram(''.join(s) + '/grad_hist_mcmc',
                                             g / self.n)
                        tf.summary.histogram(
                            ''.join(s) + '/gaussian_noise_hist_mcmc',
                            gaussian_noise / self.n)
                        tf.summary.histogram(
                            ''.join(s) + '/grad_total_hist_mcmc',
                            g_sgld / self.n)
                        grads_vars_sgld.append((g_sgld, v))

                self.train_op = gd_opt.apply_gradients(grads_vars_sgld)

            # SGD optimizer for the last layer
            if self.sampler == 'sgd':
                gd_opt = tf.train.GradientDescentOptimizer(self.step_size)
                grads_vars_sgd = gd_opt.compute_gradients(self.loss)
                self.train_op = gd_opt.apply_gradients(grads_vars_sgd)

                for g, v in grads_vars_sgd:
                    if g is not None:
                        s = list(v.name)
                        s[v.name.rindex(':')] = '_'
                        tf.summary.histogram(''.join(s) + '/grad_hist_sgd', g)

            # Merge all the summaries and write them out
            self.all_summaries = tf.summary.merge_all()
            location = os.path.join(self.working_dir, 'logs')
            self.writer = tf.summary.FileWriter(location, graph=self.g)

            saver_network = tf.train.Saver(var_list=self.ll_vars)
            print('loading the network ...')
            # Restores from checkpoint
            # self.sess.run(tf.global_variables_initializer())
            saver_network.restore(self.sess, self.model_dir)
            print('Graph successfully loaded.')
Code Example #18
  def _build_train_op(self, optimizer):
    """Build the TensorFlow graph used to learn the bisimulation metric.

    Args:
      optimizer: a tf.train optimizer.
    Returns:
      A TensorFlow op to minimize the bisimulation loss.
    """
    self.online_network = tf.make_template('Online',
                                           self._network_template)
    self.target_network = tf.make_template('Target',
                                           self._network_template)
    self.s1_ph = tf.placeholder(tf.float64, (self.batch_size, 2),
                                name='s1_ph')
    self.s2_ph = tf.placeholder(tf.float64, (self.batch_size, 2),
                                name='s2_ph')
    self.s1_online_distances = self.online_network(
        self._concat_states(self.s1_ph))
    self.s1_target_distances = self.target_network(
        self._concat_states(self.s1_ph))
    self.s2_target_distances = self.target_network(
        self._concat_states(self.s2_ph))
    self.action_ph = tf.placeholder(tf.int32, (self.batch_size,))
    self.rewards_ph = tf.placeholder(tf.float64, (self.batch_size,))
    # We use an expanding horizon for computing the distances.
    self.bisim_horizon_ph = tf.placeholder(tf.float64, ())
    # bisimulation_target_1 = rew_diff + gamma * next_distance.
    bisimulation_target_1 = tf.stop_gradient(self._build_bisimulation_target())
    # bisimulation_target_2 = curr_distance.
    bisimulation_target_2 = tf.stop_gradient(self.s1_target_distances)
    # We slowly taper in the maximum according to the bisim horizon.
    bisimulation_target = tf.maximum(
        bisimulation_target_1, bisimulation_target_2 * self.bisim_horizon_ph)
    # We zero-out diagonal entries, since those are estimating the distance
    # between a state and itself, which we know to be 0.
    diagonal_mask = 1.0 - tf.diag(tf.ones(self.batch_size, dtype=tf.float64))
    diagonal_mask = tf.reshape(diagonal_mask, (self.batch_size**2, 1))
    bisimulation_target *= diagonal_mask
    bisimulation_estimate = self.s1_online_distances
    # We start with a mask that includes everything.
    loss_mask = tf.ones(tf.shape(bisimulation_estimate))
    # We have to enforce that states being compared are done only using the same
    # action.
    indicators = self.action_ph
    indicators = tf.cast(indicators, tf.float64)
    # indicators will initially have shape [batch_size], we first tile it:
    square_ids = tf.tile([indicators], [self.batch_size, 1])
    # We subtract square_ids from its transpose:
    square_ids = square_ids - tf.transpose(square_ids)
    # At this point all zero-entries are the ones with equal IDs.
    # Now we would like to convert the zeros in this matrix to 1s, and make
    # everything else a 0. We can do this with the following operation:
    loss_mask = 1 - tf.abs(tf.sign(square_ids))
    # Now reshape to match the shapes of the estimate and target.
    loss_mask = tf.reshape(loss_mask, (self.batch_size**2, 1))
    larger_targets = bisimulation_target - bisimulation_estimate
    larger_targets_count = tf.reduce_sum(
        tf.cast(larger_targets > 0., tf.float64))
    tf.summary.scalar('Learning/LargerTargets', larger_targets_count)
    tf.summary.scalar('Learning/NumUpdates', tf.count_nonzero(loss_mask))
    tf.summary.scalar('Learning/BisimHorizon', self.bisim_horizon_ph)
    bisimulation_loss = tf.losses.mean_squared_error(
        bisimulation_target,
        bisimulation_estimate,
        weights=loss_mask)
    tf.summary.scalar('Learning/loss', bisimulation_loss)
    # Plot average distance between sampled representations.
    average_distance = tf.reduce_mean(bisimulation_estimate)
    tf.summary.scalar('Approx/AverageDistance', average_distance)
    return optimizer.minimize(bisimulation_loss)