Example #1
    def _build_graph(self, inputs):
        state, action, reward, next_state, isOver = inputs
        self.predict_value = self._get_DQN_prediction(state)
        action_onehot = tf.one_hot(action, NUM_ACTIONS, 1.0, 0.0)
        pred_action_value = tf.reduce_sum(self.predict_value * action_onehot,
                                          1)  #N,
        max_pred_reward = tf.reduce_mean(tf.reduce_max(self.predict_value, 1),
                                         name='predict_reward')
        add_moving_summary(max_pred_reward)

        with tf.variable_scope('target'):
            targetQ_predict_value = self._get_DQN_prediction(next_state)  # NxA

        if not DOUBLE:
            # DQN: select the greedy action and its value from the same target net.
            best_v = tf.reduce_max(targetQ_predict_value, 1)  # N,
        else:
            # Double-DQN: select the greedy action from the online net, take its value from the target net.
            tf.get_variable_scope().reuse_variables()
            next_predict_value = self._get_DQN_prediction(next_state)
            self.greedy_choice = tf.argmax(next_predict_value, 1)  # N,
            predict_onehot = tf.one_hot(self.greedy_choice, NUM_ACTIONS, 1.0,
                                        0.0)
            best_v = tf.reduce_sum(targetQ_predict_value * predict_onehot, 1)

        target = reward + (1.0 - tf.cast(
            isOver, tf.float32)) * GAMMA * tf.stop_gradient(best_v)

        self.cost = tf.truediv(symbf.huber_loss(target - pred_action_value),
                               tf.cast(BATCH_SIZE, tf.float32),
                               name='cost')

        summary.add_param_summary([('conv.*/W', ['histogram', 'rms']),
                                   ('fc.*/W', ['histogram',
                                               'rms'])])  # monitor all W
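
The DOUBLE branch above is the crux of Double-DQN: the greedy action is chosen by the online network, but its value is read from the target network, which reduces the overestimation bias of taking a max over noisy Q-estimates. A minimal NumPy sketch of both target computations (not part of the original code; all array names are hypothetical):

    import numpy as np

    # Hypothetical next-state Q-values from the online and the target network, shape (N, NUM_ACTIONS).
    q_online_next = np.array([[1.0, 2.0], [0.5, 0.1]])
    q_target_next = np.array([[0.9, 1.5], [0.4, 0.2]])
    reward = np.array([0.0, 1.0])
    is_over = np.array([0.0, 1.0])
    GAMMA = 0.99

    # Standard DQN: both the argmax and the value come from the target network.
    best_v_dqn = q_target_next.max(axis=1)

    # Double-DQN: argmax from the online network, value from the target network.
    greedy = q_online_next.argmax(axis=1)
    best_v_double = q_target_next[np.arange(len(greedy)), greedy]

    # Bellman target; terminal transitions (is_over == 1) drop the bootstrap term.
    target = reward + (1.0 - is_over) * GAMMA * best_v_double
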
Example #2
    def _build_graph(self, inputs, is_training):
        state, action, reward, next_state, isOver = inputs
        self.predict_value = self._get_DQN_prediction(state, is_training)
        action_onehot = tf.one_hot(action, NUM_ACTIONS, 1.0, 0.0)
        pred_action_value = tf.reduce_sum(self.predict_value * action_onehot,
                                          1)  #N,
        max_pred_reward = tf.reduce_mean(tf.reduce_max(self.predict_value, 1),
                                         name='predict_reward')
        add_moving_summary(max_pred_reward)

        with tf.variable_scope('target'):
            targetQ_predict_value = self._get_DQN_prediction(
                next_state, False)  # NxA

        # DQN
        #best_v = tf.reduce_max(targetQ_predict_value, 1)    # N,

        # Double-DQN
        tf.get_variable_scope().reuse_variables()
        next_predict_value = self._get_DQN_prediction(next_state, is_training)
        self.greedy_choice = tf.argmax(next_predict_value, 1)  # N,
        predict_onehot = tf.one_hot(self.greedy_choice, NUM_ACTIONS, 1.0, 0.0)
        best_v = tf.reduce_sum(targetQ_predict_value * predict_onehot, 1)

        target = reward + (1.0 - tf.cast(
            isOver, tf.float32)) * GAMMA * tf.stop_gradient(best_v)

        cost = symbf.huber_loss(target - pred_action_value)
        summary.add_param_summary([('conv.*/W', ['histogram', 'rms']),
                                   ('fc.*/W', ['histogram',
                                               'rms'])])  # monitor all W
        self.cost = tf.reduce_mean(cost, name='cost')
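
Example #1 normalizes the Huber loss by BATCH_SIZE while Example #2 averages it with tf.reduce_mean; in both cases the cost is built from symbf.huber_loss applied to the temporal-difference error. The Huber loss is quadratic for small errors and linear for large ones, which keeps gradients bounded when the TD error is large. A minimal NumPy sketch of the elementwise loss, assuming the conventional delta = 1 (an assumption; the delta used by symbf.huber_loss is not visible in these snippets):

    import numpy as np

    def huber_loss(x, delta=1.0):
        # 0.5 * x^2 where |x| <= delta, delta * (|x| - 0.5 * delta) otherwise.
        abs_x = np.abs(x)
        quadratic = np.minimum(abs_x, delta)
        linear = abs_x - quadratic
        return 0.5 * quadratic ** 2 + delta * linear

    td_error = np.array([0.3, -2.0, 1.5])
    print(huber_loss(td_error).mean())  # mean over the batch gives the scalar cost
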
Example #3
    def _build_graph(self, inputs):
        state, action, reward, next_state, isOver = inputs
        predict_value = self._get_DQN_prediction(
            state)  # N x NUM_ACTIONS  # TODO: check whether we need self. here
        action_onehot = tf.one_hot(action, NUM_ACTIONS, 1.0,
                                   0.0)  # N x NUM_ACTIONS
        pred_action_value = tf.reduce_sum(predict_value * action_onehot,
                                          1)  # N,

        # This tracks the learning process: the mean max-Q across samples
        # should increase over training.
        max_pred_reward = tf.reduce_mean(tf.reduce_max(predict_value, 1),
                                         name='predict_reward')
        add_moving_summary(max_pred_reward)

        with tf.variable_scope('target'):  # TODO: check the usage of variable scope in this context
            targetQ_predict_value = self._get_DQN_prediction(next_state)

        # DQN
        best_v = tf.reduce_max(targetQ_predict_value, 1)

        #TODO: Double-DQN

        # TODO: why do we need stop_gradient here?
        target = reward + (1.0 - tf.cast(
            isOver, tf.float32)) * GAMMA * tf.stop_gradient(best_v)

        cost = huber_loss(target - pred_action_value)

        add_param_summary([('conv.*/W', ['histogram', 'rms']),
                           ('fc.*/W', ['histogram', 'rms'])])  #TODO
        self.cost = tf.reduce_mean(cost, name='cost')
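
Regarding the stop_gradient TODO in Example #3: the Bellman target reward + gamma * max_a Q_target(s', a) is treated as a fixed regression label, so gradients from the loss must not flow back into the network that produced best_v; tf.stop_gradient cuts that path. A small TF1-style sketch of the effect (toy variables, not from the original code):

    import tensorflow as tf

    x = tf.Variable(2.0)
    y = x * x
    frozen = tf.stop_gradient(y)     # behaves like a constant during backprop
    loss = (frozen - x) ** 2
    grad = tf.gradients(loss, x)[0]  # gradient flows only through the `x` term, not through `frozen`
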
Example #4
    def _build_graph(self, inputs):
        state, action, reward, next_state, isOver = inputs
        self.predict_value = self._get_DQN_prediction(state)
        action_onehot = tf.one_hot(action, NUM_ACTIONS, 1.0, 0.0)
        pred_action_value = tf.reduce_sum(self.predict_value * action_onehot, 1)    #N,
        max_pred_reward = tf.reduce_mean(tf.reduce_max(
            self.predict_value, 1), name='predict_reward')
        add_moving_summary(max_pred_reward)

        with tf.variable_scope('target'):
            targetQ_predict_value = self._get_DQN_prediction(next_state)    # NxA
        # DQN
        #best_v = tf.reduce_max(targetQ_predict_value, 1)    # N,
        # Double-DQN
        tf.get_variable_scope().reuse_variables()
        next_predict_value = self._get_DQN_prediction(next_state)
        self.greedy_choice = tf.argmax(next_predict_value, 1)   # N,
        predict_onehot = tf.one_hot(self.greedy_choice, NUM_ACTIONS, 1.0, 0.0)
        best_v = tf.reduce_sum(targetQ_predict_value * predict_onehot, 1)
        target = reward + (1.0 - tf.cast(isOver, tf.float32)) * GAMMA * tf.stop_gradient(best_v)
        cost = symbf.huber_loss(target - pred_action_value)
        add_param_summary([('conv.*/W', ['histogram', 'rms']),
                           ('fc.*/W', ['histogram', 'rms']) ])   # monitor all W

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        with tf.name_scope('param_summary'):
            for p in params:
                name = p.name
                print(name)
                if name == 'conv0/W:0':
                    print("enter ! ")
                    # Visualize the first conv layer's kernels as 5x5 grayscale images.
                    weights = tf.reshape(p, [-1, 5, 5, 1])
                    tf.image_summary(name, weights)

        self.cost = tf.reduce_mean(cost, name='cost')
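
In Examples #1 through #4, wrapping the second forward pass in tf.variable_scope('target') creates a separate copy of the network's variables under the target/ prefix; in the tensorpack DQN setup these weights are typically refreshed by periodically copying the online weights (e.g. via a callback) rather than trained by gradient descent. A minimal plain-TF1 sketch of that pattern (hypothetical two-layer network and names, not the original code):

    import tensorflow as tf

    def q_net(x, num_actions=4):
        h = tf.layers.dense(x, 32, activation=tf.nn.relu, name='fc0')
        return tf.layers.dense(h, num_actions, name='fc1')

    x = tf.placeholder(tf.float32, [None, 8])
    with tf.variable_scope('online'):
        q_online = q_net(x)
    with tf.variable_scope('target'):
        q_target = q_net(x)

    online_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='online')
    target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target')
    # Periodic hard update: overwrite the target weights with the online weights.
    update_target = tf.group(*[t.assign(o) for o, t in zip(online_vars, target_vars)])
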
Example #5
    def _build_graph(self, inputs):
        comb_state, action, reward, isOver = inputs
        comb_state = tf.cast(comb_state, tf.float32)
        state = tf.slice(comb_state, [0, 0, 0, 0], [-1, -1, -1, self.channel],
                         name='state')
        self.predict_value = self._get_DQN_prediction(state)
        if not get_current_tower_context().is_training:
            return

        reward = tf.clip_by_value(reward, -1, 1)
        next_state = tf.slice(comb_state, [0, 0, 0, 1],
                              [-1, -1, -1, self.channel],
                              name='next_state')
        action_onehot = tf.one_hot(action, self.num_actions, 1.0, 0.0)

        pred_action_value = tf.reduce_sum(self.predict_value * action_onehot,
                                          1)  # N,
        max_pred_reward = tf.reduce_mean(tf.reduce_max(self.predict_value, 1),
                                         name='predict_reward')
        summary.add_moving_summary(max_pred_reward)

        with tf.variable_scope('target'), \
                collection.freeze_collection([tf.GraphKeys.TRAINABLE_VARIABLES]):
            targetQ_predict_value = self._get_DQN_prediction(next_state)  # NxA

        if self.method != 'Double':
            # DQN
            best_v = tf.reduce_max(targetQ_predict_value, 1)  # N,
        else:
            # Double-DQN
            sc = tf.get_variable_scope()
            with tf.variable_scope(sc, reuse=True):
                next_predict_value = self._get_DQN_prediction(next_state)
            self.greedy_choice = tf.argmax(next_predict_value, 1)  # N,
            predict_onehot = tf.one_hot(self.greedy_choice, self.num_actions,
                                        1.0, 0.0)
            best_v = tf.reduce_sum(targetQ_predict_value * predict_onehot, 1)

        target = reward + (1.0 - tf.cast(
            isOver, tf.float32)) * self.gamma * tf.stop_gradient(best_v)

        self.cost = tf.reduce_mean(symbf.huber_loss(target -
                                                    pred_action_value),
                                   name='cost')
        summary.add_param_summary(
            ('conv.*/W', ['histogram', 'rms']),
            ('fc.*/W', ['histogram', 'rms']))  # monitor all W
        summary.add_moving_summary(self.cost)

    def _build_graph(self, inputs):
        comb_state, action, reward, isOver, action_o = inputs
        self.batch_size = tf.shape(comb_state)[0]

        backward_offset = self.channel - self.update_step
        action = tf.slice(action, [0, backward_offset], [-1, self.update_step])
        reward = tf.slice(reward, [0, backward_offset], [-1, self.update_step])
        isOver = tf.slice(isOver, [0, backward_offset], [-1, self.update_step])
        action_o = tf.slice(action_o, [0, backward_offset, 0],
                            [-1, self.update_step, self.num_agents])

        action = tf.reshape(action, (self.batch_size * self.update_step, ))
        reward = tf.reshape(reward, (self.batch_size * self.update_step, ))
        isOver = tf.reshape(isOver, (self.batch_size * self.update_step, ))
        action_o = tf.reshape(
            action_o, (self.batch_size * self.update_step, self.num_agents))

        comb_state = tf.cast(comb_state, tf.float32)
        state = tf.slice(comb_state, [0, 0, 0, 0], [-1, -1, -1, self.channel],
                         name='state')

        self.predict_value, pi_value, self.q_rnn_state_out, self.pi_rnn_state_out = self._get_DQN_prediction(
            state)
        if not get_current_tower_context().is_training:
            return

        reward = tf.clip_by_value(reward, -1, 1)
        next_state = tf.slice(comb_state, [0, 0, 0, 1],
                              [-1, -1, -1, self.channel],
                              name='next_state')
        action_onehot = tf.one_hot(action, self.num_actions, 1.0, 0.0)

        pred_action_value = tf.reduce_sum(self.predict_value * action_onehot,
                                          1)  # N,
        max_pred_reward = tf.reduce_mean(tf.reduce_max(self.predict_value, 1),
                                         name='predict_reward')
        summary.add_moving_summary(max_pred_reward)

        with tf.variable_scope('target'), \
                collection.freeze_collection([tf.GraphKeys.TRAINABLE_VARIABLES]):
            targetQ_predict_value, target_pi_value, _, _ = self._get_DQN_prediction(
                next_state)  # NxA

        if self.method != 'Double':
            # DQN
            best_v = tf.reduce_max(targetQ_predict_value, 1)  # N,
        else:
            # Double-DQN
            sc = tf.get_variable_scope()
            with tf.variable_scope(sc, reuse=True):
                next_predict_value, next_pi_value, _, _ = self._get_DQN_prediction(
                    next_state)
            self.greedy_choice = tf.argmax(next_predict_value, 1)  # N,
            predict_onehot = tf.one_hot(self.greedy_choice, self.num_actions,
                                        1.0, 0.0)
            best_v = tf.reduce_sum(targetQ_predict_value * predict_onehot, 1)

        target = reward + (1.0 - tf.cast(
            isOver, tf.float32)) * self.gamma * tf.stop_gradient(best_v)

        # q cost
        q_cost = symbf.huber_loss(target - pred_action_value)
        # pi cost
        action_os = tf.unstack(action_o, self.num_agents, axis=1)
        action_o_one_hots = []
        for o in action_os:
            action_o_one_hots.append(tf.one_hot(o, self.num_actions, 1.0, 0.0))
        pi_costs = []
        for i, o in enumerate(action_o_one_hots):
            scale = 1.0
            # Coop-only: disable opponent loss
            if self.mt_type == 'coop-only' and i > 0:
                scale = 0.0
            # Opponent-only: disable collaborator loss
            if self.mt_type == 'opponent-only' and i == 0:
                scale = 0.0
            pi_costs.append(scale * tf.nn.softmax_cross_entropy_with_logits(
                labels=o, logits=pi_value[i]))
        pi_cost = self.lamb * tf.add_n(pi_costs)

        if self.reg:
            reg_coff = tf.stop_gradient(tf.sqrt(
                1.0 / (tf.reduce_mean(pi_cost) + 1e-9)),
                                        name='reg')
            self.cost = tf.reduce_mean(reg_coff * q_cost + pi_cost)
            summary.add_moving_summary(reg_coff)
        else:
            self.cost = tf.reduce_mean(q_cost + pi_cost)

        summary.add_param_summary(
            ('conv.*/W', ['histogram', 'rms']),
            ('fc.*/W', ['histogram', 'rms']))  # monitor all W
        summary.add_moving_summary(self.cost)
        summary.add_moving_summary(tf.reduce_mean(pi_cost, name='pi_cost'))
        summary.add_moving_summary(tf.reduce_mean(q_cost, name='q_cost'))

        for i, o_t in enumerate(action_os):
            pred = tf.argmax(pi_value[i], axis=1)
            summary.add_moving_summary(
                tf.contrib.metrics.accuracy(pred, o_t, name='acc-%d' % i))
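
The second _build_graph in Example #5 adds a supervised agent-modeling loss on top of the Q loss: each head in pi_value predicts another agent's action, and mt_type masks out either the opponent heads or the collaborator head (index 0, per the comments above). A minimal sketch of just that masking logic (a hypothetical standalone helper, not from the original code):

    def pi_loss_scales(mt_type, num_agents):
        # Head 0 models the collaborator; the remaining heads model opponents.
        scales = []
        for i in range(num_agents):
            if mt_type == 'coop-only' and i > 0:
                scales.append(0.0)   # drop the opponent heads
            elif mt_type == 'opponent-only' and i == 0:
                scales.append(0.0)   # drop the collaborator head
            else:
                scales.append(1.0)
        return scales

    print(pi_loss_scales('coop-only', 3))      # [1.0, 0.0, 0.0]
    print(pi_loss_scales('opponent-only', 3))  # [0.0, 1.0, 1.0]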