    def setOptimizer(self, name):
        # optimize
        with tf.variable_scope('optimizer_' + name):
            self.targets = tf.placeholder('float32', [None], name='target_q_t')
            self.actions = tf.placeholder('int64', [None], name='action')
            self.beta = tf.placeholder('float32', [None], name='beta')
            action_one_hot = tf.one_hot(self.actions, config.NUMBER_OF_ACTIONS, 1.0, 0.0, name='action_one_hot')
            q_acted = tf.reduce_sum(self.current_network.outputs * action_one_hot, reduction_indices=1, name='q_acted')
            self.delta = self.targets - q_acted
            self.delta = tf.where(tf.greater(self.delta, tf.constant(0.0)), self.delta, self.delta * self.beta)
            with tf.name_scope('loss'):
                self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')
            self.optim = tf.train.AdamOptimizer(config.LEARNING_RATE).minimize(self.loss)
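All of the examples on this page call a clipped_error helper that is not shown here. A minimal sketch of the usual Huber-style definition is given below; this is an assumption about its behavior, and the original helper may differ.

import tensorflow as tf

def clipped_error(x):
    # Huber-style loss: quadratic for |x| <= 1, linear beyond,
    # which keeps gradients bounded for large TD errors
    return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)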
Example #2
    def _build_optim(self):
        with tf.variable_scope('optimizer'):
            # we can evaluate this separately since we don't have to propagate errors
            # through it; fed in as r + gamma * Q_target(s', argmax_a' Q(s', a'))
            self.yDQN = tf.placeholder('float32', [None], name = 'yDQN')
            
            # find true q for action batch
            self.action = tf.placeholder('int32', [None], name = 'action')
            # batch, features, depth
            action_one_hot = tf.one_hot(self.action, self.action_size, axis = -1)

            # get q values for the action we chose, mask self.q with element wise mult
            # -> q for each batch
            q_for_step = tf.reduce_sum(tf.multiply(self.q_train, action_one_hot), 1)

            # get loss from the TD error
            self.loss = tf.reduce_mean(clipped_error(self.yDQN - q_for_step))
            # optimize
            #self.optim = tf.train.RMSPropOptimizer(0.0015, momentum = 0.90, epsilon = 1e-08).minimize(self.loss) 
            self.optim = tf.train.AdamOptimizer().minimize(self.loss) 
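The yDQN placeholder above is fed with targets computed outside the graph. A sketch of how such a batch of targets could be assembled is shown below; the function and variable names are illustrative, not from the original project.

import numpy as np

def make_targets(rewards, dones, q_target_batch, gamma=0.99):
    # q_target_batch: target-network q values for the next states, shape [batch, n_actions]
    # rewards, dones: arrays of shape [batch]
    # y = r for terminal transitions, r + gamma * max_a' Q_target(s', a') otherwise
    max_next_q = q_target_batch.max(axis=1)
    return rewards + gamma * (1.0 - dones.astype(np.float32)) * max_next_q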
Example #3
File: mydqn.py  Project: mxxhcm/code
    def __init__(self, env, env_name, sess=tf.InteractiveSession()):
        self.env = env
        self.env_name = env_name
        self.model_dir = self.env_name + "/"
        self.sess = sess
        self.replay_buffer = deque()
        self.state_dim = env.observation_space.shape
        self.height = 84
        self.width = 84
        self.action_dim = env.action_space.n
        self.hidden_dim = HIDDEN_UNITS

        self.ep_start = ep_start
        self.ep_end = ep_end
        self.ep_end_t = ep_end_t

        self.episodes = episodes
        self.episode_steps = steps

        self.gamma = GAMMA
        self.batch_size = BATCH_SIZE
        self.replay_buffer_size = REPLAY_BUFFER_SIZE
        self.epsilon = EPSILON

        with tf.variable_scope('step'):
            self.step_op = tf.Variable(0, trainable=False, name='step')
            self.step_input = tf.placeholder('int32', None, name='step_input')
            self.step_assign_op = self.step_op.assign(self.step_input)

        # create model
        self.initializer = tf.truncated_normal_initializer(0, 0.02)
        self.activation_fn = tf.nn.relu

        self.w = {}
        # model layers
        with tf.variable_scope('prediction'):
            # state
            self.state_input = tf.placeholder('float32',
                                              (None, ) + self.state_dim,
                                              name='s_t')
            self.state = tf.image.resize_images(self.state_input, [84, 64])
            self.state = tf.image.pad_to_bounding_box(self.state, 0, 10,
                                                      self.height, self.width)

            # cnn layers
            self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(
                self.state,
                32, [8, 8], [4, 4],
                initializer=self.initializer,
                activation_fn=self.activation_fn,
                name='l1')
            self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(
                self.l1,
                32, [4, 4], [2, 2],
                initializer=self.initializer,
                activation_fn=self.activation_fn,
                name='l2')
            self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(
                self.l2,
                32, [3, 3], [1, 1],
                initializer=self.initializer,
                activation_fn=self.activation_fn,
                name='l3')
            shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(
                self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

            # fc layers
            self.l4, self.w['l4_w'], self.w['l4_b'] = linear(
                self.l3_flat,
                self.hidden_dim,
                activation_fn=self.activation_fn,
                name='l4')
            self.q, self.w['l5_w'], self.w['l5_b'] = linear(self.l4,
                                                            self.action_dim,
                                                            name='q')

            # policy evaluation using max action
            self.q_action = tf.argmax(self.q, dimension=1)

            q_summary = []
            avg_q = tf.reduce_mean(self.q, 0)  # average q values over the batch
            for idx in range(self.action_dim):
                q_summary.append(tf.summary.histogram('q/%s' % idx,
                                                      avg_q[idx]))
            self.q_summary = tf.summary.merge(q_summary, 'q_summary')

        # optimizer
        with tf.variable_scope('optimizer'):
            # input action (one hot)
            self.action_one_hot = tf.placeholder("float",
                                                 [None, self.action_dim])
            ### input action (not one hot)
            # self.action_not_one_hot = tf.placeholder('int64', [None], name='action')
            ### action one hot
            # self.action_one_hot = tf.one_hot(self.action_not_one_hot, self.env.action_size, 1.0, 0.0, name='action_one_hot')

            # predicted q value, action is one hot representation
            self.predicted_q = tf.reduce_sum(tf.multiply(
                self.q, self.action_one_hot),
                                             reduction_indices=1,
                                             name='q_acted')
            # true value
            self.y = tf.placeholder("float", [None])
            # error
            self.delta = self.y - self.predicted_q
            # clipped loss function
            self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')

            self.global_step = tf.Variable(0, trainable=False)

            self.learning_rate = learning_rate
            self.learning_rate_step = tf.placeholder('int64',
                                                     None,
                                                     name='learning_rate_step')
            self.learning_rate_decay_step = learning_rate_decay_step
            self.learning_rate_decay = learning_rate_decay
            self.learning_rate_minimum = learning_rate_minimum

            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate_op,
                                                       momentum=0.95,
                                                       epsilon=0.01).minimize(
                                                           self.loss)

        with tf.variable_scope('summary'):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \
                'episode.max reward', 'episode.min reward', 'episode.avg reward', 'episode.num of game', 'training.learning_rate']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    'float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = tf.summary.scalar(
                    "%s/%s" % (self.env_name, tag),
                    self.summary_placeholders[tag])

            histogram_summary_tags = ['episode.rewards', 'episode.actions']

            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(
                    'float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = tf.summary.histogram(
                    tag, self.summary_placeholders[tag])

            self.writer = tf.summary.FileWriter('./logs/%s' % self.model_dir,
                                                self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op],
                                     max_to_keep=30)
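The conv2d and linear helpers used here (and in the examples below) are defined elsewhere in these projects. A minimal sketch compatible with the calls on this page, returning the output tensor together with its weight and bias variables, follows; the padding, initializers, and variable names are assumptions, and the actual helpers may differ.

import tensorflow as tf

def conv2d(x, output_dim, kernel_size, stride,
           initializer=tf.truncated_normal_initializer(0, 0.02),
           activation_fn=tf.nn.relu, data_format='NHWC',
           name='conv2d', reuse=False):
    # returns (output, weights, biases) so callers can collect the variables
    with tf.variable_scope(name, reuse=reuse):
        if data_format == 'NCHW':
            stride = [1, 1, stride[0], stride[1]]
            kernel_shape = [kernel_size[0], kernel_size[1],
                            x.get_shape().as_list()[1], output_dim]
        else:
            stride = [1, stride[0], stride[1], 1]
            kernel_shape = [kernel_size[0], kernel_size[1],
                            x.get_shape().as_list()[-1], output_dim]
        w = tf.get_variable('w', kernel_shape, tf.float32, initializer=initializer)
        conv = tf.nn.conv2d(x, w, stride, 'VALID', data_format=data_format)
        b = tf.get_variable('b', [output_dim],
                            initializer=tf.constant_initializer(0.0))
        out = tf.nn.bias_add(conv, b, data_format)
        if activation_fn is not None:
            out = activation_fn(out)
        return out, w, b

def linear(input_, output_size, stddev=0.02, bias_start=0.0,
           activation_fn=None, name='linear', reuse=False):
    # fully connected layer; also returns its weight and bias variables
    shape = input_.get_shape().as_list()
    with tf.variable_scope(name, reuse=reuse):
        w = tf.get_variable('Matrix', [shape[1], output_size], tf.float32,
                            tf.random_normal_initializer(stddev=stddev))
        b = tf.get_variable('bias', [output_size],
                            initializer=tf.constant_initializer(bias_start))
        out = tf.nn.bias_add(tf.matmul(input_, w), b)
        if activation_fn is not None:
            out = activation_fn(out)
        return out, w, b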
Example #4
  def build_dqn(self):
    self.w = {}
    self.t_w = {}

    #initializer = tf.contrib.layers.xavier_initializer()
    initializer = tf.truncated_normal_initializer(0, 0.02)
    activation_fn = tf.nn.relu

    # training network
    with tf.variable_scope('prediction'):
      if self.cnn_format == 'NHWC':
        self.s_t = tf.placeholder('float32',
            [None, self.screen_height, self.screen_width, self.history_length], name='s_t')
      else:
        self.s_t = tf.placeholder('float32',
            [None, self.history_length, self.screen_height, self.screen_width], name='s_t')

      self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.s_t,
          32, [8, 8], [4, 4], initializer, activation_fn, self.cnn_format, name='l1')
      self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1,
          64, [4, 4], [2, 2], initializer, activation_fn, self.cnn_format, name='l2')
      self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2,
          64, [3, 3], [1, 1], initializer, activation_fn, self.cnn_format, name='l3')

      shape = self.l3.get_shape().as_list()
      self.l3_flat = tf.reshape(self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

      if self.dueling:
        self.value_hid, self.w['l4_val_w'], self.w['l4_val_b'] = \
            linear(self.l3_flat, 512, activation_fn=activation_fn, name='value_hid')

        self.adv_hid, self.w['l4_adv_w'], self.w['l4_adv_b'] = \
            linear(self.l3_flat, 512, activation_fn=activation_fn, name='adv_hid')

        self.value, self.w['val_w_out'], self.w['val_w_b'] = \
          linear(self.value_hid, 1, name='value_out')

        self.advantage, self.w['adv_w_out'], self.w['adv_w_b'] = \
          linear(self.adv_hid, self.env.action_size, name='adv_out')

        # Average Dueling
        self.q = self.value + (self.advantage -
          tf.reduce_mean(self.advantage, reduction_indices=1, keep_dims=True))
      else:
        self.l4, self.w['l4_w'], self.w['l4_b'] = linear(self.l3_flat, 512, activation_fn=activation_fn, name='l4')
        self.q, self.w['q_w'], self.w['q_b'] = linear(self.l4, self.env.action_size, name='q')

      self.q_action = tf.argmax(self.q, dimension=1)

      q_summary = []
      avg_q = tf.reduce_mean(self.q, 0)
      for idx in range(self.env.action_size):
        q_summary.append(tf.summary.histogram('q/%s' % idx, avg_q[idx]))
      self.q_summary = tf.summary.merge(q_summary, 'q_summary')

    # target network
    with tf.variable_scope('target'):
      if self.cnn_format == 'NHWC':
        self.target_s_t = tf.placeholder('float32',
            [None, self.screen_height, self.screen_width, self.history_length], name='target_s_t')
      else:
        self.target_s_t = tf.placeholder('float32',
            [None, self.history_length, self.screen_height, self.screen_width], name='target_s_t')

      self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d(self.target_s_t,
          32, [8, 8], [4, 4], initializer, activation_fn, self.cnn_format, name='target_l1')
      self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d(self.target_l1,
          64, [4, 4], [2, 2], initializer, activation_fn, self.cnn_format, name='target_l2')
      self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d(self.target_l2,
          64, [3, 3], [1, 1], initializer, activation_fn, self.cnn_format, name='target_l3')

      shape = self.target_l3.get_shape().as_list()
      self.target_l3_flat = tf.reshape(self.target_l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

      if self.dueling:
        self.t_value_hid, self.t_w['l4_val_w'], self.t_w['l4_val_b'] = \
            linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_value_hid')

        self.t_adv_hid, self.t_w['l4_adv_w'], self.t_w['l4_adv_b'] = \
            linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_adv_hid')

        self.t_value, self.t_w['val_w_out'], self.t_w['val_w_b'] = \
          linear(self.t_value_hid, 1, name='target_value_out')

        self.t_advantage, self.t_w['adv_w_out'], self.t_w['adv_w_b'] = \
          linear(self.t_adv_hid, self.env.action_size, name='target_adv_out')

        # Average Dueling
        self.target_q = self.t_value + (self.t_advantage -
          tf.reduce_mean(self.t_advantage, reduction_indices=1, keep_dims=True))
      else:
        self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \
            linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_l4')
        self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \
            linear(self.target_l4, self.env.action_size, name='target_q')

      self.target_q_idx = tf.placeholder('int32', [None, None], 'outputs_idx')
      self.target_q_with_idx = tf.gather_nd(self.target_q, self.target_q_idx)

    with tf.variable_scope('pred_to_target'):
      self.t_w_input = {}
      self.t_w_assign_op = {}

      for name in self.w.keys():
        self.t_w_input[name] = tf.placeholder('float32', self.t_w[name].get_shape().as_list(), name=name)
        self.t_w_assign_op[name] = self.t_w[name].assign(self.t_w_input[name])

    # optimizer
    with tf.variable_scope('optimizer'):
      self.target_q_t = tf.placeholder('float32', [None], name='target_q_t')
      self.action = tf.placeholder('int64', [None], name='action')

      action_one_hot = tf.one_hot(self.action, self.env.action_size, 1.0, 0.0, name='action_one_hot')
      q_acted = tf.reduce_sum(self.q * action_one_hot, reduction_indices=1, name='q_acted')

      self.delta = self.target_q_t - q_acted

      self.global_step = tf.Variable(0, trainable=False)

      self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')
      self.learning_rate_step = tf.placeholder('int64', None, name='learning_rate_step')
      self.learning_rate_op = tf.maximum(self.learning_rate_minimum,
          tf.train.exponential_decay(
              self.learning_rate,
              self.learning_rate_step,
              self.learning_rate_decay_step,
              self.learning_rate_decay,
              staircase=True))
      self.optim = tf.train.RMSPropOptimizer(
          self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss)

    with tf.variable_scope('summary'):
      scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \
          'episode.max reward', 'episode.min reward', 'episode.avg reward', 'episode.num of game', 'training.learning_rate']

      self.summary_placeholders = {}
      self.summary_ops = {}

      for tag in scalar_summary_tags:
        self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
        self.summary_ops[tag]  = tf.summary.scalar("%s-%s/%s" % (self.env_name, self.env_type, tag), self.summary_placeholders[tag])

      histogram_summary_tags = ['episode.rewards', 'episode.actions']

      for tag in histogram_summary_tags:
        self.summary_placeholders[tag] = tf.placeholder('float32', None, name=tag.replace(' ', '_'))
        self.summary_ops[tag]  = tf.summary.histogram(tag, self.summary_placeholders[tag])

      self.writer = tf.summary.FileWriter('./logs/%s' % self.model_dir, self.sess.graph)

    tf.global_variables_initializer().run()
    # print('self.w.values()',self.w.values())

    self._saver = tf.train.Saver(list(self.w.values()) + [self.step_op], max_to_keep=30)

    self.load_model()
    self.update_target_q_network()
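The target_q_idx / target_q_with_idx pair above supports a Double-DQN style target, where the online network picks the greedy action and the target network evaluates it. A sketch of how those tensors could be used when building training targets is shown below; names such as s_t_plus_1, reward, terminal, and self.discount are illustrative, not from the original class.

    # online network chooses the greedy action for each next state
    pred_action = self.q_action.eval({self.s_t: s_t_plus_1})

    # target network evaluates the chosen actions via gather_nd
    q_t_plus_1 = self.target_q_with_idx.eval({
        self.target_s_t: s_t_plus_1,
        self.target_q_idx: [[idx, a] for idx, a in enumerate(pred_action)],
    })

    # y = r + gamma * Q_target(s', argmax_a Q(s', a)), zeroed for terminal states
    target_q_t = (1.0 - terminal) * self.discount * q_t_plus_1 + reward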
Example #5
    def build_model(self):
        # model layers
        with tf.variable_scope(self.model_name + "_" + 'prediction'):
            self.state = tf.placeholder('float32', [None, 84, 84, 3],
                                        name='s_t')
            # input action (one hot)
            self.action_one_hot = tf.placeholder("float",
                                                 [None, self.action_dim])
            self.next_state = tf.placeholder('float32', (None, 84, 84, 3),
                                             name='s_t_1')
            self.reward = tf.placeholder('float32', (None, ), name='reward')
            self.done = tf.placeholder('int32', (None, ), name='done')
            self.times = tf.placeholder('float32', (None, ), name='timesteps')

            # cnn layers
            self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(
                self.state,
                5, [2, 2], [1, 1],
                initializer=self.initializer,
                activation_fn=self.activation_fn,
                name='l1',
                reuse=False)
            self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(
                self.l1,
                10, [3, 3], [1, 1],
                initializer=self.initializer,
                activation_fn=self.activation_fn,
                name='l2',
                reuse=False)
            self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(
                self.l2,
                10, [3, 3], [1, 1],
                initializer=self.initializer,
                activation_fn=self.activation_fn,
                name='l3',
                reuse=False)
            shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(
                self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

            # fc layers
            self.q, self.w['l4_w'], self.w['l4_b'] = linear(self.l3_flat,
                                                            self.action_dim,
                                                            name='q',
                                                            reuse=False)

        # optimizer
        with tf.variable_scope(self.model_name + "_" + 'optimizer'):
            # predicted q value for the chosen action (one-hot mask)
            self.predicted_q = tf.boolean_mask(self.q,
                                               tf.cast(self.action_one_hot, tf.bool))

            # compute the soft state value v from the q values of pi0 and pi_i
            # pi0 = self.shared_policy.select_action(self.next_state)
            self.pi0_prob = tf.placeholder(tf.float32, [None, self.action_dim],
                                           name="pi0")
            self.next_q = tf.placeholder(tf.float32, [None, self.action_dim],
                                         name="next_q")
            self.v = tf.log(
                tf.reduce_sum(
                    tf.pow(self.pi0_prob, self.alpha) *
                    tf.exp(self.beta * self.next_q),
                    axis=1)) / self.beta

            # target (true) value: r for terminal transitions, r + gamma * v otherwise
            not_done = 1.0 - tf.cast(self.done, tf.float32)
            self.y = self.reward + self.gamma * self.v * not_done

            # error
            self.delta = self.y - self.predicted_q
            # clipped loss function
            self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')

            self.global_step = tf.Variable(0, dtype=tf.int64, trainable=False)

            self.learning_rate = learning_rate
            self.learning_rate_step = tf.placeholder('int64',
                                                     None,
                                                     name='learning_rate_step')
            self.learning_rate_decay_step = learning_rate_decay_step
            self.learning_rate_decay = learning_rate_decay
            self.learning_rate_minimum = learning_rate_minimum

            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate_op,
                                                       momentum=0.95,
                                                       epsilon=0.01).minimize(
                                                           self.loss)
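A training step for this model feeds the placeholders defined above. A sketch is given below; the batch arrays and the step counter are illustrative names, not part of the original project.

        _, loss_value = self.sess.run(
            [self.optimizer, self.loss],
            feed_dict={
                self.state: state_batch,            # [batch, 84, 84, 3]
                self.action_one_hot: action_batch,  # one-hot, [batch, action_dim]
                self.reward: reward_batch,
                self.done: done_batch,
                self.pi0_prob: pi0_batch,           # shared-policy probabilities for s'
                self.next_q: next_q_batch,          # this network's q values for s'
                self.learning_rate_step: step,
            })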
Example #6
File: agent.py  Project: mxxhcm/code
    def build_dqn(self):
        self.w = {}  # weights
        self.t_w = {}  # target weights

        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        with tf.variable_scope('prediction'):
            if self.state_format == 'NHWC':
                self.s_t = tf.placeholder('float32', [
                    None, self.screen_height, self.screen_width,
                    self.history_length
                ],
                                          name='s_t')
            else:
                self.s_t = tf.placeholder('float32', [
                    None, self.history_length, self.screen_height,
                    self.screen_width
                ],
                                          name='s_t')

            self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(
                self.s_t,
                32, [8, 8], [4, 4],
                initializer=initializer,
                activation_fn=activation_fn,
                data_format=self.state_format,
                name='l1')
            self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(
                self.l1,
                32, [8, 8], [4, 4],
                initializer=initializer,
                activation_fn=activation_fn,
                data_format=self.state_format,
                name='l2')
            self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(
                self.l2,
                32, [8, 8], [4, 4],
                initializer=initializer,
                activation_fn=activation_fn,
                data_format=self.state_format,
                name='l3')
            shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(
                self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

            self.l4, self.w['l4_w'], self.w['l4_b'] = linear(
                self.l3_flat, 512, activation_fn=activation_fn, name='l4')
            self.q, self.w['l5_w'], self.w['l5_b'] = linear(
                self.l4, self.env.action_size, name='q')

            # policy evaluation using max action
            self.q_action = tf.argmax(self.q, dimension=1)

            q_summary = []
            avg_q = tf.reduce_mean(self.q, 0)  # average q values over the batch
            for idx in range(self.env.action_size):
                q_summary.append(tf.summary.histogram('q/%s' % idx,
                                                      avg_q[idx]))
            self.q_summary = tf.summary.merge(q_summary, 'q_summary')

            # target network
            with tf.variable_scope('target'):
                if self.state_format == 'NHWC':
                    self.target_s_t = tf.placeholder('float32', [
                        None, self.screen_height, self.screen_width,
                        self.history_length
                    ],
                                                     name='target_s_t')
                else:
                    self.target_s_t = tf.placeholder('float32', [
                        None, self.history_length, self.screen_height,
                        self.screen_width
                    ],
                                                     name='target_s_t')

                self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d(
                    self.target_s_t,
                    32, [8, 8], [4, 4],
                    initializer,
                    activation_fn,
                    self.state_format,
                    name='target_l1')
                self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d(
                    self.target_l1,
                    64, [4, 4], [2, 2],
                    initializer,
                    activation_fn,
                    self.state_format,
                    name='target_l2')
                self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d(
                    self.target_l2,
                    64, [3, 3], [1, 1],
                    initializer,
                    activation_fn,
                    self.state_format,
                    name='target_l3')

                shape = self.target_l3.get_shape().as_list()
                self.target_l3_flat = tf.reshape(
                    self.target_l3,
                    [-1, reduce(lambda x, y: x * y, shape[1:])])

                self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \
                    linear(self.target_l3_flat, 512, activation_fn=activation_fn, name='target_l4')
                self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \
                    linear(self.target_l4, self.env.action_size, name='target_q')

            with tf.variable_scope('pred_to_target'):
                self.t_w_input = {}
                self.t_w_assign_op = {}

                for name in self.t_w.keys():
                    self.t_w_input[name] = tf.placeholder(
                        'float32',
                        self.t_w[name].get_shape().as_list(),
                        name=name)
                    self.t_w_assign_op[name] = self.t_w[name].assign(
                        self.t_w_input[name])

            with tf.variable_scope('optimizer'):
                self.target_q_t = tf.placeholder(
                    'float32', [None], name='target_q_t')  # target q at time t

                # self.q_action.eval(s_t)
                self.action = tf.placeholder('int64', [None], name='action')
                action_one_hot = tf.one_hot(self.action,
                                            self.env.action_size,
                                            1.0,
                                            0.0,
                                            name='action_one_hot')
                q_acted = tf.reduce_sum(self.q * action_one_hot,
                                        reduction_indices=1,
                                        name='q_acted')

                self.delta = self.target_q_t - q_acted  # TD error: target q - predicted q
                self.global_step = tf.Variable(0, trainable=False)

                self.loss = tf.reduce_mean(clipped_error(self.delta),
                                           name='loss')
                self.learning_rate_step = tf.placeholder('int64',
                                                         None,
                                                         name='lr_rate_step')
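The listing is truncated at this point. Judging from the other examples on this page, the optimizer block presumably continues with the decayed learning rate and an RMSProp update, along the lines of the sketch below (not the project's exact code).

                self.learning_rate_op = tf.maximum(
                    self.learning_rate_minimum,
                    tf.train.exponential_decay(self.learning_rate,
                                               self.learning_rate_step,
                                               self.learning_rate_decay_step,
                                               self.learning_rate_decay,
                                               staircase=True))
                self.optim = tf.train.RMSPropOptimizer(
                    self.learning_rate_op, momentum=0.95,
                    epsilon=0.01).minimize(self.loss)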
Example #7
    def build(self):
        self.w = {}
        self.t_w = {}

        initializer = tf.truncated_normal_initializer(0, 0.02)
        activation_fn = tf.nn.relu

        with tf.variable_scope('prediction'):
            if (self.cnn_format == 'NHWC'):
                self.s_t = tf.placeholder('float32', [
                    None, self.screen_height, self.screen_width,
                    self.history_length
                ],
                                          name='s_t')
            else:
                self.s_t = tf.placeholder('float32', [
                    None, self.history_length, self.screen_height,
                    self.screen_width
                ],
                                          name='s_t')

            self.l1, self.w['l1_w'], self.w['l1_b'] = conv2d(self.s_t,
                                                             32, [8, 8],
                                                             [4, 4],
                                                             initializer,
                                                             activation_fn,
                                                             self.cnn_format,
                                                             name='l1')
            self.l2, self.w['l2_w'], self.w['l2_b'] = conv2d(self.l1,
                                                             64, [4, 4],
                                                             [2, 2],
                                                             initializer,
                                                             activation_fn,
                                                             self.cnn_format,
                                                             name='l2')
            self.l3, self.w['l3_w'], self.w['l3_b'] = conv2d(self.l2,
                                                             64, [3, 3],
                                                             [1, 1],
                                                             initializer,
                                                             activation_fn,
                                                             self.cnn_format,
                                                             name='l3')

            shape = self.l3.get_shape().as_list()
            self.l3_flat = tf.reshape(
                self.l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

            self.l4, self.w['l4_w'], self.w['l4_b'] = linear(
                self.l3_flat, 512, activation_fn=activation_fn, name='l4')
            self.q, self.w['q_w'], self.w['q_b'] = linear(self.l4,
                                                          self.action_size,
                                                          name='q')

            self.q_action = tf.argmax(self.q, dimension=1)

        with tf.variable_scope('target'):
            if (self.cnn_format == 'NHWC'):
                self.target_s_t = tf.placeholder('float32', [
                    None, self.screen_height, self.screen_width,
                    self.history_length
                ],
                                                 name='target_s_t')
            else:
                self.target_s_t = tf.placeholder('float32', [
                    None, self.history_length, self.screen_height,
                    self.screen_width
                ],
                                                 name='target_s_t')

            self.target_l1, self.t_w['l1_w'], self.t_w['l1_b'] = conv2d(
                self.target_s_t,
                32, [8, 8], [4, 4],
                initializer,
                activation_fn,
                self.cnn_format,
                name='target_l1')
            self.target_l2, self.t_w['l2_w'], self.t_w['l2_b'] = conv2d(
                self.target_l1,
                64, [4, 4], [2, 2],
                initializer,
                activation_fn,
                self.cnn_format,
                name='target_l2')
            self.target_l3, self.t_w['l3_w'], self.t_w['l3_b'] = conv2d(
                self.target_l2,
                64, [3, 3], [1, 1],
                initializer,
                activation_fn,
                self.cnn_format,
                name='target_l3')

            shape = self.target_l3.get_shape().as_list()
            self.target_l3_flat = tf.reshape(
                self.target_l3, [-1, reduce(lambda x, y: x * y, shape[1:])])

            self.target_l4, self.t_w['l4_w'], self.t_w['l4_b'] = \
                    linear(self.target_l3_flat, 512,
                           activation_fn=activation_fn, name='target_l4')
            self.target_q, self.t_w['q_w'], self.t_w['q_b'] = \
                    linear(self.target_l4, self.action_size, name='target_q')

            self.target_q_idx = tf.placeholder('int32', [None, None],
                                               'outputs_idx')
            self.target_q_with_idx = tf.gather_nd(self.target_q,
                                                  self.target_q_idx)

        with tf.variable_scope('pred_to_target'):
            self.t_w_input = {}
            self.t_w_assign_op = {}

            for name in self.w.keys():
                self.t_w_input[name] = tf.placeholder(
                    'float32', self.t_w[name].get_shape().as_list(), name=name)
                self.t_w_assign_op[name] = self.t_w[name].assign(
                    self.t_w_input[name])

        with tf.variable_scope('optimiser'):
            self.target_q_t = tf.placeholder('float32', [None],
                                             name='target_q_t')
            self.action = tf.placeholder('int64', [None], name='action')

            action_one_hot = tf.one_hot(self.action,
                                        self.action_size,
                                        1.,
                                        0.,
                                        name='action_one_hot')
            q_acted = tf.reduce_sum(self.q * action_one_hot,
                                    reduction_indices=1,
                                    name='q_acted')

            self.delta = self.target_q_t - q_acted

            self.global_step = tf.Variable(0, trainable=False)

            self.loss = tf.reduce_mean(clipped_error(self.delta), name='loss')
            self.learning_rate_step = tf.placeholder('int64',
                                                     None,
                                                     name='learning_rate_step')
            self.learning_rate_op = tf.maximum(
                self.learning_rate_minimum,
                tf.train.exponential_decay(self.learning_rate,
                                           self.learning_rate_step,
                                           self.learning_rate_decay_step,
                                           self.learning_rate_decay,
                                           staircase=True))
            self.optim = tf.train.RMSPropOptimizer(self.learning_rate_op,
                                                   momentum=0.95,
                                                   epsilon=0.01).minimize(
                                                       self.loss)

        self.sess.run(tf.global_variables_initializer())
        #self._saver = tf.train.Saver(self.w.values() + [self.step_op], max_to_keep=30)

        #self.load_model()
        self.update_target_q_network()
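The update_target_q_network call above relies on the assign ops built in the pred_to_target scope. A sketch of what such a method typically looks like, based only on the tensors defined in this example and not on the project's exact code:

    def update_target_q_network(self):
        # copy every prediction-network weight into the matching target variable
        for name in self.w.keys():
            self.sess.run(self.t_w_assign_op[name],
                          feed_dict={self.t_w_input[name]: self.sess.run(self.w[name])})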