def build_net(minimap, screen, info, msize, ssize, num_action):
  # Extract features
  screen_filters1 = tf.get_variable(name='sf1', shape=(5, 5, U.screen_channel(), 16))  # hwio
  screen_filters2 = tf.get_variable(name='sf2', shape=(3, 3, 16, 32))
  minimap_filters1 = tf.get_variable(name='mmf1', shape=(5, 5, U.minimap_channel(), 16))
  minimap_filters2 = tf.get_variable(name='mmf2', shape=(3, 3, 16, 32))
  
  mconv1 = tf.nn.conv2d(tf.transpose(minimap, [0, 2, 3, 1]), minimap_filters1, strides=[1, 1, 1, 1], padding='SAME', name='mconv1')
  mconv2 = tf.nn.conv2d(mconv1, minimap_filters2, strides=[1, 1, 1, 1], padding='SAME', name='mconv2')
  sconv1 = tf.nn.conv2d(tf.transpose(screen, [0, 2, 3, 1]), screen_filters1, strides=[1, 1, 1, 1], padding='SAME', name='sconv1')
  sconv2 = tf.nn.conv2d(sconv1, screen_filters2, strides=[1, 1, 1, 1], padding='SAME', name='sconv2')
  info_fc = layers.fully_connected(layers.flatten(info), num_outputs=256, activation_fn=tf.tanh, scope='info_fc')

  # Compute spatial actions
  feat_conv = tf.concat([mconv2, sconv2], axis=3)

  spatial_weights = tf.get_variable(name='spatial_weights', shape=(1, 1, feat_conv.get_shape()[-1], 1))
  spatial_action = tf.nn.conv2d(feat_conv, spatial_weights, strides=[1, 1, 1, 1], padding='SAME', name='spatial_action')
  spatial_action = tf.nn.softmax(layers.flatten(spatial_action))

  # Compute non spatial actions and value
  feat_fc = tf.concat([layers.flatten(mconv2), layers.flatten(sconv2), info_fc], axis=1)
  feat_fc = layers.fully_connected(feat_fc, num_outputs=256, activation_fn=tf.nn.relu, scope='feat_fc')
  non_spatial_action = layers.fully_connected(feat_fc, num_outputs=num_action, activation_fn=tf.nn.softmax, scope='non_spatial_action')
  value = tf.reshape(layers.fully_connected(feat_fc, num_outputs=1, activation_fn=None, scope='value'), [-1])

  return spatial_action, non_spatial_action, value 
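A minimal usage sketch (not part of the original example) of how the flattened spatial softmax returned above maps back to screen coordinates, assuming the row-major y * ssize + x layout produced by flattening the NHWC feature map; the probability row is assumed to come from a session run of spatial_action:

import numpy as np

def pick_spatial_target(spatial_action_probs, ssize):
  # spatial_action_probs: one row of the softmax output, length ssize**2
  flat_index = int(np.argmax(spatial_action_probs))
  y, x = divmod(flat_index, ssize)  # row-major layout: index = y * ssize + x
  return x, y

# A uniform distribution over a 64x64 screen resolves to pixel (0, 0)
print(pick_spatial_target(np.full(64 * 64, 1.0 / (64 * 64)), 64))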
Example #2
  def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):  # enter both the variable scope and the device context
      if reuse:
        tf.get_variable_scope().reuse_variables()
        assert tf.get_variable_scope().reuse

      # Set inputs of networks
      self.minimap = tf.placeholder(tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
      self.screen = tf.placeholder(tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
      self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

      # Build networks
      net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)
      self.spatial_action, self.non_spatial_action, self.value = net

      # Set targets and masks
      self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
      self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
      self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
      self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
      self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

      # Compute log probability
      spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
      spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
      non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
      valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
      valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
      non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
      non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
      self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
      self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

      # Compute losses, more details in https://arxiv.org/abs/1602.01783
      # Policy loss and value loss
      action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
      advantage = tf.stop_gradient(self.value_target - self.value)
      policy_loss = - tf.reduce_mean(action_log_prob * advantage)
      value_loss = - tf.reduce_mean(self.value * advantage)
      self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
      self.summary.append(tf.summary.scalar('value_loss', value_loss))

      # TODO: policy penalty
      loss = policy_loss + value_loss

      # Build the optimizer
      self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
      opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
      grads = opt.compute_gradients(loss)
      cliped_grad = []
      for grad, var in grads:
        self.summary.append(tf.summary.histogram(var.op.name, var))
        self.summary.append(tf.summary.histogram(var.op.name+'/grad', grad))
        grad = tf.clip_by_norm(grad, 10.0)
        cliped_grad.append([grad, var])
      self.train_op = opt.apply_gradients(cliped_grad)
      self.summary_op = tf.summary.merge(self.summary)

      self.saver = tf.train.Saver(max_to_keep=100)
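For reference, a hedged sketch of how the mask placeholders above might be filled before a training step; the rollout length, function ids, and coordinates below are made-up illustration values, and the arrays would be passed via feed_dict together with minimap, screen, info, value_target, and learning_rate:

import numpy as np

n_steps, ssize, num_actions = 8, 64, 543  # hypothetical rollout length and sizes

valid_spatial_action = np.zeros(n_steps, dtype=np.float32)
spatial_action_selected = np.zeros((n_steps, ssize ** 2), dtype=np.float32)
valid_non_spatial_action = np.zeros((n_steps, num_actions), dtype=np.float32)
non_spatial_action_selected = np.zeros((n_steps, num_actions), dtype=np.float32)

# Suppose step 0 used function id 331 at pixel (x=10, y=20), and ids 0, 1 and 331 were available.
non_spatial_action_selected[0, 331] = 1.0
valid_non_spatial_action[0, [0, 1, 331]] = 1.0
valid_spatial_action[0] = 1.0
spatial_action_selected[0, 20 * ssize + 10] = 1.0  # one-hot over the flattened y * ssize + x grid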
Example #4
def main(unused_argv):
    #config = tf.ConfigProto(allow_soft_placement=True)
    #print(type(config))
    #print(config.gpu_options)
    print(type(actions.FUNCTIONS))
    print(len(actions.FUNCTIONS))
    print(len(features.MINIMAP_FEATURES))
    print('minimap channel is', U.minimap_channel())
    print('screen channel is', U.screen_channel())
    print('player id index in minimap features is', features.MINIMAP_FEATURES.player_id.index)
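U.minimap_channel() and U.screen_channel() are printed above, but the utils module is not shown. A plausible sketch of how such channel counts can be derived from the pysc2 feature specs follows; this is an assumption rather than the original utils code, and the original may special-case layers such as player_id (note the player_id.index print above):

from pysc2.lib import features

def feature_channels(feature_spec):
  # One channel per scalar feature; categorical features expand to `scale` one-hot channels.
  count = 0
  for feat in feature_spec:
    if feat.type == features.FeatureType.SCALAR:
      count += 1
    else:
      count += feat.scale
  return count

print('minimap channels:', feature_channels(features.MINIMAP_FEATURES))
print('screen channels:', feature_channels(features.SCREEN_FEATURES))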
Example #5
  def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):
      if reuse:
        tf.get_variable_scope().reuse_variables()
        assert tf.get_variable_scope().reuse

      # Set inputs of networks
      self.minimap = tf.placeholder(tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
      self.screen = tf.placeholder(tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
      self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

      # Build networks
      net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)
      self.spatial_action, self.non_spatial_action, self.value = net

      # Set targets and masks
      self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
      self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
      self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
      self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
      self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

      # Compute log probability
      spatial_action_prob = tf.clip_by_value(tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1), 1e-10, 1.)
      non_spatial_action_prob = tf.clip_by_value(tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected * self.valid_non_spatial_action, axis=1), 1e-10, 1.)

      q_value = spatial_action_prob * self.valid_spatial_action * self.ispatial + non_spatial_action_prob
      self.delta = self.value_target - q_value
      #self.clipped_error = tf.where(tf.abs(self.delta) < 1.0, 0.5 * tf.square(self.delta), tf.abs(self.delta) - 0.5, name='clipped_error')
      #value_loss = tf.reduce_mean(self.clipped_error, name='value_loss')
      
      value_loss = tf.reduce_mean(tf.square(self.delta))

      self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
      self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))
      self.summary.append(tf.summary.scalar('value_loss', value_loss))
      
      # Build the optimizer
      self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
      opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
      grads = opt.compute_gradients(value_loss)
      cliped_grad = []
      for grad, var in grads:
        self.summary.append(tf.summary.histogram(var.op.name, var))
        grad = grad if grad is not None else tf.zeros_like(var)
        self.summary.append(tf.summary.histogram(var.op.name+'/grad', grad))
        grad = tf.clip_by_norm(grad, 10.0)
        cliped_grad.append([grad, var])
      self.train_op = opt.apply_gradients(cliped_grad)
      self.summary_op = tf.summary.merge(self.summary)

      self.saver = tf.train.Saver(max_to_keep=100)
Example #6
    def build_model(self, reuse, dev, ntype):
        with tf.variable_scope(self.name), tf.device(dev):
            if reuse:
                tf.get_variable_scope().reuse_variables()
                assert tf.get_variable_scope().reuse

            # Set inputs of networks
            self.minimap = tf.placeholder(
                tf.float32,
                [None, U.minimap_channel(), self.msize, self.msize],
                name='minimap')
            self.screen = tf.placeholder(
                tf.float32,
                [None, U.screen_channel(), self.ssize, self.ssize],
                name='screen')
            self.info = tf.placeholder(tf.float32, [None, self.isize],
                                       name='info')

            # create master and subpolicies
            self.subpolicy_Q = build_net(self.minimap, self.screen, self.info,
                                         self.msize, self.ssize, num_units + 2,
                                         'master_policy')

            # Set targets and masks for master policy update
            self.learning_rate = tf.placeholder(tf.float32,
                                                None,
                                                name='learning_rate')

            self.action_input = tf.placeholder("float", [None, num_units + 2])
            self.y_input = tf.placeholder("float", [None])
            self.Q_action = tf.reduce_sum(tf.multiply(self.subpolicy_Q,
                                                      self.action_input),
                                          reduction_indices=1)
            self.cost = tf.reduce_mean(tf.square(self.y_input - self.Q_action))
            self.master_train_op = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.cost)

            # Set targets and masks for subpolicies update
            self.valid_spatial_action = tf.placeholder(
                tf.float32, [None], name='valid_spatial_action_')
            self.spatial_action_selected = tf.placeholder(
                tf.float32, [None, self.ssize**2],
                name='spatial_action_selected')
            self.valid_non_spatial_action = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='valid_non_spatial_action_')
            self.non_spatial_action_selected = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='non_spatial_action_selected_')
            self.value_target = tf.placeholder(tf.float32, [None],
                                               name='value_target_')

            # Build the optimizer
            opt = tf.train.AdamOptimizer(self.learning_rate)

            self.subpolicy = build_net(self.minimap, self.screen, self.info,
                                       self.msize, self.ssize,
                                       len(actions.FUNCTIONS), 'fcn')
            self.spatial_action, self.non_spatial_action, self.value = self.subpolicy

            # Compute log probability
            spatial_action_prob = tf.reduce_sum(self.spatial_action *
                                                self.spatial_action_selected,
                                                axis=1)
            spatial_action_log_prob = tf.log(
                tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
            non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.non_spatial_action_selected,
                axis=1)
            valid_non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.valid_non_spatial_action,
                axis=1)
            valid_non_spatial_action_prob = tf.clip_by_value(
                valid_non_spatial_action_prob, 1e-10, 1.)
            non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
            non_spatial_action_log_prob = tf.log(
                tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
            self.summary.append(
                tf.summary.histogram('spatial_action_prob_',
                                     spatial_action_prob))
            self.summary.append(
                tf.summary.histogram('non_spatial_action_prob_',
                                     non_spatial_action_prob))

            # Compute losses, more details in https://arxiv.org/abs/1602.01783
            # Policy loss and value loss
            action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
            advantage = tf.stop_gradient(self.value_target - self.value)
            policy_loss = -tf.reduce_mean(action_log_prob * advantage)
            value_loss = -tf.reduce_mean(self.value * advantage)

            self.summary.append(tf.summary.scalar('policy_loss_', policy_loss))
            self.summary.append(tf.summary.scalar('value_loss_', value_loss))

            # TODO: policy penalty
            loss = policy_loss + value_loss

            grads = opt.compute_gradients(loss)
            cliped_grad = []
            for grad, var in grads:
                # get around of master policy gradients
                if grad is None:
                    continue
                self.summary.append(tf.summary.histogram(var.op.name, var))
                self.summary.append(
                    tf.summary.histogram(var.op.name + '/grad', grad))
                grad = tf.clip_by_norm(grad, 10.0)
                cliped_grad.append([grad, var])
            self.train_op = opt.apply_gradients(cliped_grad)
            self.summary_op = tf.summary.merge(self.summary)

            self.saver = tf.train.Saver(max_to_keep=100)
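The master policy above is fit to a target fed through y_input; below is a minimal numpy sketch of the usual Q-learning target. The rewards, next-state subpolicy_Q values, and terminal flags are assumed to come from a replay buffer, and the 0.99 discount is an illustration value:

import numpy as np

def q_targets(rewards, next_q_values, terminals, gamma=0.99):
  # y = r for terminal transitions, otherwise y = r + gamma * max_a Q(s', a)
  next_max = np.max(next_q_values, axis=1)
  return rewards + gamma * next_max * (1.0 - terminals)

rewards = np.array([0.0, 1.0], dtype=np.float32)
next_q = np.array([[0.1, 0.4, 0.2], [0.0, 0.0, 0.0]], dtype=np.float32)
terminals = np.array([0.0, 1.0], dtype=np.float32)
print(q_targets(rewards, next_q, terminals))  # -> [0.396, 1.0]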
Example #7
    def build_model(self, reuse, dev, ntype):
        with tf.variable_scope(self.name), tf.device(dev):
            if reuse:
                # e.g. with 4 training threads, reuse is False only for the first build_model call and True for all others (see line 124 of the main file)
                tf.get_variable_scope().reuse_variables()
                assert tf.get_variable_scope().reuse

            # Set inputs of networks: the three placeholders below
            self.minimap = tf.placeholder(
                tf.float32,
                [None, U.minimap_channel(), self.msize, self.msize],
                name='minimap')
            self.screen = tf.placeholder(
                tf.float32,
                [None, U.screen_channel(), self.ssize, self.ssize],
                name='screen')
            # TODO:
            self.info = tf.placeholder(
                tf.float32, [None, self.isize + self.info_plus_size],
                name='info')

            self.dir_high_usedToFeedLowNet = tf.placeholder(
                tf.float32, [1, 1], name='dir_high_usedToFeedLowNet')
            self.act_id = tf.placeholder(tf.float32, [1, 1], name='act_id')

            # Build networks
            # net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)  # build_net is imported from network.py
            # self.spatial_action, self.non_spatial_action, self.value = net  # i.e. the three networks built in build_model all share the same structure

            # DHN add:
            self.dir_high, self.value_high, self.a_params_high, self.c_params_high = build_high_net(
                self.minimap, self.screen, self.info, num_macro_action)
            self.spatial_action_low, self.value_low, self.a_params_low, self.c_params_low = build_low_net(
                self.minimap, self.screen, self.info,
                self.dir_high_usedToFeedLowNet, self.act_id)

            # Set targets and masks
            # self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
            # self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
            # self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
            # self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
            # self.value_target = tf.placeholder(tf.float32, [None], name='value_target')   # value_target is the realized return, computed externally and fed in (line 219), same as in Morvan's A3C (lines 56 and 154 there)

            #DHN add:
            self.valid_spatial_action_low = tf.placeholder(
                tf.float32, [None], name='valid_spatial_action_low')
            self.spatial_action_selected_low = tf.placeholder(
                tf.float32, [None, self.ssize**2],
                name='spatial_action_selected_low')
            self.value_target_low = tf.placeholder(tf.float32, [None],
                                                   name='value_target_low')
            self.value_target_high = tf.placeholder(tf.float32, [None],
                                                    name='value_target_high')
            self.dir_high_selected = tf.placeholder(tf.float32,
                                                    [None, num_macro_action],
                                                    name='dir_high_selected')

            # Compute log probability
            spatial_action_prob_low = tf.reduce_sum(
                self.spatial_action_low * self.spatial_action_selected_low,
                axis=1)  # see Matrix_dot-multiply.py for how this elementwise product works
            # spatial_action_low is the network's coordinate output, shape (num steps in the update) x (ssize**2)
            # spatial_action_selected_low marks whether each step needed a coordinate argument (dim 0) and which coordinate it was (dim 1); same shape, (num steps in the update) x (ssize**2)
            spatial_action_log_prob_low = tf.log(
                tf.clip_by_value(spatial_action_prob_low, 1e-10,
                                 1.))  # shape: (num steps in the update,)

            dir_prob_high = tf.reduce_sum(self.dir_high *
                                          self.dir_high_selected,
                                          axis=1)
            dir_log_prob_high = tf.log(
                tf.clip_by_value(dir_prob_high, 1e-10, 1.))

            # non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
            # valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
            # valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
            # non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
            # non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))

            self.summary_low.append(
                tf.summary.histogram('spatial_action_prob_low',
                                     spatial_action_prob_low))
            self.summary_high.append(
                tf.summary.histogram('dir_prob_high', dir_prob_high))
            # self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

            # Compute losses, more details in https://arxiv.org/abs/1602.01783

            # Compute the actor and critic losses (mainly following Morvan's discrete-action A3C code)
            # Low-level network:
            td_low = tf.subtract(self.value_target_low,
                                 self.value_low,
                                 name='TD_error_low')
            self.c_loss_low = tf.reduce_mean(tf.square(td_low))

            log_prob_low = self.valid_spatial_action_low * spatial_action_log_prob_low  # valid_spatial_action_low marks whether each step needed a coordinate argument; shape (num steps in the update,)
            self.exp_v_low = log_prob_low * tf.stop_gradient(td_low)
            self.a_loss_low = -tf.reduce_mean(
                self.exp_v_low
            )  # no extra exploration bonus is added here (unlike Morvan's code); step_low already handles exploration

            # High-level network:
            td_high = tf.subtract(self.value_target_high,
                                  self.value_high,
                                  name='TD_error_high')
            self.c_loss_high = tf.reduce_mean(tf.square(td_high))

            self.exp_v_high = dir_log_prob_high * tf.stop_gradient(td_high)
            self.a_loss_high = -tf.reduce_mean(
                self.exp_v_high
            )  # no epsilon-greedy exploration is added here (as in Morvan's code); step_low already handles exploration

            # Add summaries:
            self.summary_low.append(
                tf.summary.scalar('a_loss_low', self.a_loss_low))
            self.summary_low.append(
                tf.summary.scalar('c_loss_low', self.c_loss_low))
            self.summary_high.append(
                tf.summary.scalar('a_loss_high', self.a_loss_high))
            self.summary_high.append(
                tf.summary.scalar('c_loss_high', self.c_loss_high))

            # TODO: policy penalty
            # loss = policy_loss + value_loss
            # Build the optimizer
            # self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
            # opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
            # grads = opt.compute_gradients(loss)
            # cliped_grad = []
            # for grad, var in grads:
            #   self.summary.append(tf.summary.histogram(var.op.name, var))
            #   self.summary.append(tf.summary.histogram(var.op.name+'/grad', grad))
            #   grad = tf.clip_by_norm(grad, 10.0)
            #   cliped_grad.append([grad, var])
            # self.train_op = opt.apply_gradients(cliped_grad)

            # Apply the gradient updates (mainly following Morvan's continuous-action A3C code)
            # Low-level network:
            self.learning_rate_a_low = tf.placeholder(
                tf.float32, None, name='learning_rate_a_low')
            opt_a_low = tf.train.RMSPropOptimizer(self.learning_rate_a_low,
                                                  decay=0.99,
                                                  epsilon=1e-10)
            self.a_grads_low = tf.gradients(self.a_loss_low, self.a_params_low)
            self.update_a_low = opt_a_low.apply_gradients(
                zip(self.a_grads_low, self.a_params_low))

            self.learning_rate_c_low = tf.placeholder(
                tf.float32, None, name='learning_rate_c_low')
            opt_c_low = tf.train.RMSPropOptimizer(self.learning_rate_c_low,
                                                  decay=0.99,
                                                  epsilon=1e-10)
            self.c_grads_low = tf.gradients(self.c_loss_low, self.c_params_low)
            self.update_c_low = opt_c_low.apply_gradients(
                zip(self.c_grads_low, self.c_params_low))

            # High-level network:
            self.learning_rate_a_high = tf.placeholder(
                tf.float32, None, name='learning_rate_a_high')
            opt_a_high = tf.train.RMSPropOptimizer(self.learning_rate_a_high,
                                                   decay=0.99,
                                                   epsilon=1e-10)
            self.a_grads_high = tf.gradients(self.a_loss_high,
                                             self.a_params_high)
            self.update_a_high = opt_a_high.apply_gradients(
                zip(self.a_grads_high, self.a_params_high))

            self.learning_rate_c_high = tf.placeholder(
                tf.float32, None, name='learning_rate_c_high')
            opt_c_high = tf.train.RMSPropOptimizer(self.learning_rate_c_high,
                                                   decay=0.99,
                                                   epsilon=1e-10)
            self.c_grads_high = tf.gradients(self.c_loss_high,
                                             self.c_params_high)
            self.update_c_high = opt_c_high.apply_gradients(
                zip(self.c_grads_high, self.c_params_high))

            self.summary_op_low = tf.summary.merge(self.summary_low)
            self.summary_op_high = tf.summary.merge(self.summary_high)
            self.saver = tf.train.Saver(
                max_to_keep=100
            )  # self.saver is the tf Saver used in save_model and load_model
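As the comments above note, value_target_low and value_target_high are computed outside the graph and fed in. A minimal sketch of the usual n-step discounted return, assuming a list of per-step rewards and a bootstrap value estimated for the state after the last step:

import numpy as np

def discounted_returns(rewards, bootstrap_value, gamma=0.99):
  # Walk the rollout backwards: R_t = r_t + gamma * R_{t+1}, seeded with the bootstrap value.
  returns = np.zeros(len(rewards), dtype=np.float32)
  running = bootstrap_value
  for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running
  return returns

print(discounted_returns([0.0, 0.0, 1.0], bootstrap_value=0.5))  # -> [~1.465, ~1.480, 1.495]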
Example #8
    def build_model(self, reuse, device):
        with tf.variable_scope(self.name), tf.device(device):
            if reuse:
                tf.get_variable_scope().reuse_variables()

            # placeholder for inputs of network
            self.screen_ph = tf.placeholder(tf.float32, [
                None,
                U.screen_channel(), self.screen_dimensions,
                self.screen_dimensions
            ],
                                            name='screen')
            self.minimap_ph = tf.placeholder(tf.float32, [
                None,
                U.minimap_channel(), self.minimap_dimensions,
                self.minimap_dimensions
            ],
                                             name='minimap')
            self.structured_ph = tf.placeholder(
                tf.float32, [None, self.structured_dimensions],
                name='structured')

            # build network
            network = build_network(self.structured_ph, self.screen_ph,
                                    self.minimap_ph, len(actions.FUNCTIONS))
            self.non_spatial_action, self.spatial_action, self.value = network

            # placeholder for targets and masks
            self.valid_non_spatial_action_ph = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='valid_non_spatial_action')
            self.sample_non_spatial_action_ph = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='sample_non_spatial_action')
            self.valid_spatial_action_ph = tf.placeholder(
                tf.float32, [None], name='valid_spatial_action')
            self.sample_spatial_action_ph = tf.placeholder(
                tf.float32, [None, self.minimap_dimensions**2],
                name='sample_spatial_action')
            self.target_value_ph = tf.placeholder(tf.float32, [None],
                                                  name='target_value')

            # compute log probability
            valid_non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.valid_non_spatial_action_ph,
                axis=1)
            valid_non_spatial_action_prob = tf.clip_by_value(
                valid_non_spatial_action_prob, 1e-10, 1.)
            non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.sample_non_spatial_action_ph,
                axis=1)
            non_spatial_action_prob /= valid_non_spatial_action_prob
            non_spatial_action_log_prob = tf.log(
                tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
            spatial_action_prob = tf.reduce_sum(self.spatial_action *
                                                self.sample_spatial_action_ph,
                                                axis=1)
            spatial_action_log_prob = tf.log(
                tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
            self.summary.append(
                tf.summary.histogram('non_spatial_action_prob',
                                     non_spatial_action_prob))
            self.summary.append(
                tf.summary.histogram('spatial_action_prob',
                                     spatial_action_prob))

            # compute loss
            action_log_prob = self.valid_spatial_action_ph * spatial_action_log_prob + non_spatial_action_log_prob
            advantage = tf.stop_gradient(self.target_value_ph - self.value)
            policy_loss = -tf.reduce_mean(action_log_prob * advantage)
            value_loss = -tf.reduce_mean(self.value * advantage)
            loss = policy_loss + value_loss
            self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
            self.summary.append(tf.summary.scalar('value_loss', value_loss))

            # optimizer
            self.learning_rate_ph = tf.placeholder(tf.float32,
                                                   None,
                                                   name='learning_rate')
            optimizer = tf.train.RMSPropOptimizer(self.learning_rate_ph,
                                                  decay=0.99,
                                                  epsilon=1e-10)
            grads = optimizer.compute_gradients(loss)
            clipped_grads = []
            for grad, var in grads:
                self.summary.append(tf.summary.histogram(var.op.name, var))
                self.summary.append(
                    tf.summary.histogram(var.op.name + '/grad', grad))
                grad = tf.clip_by_norm(grad, 10.0)
                clipped_grads.append([grad, var])
            self.train_op = optimizer.apply_gradients(clipped_grads)
            self.summary_op = tf.summary.merge(self.summary)

            self.saver = tf.train.Saver(max_to_keep=None)
Example #9
    def build_model(self):
        """
        define evaluation net, target net
        define optimizer for evaluation net
        """

        print("building model...")
        # ---------------------------evaluation net for spatial, non-spatial---------------------------
        # cnn input features
        self.screen = tf.placeholder(tf.float32, [None, screen_channel(), self.ssize, self.ssize], name='screen')
        self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')
        # build eval net for spatial, non-spatial and return q_eval scope name = eval_net, collection name = eval...
        with tf.variable_scope('eval_net'):
            # c_name = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.spatial_action, self.non_spatial_action, self.q_eval = self.build_network()
        # self.spatial_action, self.non_spatial_action, self.q_eval = self.build_network()

        # target value
        self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize ** 2], name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)],name= 'valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
        self.q_target = tf.placeholder(tf.float32, [None], name='q_target')

        # A3C loss calculation in deepmind paper
        # action log probability
        spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
        spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))

        non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))

        # self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
        # self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

        # compute loss with gradient clipping
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.q_target - self.q_eval)
        policy_loss = - tf.reduce_mean(action_log_prob * advantage)
        value_loss = - tf.reduce_mean(self.q_eval * advantage)

        loss = policy_loss + value_loss

        # Build the optimizer
        opt = tf.train.RMSPropOptimizer(learning_rate=self.lr, decay=0.99, epsilon=1e-10)
        grads = opt.compute_gradients(loss)
        cliped_grad = []
        for grad, var in grads:
            # self.summary.append(tf.summary.histogram(var.op.name, var))
            # self.summary.append(tf.summary.histogram(var.op.name + '/grad', grad))
            grad = tf.clip_by_norm(grad, 10.0)
            cliped_grad.append([grad, var])
        self.train_op = opt.apply_gradients(cliped_grad)
        # self.summary_op = tf.summary.merge(self.summary)

        # # dueling net optimizer method
        # self.q_target = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='q_target')
        # with tf.variable_scope('loss'):
        #     self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        # with tf.variable_scope('train'):
        #     self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ---------------------------target net for spatial, non-spatial---------------------------

        with tf.variable_scope('target_net'):
            _, _, self.q_next = self.build_network()

        pass
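The eval_net / target_net split above is normally paired with an op that periodically copies the evaluation weights into the target network. A sketch under the assumption that both networks were created directly under the variable scopes shown ('eval_net' and 'target_net'); if build_model itself runs inside another scope, the scope strings need that prefix:

import tensorflow as tf

eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')

# Pair variables by creation order and overwrite the target weights with the eval weights.
replace_target_op = [tf.assign(t, e) for t, e in zip(target_params, eval_params)]

# Then run sess.run(replace_target_op) every N training steps to refresh q_next.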
Example #10
    def build_net(self, dev):
        with tf.variable_scope('a3c'), tf.device(dev):
            screenInput = Input(
                shape=(U.screen_channel(), self.ssize, self.ssize),
                name='screenInput',
            )

            permutedScreenInput = Permute((2,3,1))(screenInput)
            conv1 = Conv2D(16, kernel_size=5, strides=(1,1), padding='same',name='conv1')(permutedScreenInput)
            conv2 = Conv2D(32, kernel_size=3, strides=(1,1), padding='same',name='conv2')(conv1)

            infoInput = Input(
                shape=(self.isize,),
                name='infoInput',
            )

            customInput = Input(
                shape=(self.custom_input_size,),
                name='customInput',
            )

            nonSpatialInput = Concatenate(name='nonSpatialInputConcat')([infoInput, customInput])

            broadcasted = Lambda(self.broadcast,name='broadcasting')(nonSpatialInput)

            combinedSpatialNonSpatial = Concatenate(name='combinedConcat')([broadcasted, conv2])

            conv3 = Conv2D(1, kernel_size=1, strides=(1,1), padding='same',name='conv3')(combinedSpatialNonSpatial)

            flatConv3 = Flatten(name='flatConv3')(conv3)

            lstmInput = Lambda(self.expand_dims, name='lstmInput')(flatConv3)

            self.NUM_LSTM = 100

            hStateInput = Input(
                shape=(self.NUM_LSTM,),
                name='hStateInput'
            )

            cStateInput = Input(
                shape=(self.NUM_LSTM,),
                name='cStateInput'
            )

            lstm, hStates, cStates = LSTM(self.NUM_LSTM, return_state=True)(lstmInput, initial_state=[hStateInput, cStateInput])

            fc1 = Dense(256, activation='relu',name='dense1')(lstm)
            fc2 = Dense(1, activation='linear',name='fc2')(fc1)
            value = Lambda(self.Squeeze,name='value')(fc2)
            policy = Dense(self.isize, activation='softmax',name='policy')(fc1)


            broadcastLstm = Lambda(self.broadcast, name='broadcastLstm')(lstm)

            spatialLstm = Concatenate(name='spatialLstm')([conv3, broadcastLstm])

            conv4 = Conv2D(1,kernel_size=1, strides=(1,1), padding='same',name='conv4')(spatialLstm)
            flatConv4 = Flatten(name='flattenedConv4')(conv4)
            spatialPolicy = Softmax(name='spatialPolicy')(flatConv4)

            conv5 = Conv2D(1, kernel_size=1, strides=(1,1), padding='same',name='conv5')(spatialLstm)
            flatConv5 = Flatten(name='flattenedConv5')(conv5)
            bestRoach = Softmax(name='bestRoach')(flatConv5)

            self.model = Model(
                inputs=[screenInput, infoInput, customInput, hStateInput, cStateInput],
                outputs=[value, policy, spatialPolicy, hStates, cStates, bestRoach]
            )
            self.model._make_predict_function()
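A hedged usage sketch for the Keras model built above: the first call uses zero LSTM states and later calls feed back the returned states. The channel count, isize, and custom_input_size values are assumptions for illustration only, and agent stands for an instance whose build_net has already run:

import numpy as np

screen = np.zeros((1, 17, 64, 64), dtype=np.float32)   # assumed U.screen_channel() == 17, ssize == 64
info = np.zeros((1, 11), dtype=np.float32)             # assumed isize == 11
custom = np.zeros((1, 4), dtype=np.float32)            # assumed custom_input_size == 4
h_state = np.zeros((1, 100), dtype=np.float32)         # NUM_LSTM == 100, as set in build_net
c_state = np.zeros((1, 100), dtype=np.float32)

value, policy, spatial_policy, h_state, c_state, best_roach = agent.model.predict(
    [screen, info, custom, h_state, c_state])  # reuse h_state / c_state on the next step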
Example #11
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import tensorflow.contrib.layers as layers
import utils as U

screen_filters1 = tf.get_variable(name='sf1',
                                  shape=(5, 5, U.screen_channel(), 16))  # hwio
screen_filters2 = tf.get_variable(name='sf2', shape=(3, 3, 16, 32))
minimap_filters1 = tf.get_variable(name='mmf1',
                                   shape=(5, 5, U.minimap_channel(), 16))
minimap_filters2 = tf.get_variable(name='mmf2', shape=(3, 3, 16, 32))


def build_net(minimap, screen, info, msize, ssize, num_action):
    # Extract features

    mconv1 = tf.nn.conv2d(tf.transpose(minimap, [0, 2, 3, 1]),
                          minimap_filters1,
                          strides=[1, 1, 1, 1],
                          padding='SAME',
                          name='mconv1')
    mconv2 = tf.nn.conv2d(mconv1,
                          minimap_filters2,
                          strides=[1, 1, 1, 1],
                          padding='SAME',
                          name='mconv2')
    sconv1 = tf.nn.conv2d(tf.transpose(screen, [0, 2, 3, 1]),
                          screen_filters1,
Example #12
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import tensorflow.contrib.layers as layers
import utils as U


screen_filters1 = tf.get_variable(name='screen_f1', shape=(5, 5, U.screen_channel(), 16))  # hwio
screen_filters2 = tf.get_variable(name='screen_f2', shape=(3, 3, 16, 32))  
minimap_filters1 = tf.get_variable(name='minimap_f1', shape=(5, 5, U.minimap_channel(), 16))
minimap_filters2 = tf.get_variable(name='minimap_f2', shape=(3, 3, 16, 32))


def build_net(minimap, screen, info, msize, ssize, num_action):
  # Extract features


  mconv1 = tf.nn.conv2d(tf.transpose(minimap, [0, 2, 3, 1]), minimap_filters1, strides=[1, 1, 1, 1], padding='SAME', name='mconv1')
  mconv2 = tf.nn.conv2d(mconv1, minimap_filters2, strides=[1, 1, 1, 1], padding='SAME', name='mconv2')
  sconv1 = tf.nn.conv2d(tf.transpose(screen, [0, 2, 3, 1]), screen_filters1, strides=[1, 1, 1, 1], padding='SAME', name='sconv1')
  sconv2 = tf.nn.conv2d(sconv1, screen_filters2, strides=[1, 1, 1, 1], padding='SAME', name='sconv2')
  info_fc = layers.fully_connected(layers.flatten(info), num_outputs=256, activation_fn=tf.tanh, scope='info_fc')

  # Compute spatial actions
  feat_conv = tf.concat([mconv2, sconv2], axis=3)

  spatial_weights = tf.get_variable(name='spatial_weights', shape=(1, 1, feat_conv.get_shape()[-1], 1))
  spatial_action = tf.nn.conv2d(feat_conv, spatial_weights, strides=[1, 1, 1 ,1], padding='SAME', name='spatial_action')
  spatial_action = tf.nn.softmax(layers.flatten(spatial_action))
    def build_model(self, reuse, dev, ntype):
        """
    Build the TensorFlow model for the MLSH agent on this thread
    - valid_spatial_action:            shape (len(rbs),) 
      = whether agent took a spatial action or not at each step of replay buffer
    - spatial_action_selected:         shape (len(rbs), screensize**2)
      = one-hot encoding of (x,y) argts of action at each step replay buffer
    - valid_non_spatial_action:        shape (len(rbs), len(actions.FUNCTIONS))
      = one-hot encoding of available actions at each step of replay buffer
    - non_spatial_action_selected:     shape (len(rbs), len(actions.FUNCTIONS))
      = one-hot encoding of the action taken at each step of replay buffer
    """
        with tf.variable_scope(self.name), tf.device(dev):

            # Set inputs of networks
            self.minimap = tf.placeholder(
                tf.float32,
                [None, U.minimap_channel(), self.msize, self.msize],
                name='minimap')
            self.screen = tf.placeholder(
                tf.float32,
                [None, U.screen_channel(), self.ssize, self.ssize],
                name='screen')
            self.info = tf.placeholder(tf.float32, [None, self.isize],
                                       name='info')

            # Build networks
            net = build_net(self.minimap, self.screen,
                            self.info, self.msize, self.ssize,
                            len(actions.FUNCTIONS), ntype, self.num_subpol,
                            reuse, self.num_thread)
            self.spatial_actions, self.non_spatial_actions, self.value, self.master_value, self.subpol_choice, self.master_vars = net

            # Create training operation for the subpolicies:
            # Set targets and masks
            self.valid_spatial_action = tf.placeholder(
                tf.float32, [None], name='valid_spatial_action')
            self.spatial_action_selected = tf.placeholder(
                tf.float32, [None, self.ssize**2],
                name='spatial_action_selected')
            self.valid_non_spatial_action = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='valid_non_spatial_action')
            self.non_spatial_action_selected = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='non_spatial_action_selected')
            self.value_target = tf.placeholder(tf.float32, [None],
                                               name='value_target')

            self.subpol_train_ops = []

            # Variables kept by training operations (such as gradients) are given scope 'train_vars'
            with tf.variable_scope('train_vars'):

                # Build the optimizer
                self.learning_rate = tf.placeholder(tf.float32,
                                                    None,
                                                    name='learning_rate')
                opt = tf.train.RMSPropOptimizer(self.learning_rate,
                                                decay=0.99,
                                                epsilon=1e-10)

                for pol_id in range(self.num_subpol):
                    self.build_subpolicy(opt, pol_id, reuse)

                # Create training operation for the master policy:
                self.build_master_policy(opt)

                # Log scores and decisions to tensorboard:
                self.summary_op = tf.summary.merge(self.summary)
                self.subpol_summary_op = tf.summary.merge(self.subpol_summary)

                self.train_score = tf.placeholder(tf.float32,
                                                  name='train_score')
                self.train_score_summary_op = tf.summary.scalar(
                    'train_score_thread_' + str(self.num_subpol),
                    self.train_score)

                self.test_score = tf.placeholder(tf.float32, name='test_score')
                self.test_score_summary_op = tf.summary.scalar(
                    'test_score_thread_' + str(self.num_subpol),
                    self.test_score)

                self.ep_subpol_choices_ph = tf.placeholder(
                    tf.float32, [None], name='subpol_choices')
                self.ep_subpol_choices_op = tf.summary.histogram(
                    'subpol_choices', self.ep_subpol_choices_ph)

                self.saver = tf.train.Saver(max_to_keep=100,
                                            keep_checkpoint_every_n_hours=1)
                                   mode="w")
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

isize = 11
msize = 64
ssize = 64

score = tf.placeholder(tf.int32, [], name='score')
minimap = tf.placeholder(tf.float32,
                         [None, U.minimap_channel(), msize, msize],
                         name='minimap')  # 17, 64, 64
screen = tf.placeholder(tf.float32,
                        [None, U.screen_channel(), ssize, ssize],
                        name='screen')
info = tf.placeholder(tf.float32, [None, isize], name='info')

# minimap_placeholder = tf.placeholder(tf.float32, [None, 64, 64, 5])
# screen_placeholder = tf.placeholder(tf.float32, [None, 64, 64, 10])
# user_info_placeholder = tf.placeholder(tf.float32, [None, isize])
action_output = tf.placeholder(tf.float32, [None, 543])  # one hot

# set up network
screen_filters1 = tf.get_variable(name='sf1',
                                  shape=(5, 5, U.screen_channel(), 16))  # hwio
screen_filters2 = tf.get_variable(name='sf2', shape=(3, 3, 16, 32))
minimap_filters1 = tf.get_variable(name='mmf1',
                                   shape=(5, 5, U.minimap_channel(), 16))
minimap_filters2 = tf.get_variable(name='mmf2', shape=(3, 3, 16, 32))
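The action_output placeholder above expects one-hot rows of length 543 (presumably len(actions.FUNCTIONS) in the pysc2 version used). A small numpy sketch of building such a batch from recorded function ids; the ids in the example call are arbitrary:

import numpy as np

def one_hot_actions(function_ids, num_functions=543):
  # One row per recorded step, with a 1 at the taken function id.
  batch = np.zeros((len(function_ids), num_functions), dtype=np.float32)
  batch[np.arange(len(function_ids)), function_ids] = 1.0
  return batch

print(one_hot_actions([0, 331, 7]).shape)  # (3, 543)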
Example #15
    def build_model(self, reuse, dev, ntype):
        with tf.variable_scope(self.name), tf.device(dev):
            if reuse:
                tf.get_variable_scope().reuse_variables()
                assert tf.get_variable_scope().reuse

            # Set inputs of networks
            self.minimap = tf.placeholder(
                tf.float32,
                [None, U.minimap_channel(), self.msize, self.msize],
                name='minimap')
            self.screen = tf.placeholder(
                tf.float32,
                [None, U.screen_channel(), self.ssize, self.ssize],
                name='screen')
            self.info = tf.placeholder(tf.float32, [None, self.isize],
                                       name='info')

            # Build a3c base networks
            net = build_net(self.minimap,
                            self.screen,
                            self.info,
                            self.msize,
                            self.ssize,
                            len(actions.FUNCTIONS),
                            ntype,
                            reuse=False)
            self.spatial_action, self.non_spatial_action, self.value = net

            # Set targets and masks
            self.valid_spatial_action = tf.placeholder(
                tf.float32, [None], name='valid_spatial_action')
            self.spatial_action_selected = tf.placeholder(
                tf.float32, [None, self.ssize**2],
                name='spatial_action_selected')
            self.valid_non_spatial_action = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='valid_non_spatial_action')
            self.non_spatial_action_selected = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='non_spatial_action_selected')
            self.value_target = tf.placeholder(tf.float32, [None],
                                               name='value_target')

            # Compute log probability
            spatial_action_prob = tf.reduce_sum(self.spatial_action *
                                                self.spatial_action_selected,
                                                axis=1)
            spatial_action_log_prob = tf.log(
                tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
            non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.non_spatial_action_selected,
                axis=1)
            valid_non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.valid_non_spatial_action,
                axis=1)
            valid_non_spatial_action_prob = tf.clip_by_value(
                valid_non_spatial_action_prob, 1e-10, 1.)
            non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
            non_spatial_action_log_prob = tf.log(
                tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
            self.summary.append(
                tf.summary.histogram('spatial_action_prob',
                                     spatial_action_prob))
            self.summary.append(
                tf.summary.histogram('non_spatial_action_prob',
                                     non_spatial_action_prob))

            # Compute a3closses, more details in https://arxiv.org/abs/1602.01783
            # Policy loss and value loss
            action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
            advantage = tf.stop_gradient(self.value_target - self.value)
            policy_loss = -tf.reduce_mean(action_log_prob * advantage)
            value_loss = -tf.reduce_mean(self.value * advantage)
            self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
            self.summary.append(tf.summary.scalar('value_loss', value_loss))

            # TODO: policy penalty
            a3c_loss = policy_loss + value_loss

            #pc_part_start
            self.pc_minimap = tf.placeholder(
                tf.float32,
                [None, U.minimap_channel(), self.msize, self.msize],
                name='pc_minimap')
            self.pc_screen = tf.placeholder(
                tf.float32,
                [None, U.screen_channel(), self.ssize, self.ssize],
                name='pc_screen')
            self.pc_info = tf.placeholder(tf.float32, [None, self.isize],
                                          name='info')
            self.pc_valid_non_spatial_action = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='pc_valid_non_spatial_action')

            pc_net = build_pc_net(self.pc_minimap, self.pc_screen,
                                  self.pc_info, self.msize, self.ssize,
                                  len(actions.FUNCTIONS),
                                  self.pc_valid_non_spatial_action)
            pc_q, pc_q_max = pc_net
            self.pc_a = tf.placeholder("float", [None, len(actions.FUNCTIONS)])
            pc_a_reshaped = tf.reshape(
                self.pc_a, [-1, 1, 1, len(actions.FUNCTIONS)])

            # Extract Q for the taken action
            pc_qa_ = tf.multiply(pc_q, pc_a_reshaped)
            pc_qa = tf.reduce_sum(pc_qa_, reduction_indices=3, keep_dims=False)
            # (-1, 20, 20)

            # TD target for Q
            self.pc_r = tf.placeholder("float", [None, 20, 20])
            pc_loss = self._pixel_change_lambda * tf.nn.l2_loss(self.pc_r -
                                                                pc_qa)

            # Build the optimizer
            loss = pc_loss + a3c_loss
            self.learning_rate = tf.placeholder(tf.float32,
                                                None,
                                                name='learning_rate')
            opt = tf.train.RMSPropOptimizer(self.learning_rate,
                                            decay=0.99,
                                            epsilon=1e-10)
            grads = opt.compute_gradients(loss)
            cliped_grad = []
            for grad, var in grads:
                self.summary.append(tf.summary.histogram(var.op.name, var))
                self.summary.append(
                    tf.summary.histogram(var.op.name + '/grad', grad))
                grad = tf.clip_by_norm(grad, 10.0)
                cliped_grad.append([grad, var])
            self.train_op = opt.apply_gradients(cliped_grad)
            self.summary_op = tf.summary.merge(self.summary)

            self.saver = tf.train.Saver(max_to_keep=100)
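pc_r above is a [None, 20, 20] map of pixel-change returns in the spirit of the UNREAL pixel-control auxiliary task. A hedged numpy sketch of one way such a target map can be computed from consecutive observations; the 80x80 crop and 4x4 cell size are assumptions chosen so the output is 20x20, not values taken from this example:

import numpy as np

def pixel_change(obs, next_obs, cell=4):
  # Mean absolute difference per cell: (H, W, C) frames -> (H // cell, W // cell) reward map.
  diff = np.abs(next_obs.astype(np.float32) - obs.astype(np.float32)).mean(axis=-1)
  h, w = diff.shape
  return diff.reshape(h // cell, cell, w // cell, cell).mean(axis=(1, 3))

obs = np.zeros((80, 80, 3))
next_obs = np.ones((80, 80, 3))
print(pixel_change(obs, next_obs).shape)  # (20, 20)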
Example #16
    def build(self, reuse, dev):  # changing this around for now. need a2c to work first
        with tf.variable_scope(self.name), tf.device(dev):  # A3C/A3CAgent/var_name
            if reuse:
                tf.get_variable_scope().reuse_variables()
                assert tf.get_variable_scope().reuse

            # Set inputs of networks
            self.score = tf.placeholder(tf.int32, [], name='score')
            self.minimap = tf.placeholder(
                tf.float32,
                [None, U.minimap_channel(), self.msize, self.msize],
                name='minimap')
            self.screen = tf.placeholder(
                tf.float32,
                [None, U.screen_channel(), self.ssize, self.ssize],
                name='screen')
            self.info = tf.placeholder(tf.float32, [None, self.isize],
                                       name='info')

            # Build networks
            net = networks.build_net(self.minimap, self.screen, self.info,
                                     self.msize, self.ssize,
                                     len(actions.FUNCTIONS))
            # Unpack the policy heads, the shared state representation and the value head
            self.spatial_policy, self.non_spatial_policy, self.state_representation, self.value = net

            # Set targets and masks
            self.valid_spatial_action = tf.placeholder(
                tf.float32, [None], name='valid_spatial_action')
            self.spatial_action_selected = tf.placeholder(
                tf.float32, [None, self.ssize**2],
                name='spatial_action_selected')
            self.valid_non_spatial_action = tf.placeholder(
                tf.float32, [None, self.isize],
                name='valid_non_spatial_action'
            )  # self.isize is assumed to match the number of available actions
            self.non_spatial_action_selected = tf.placeholder(
                tf.float32, [None, self.isize],
                name='non_spatial_action_selected')
            self.value_target = tf.placeholder(tf.float32, [None],
                                               name='value_target')

            # Compute log probabilities of the selected spatial and non-spatial actions
            spatial_action_prob = tf.reduce_sum(self.spatial_policy *
                                                self.spatial_action_selected,
                                                axis=1)
            spatial_action_log_prob = tf.log(
                tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
            non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_policy * self.non_spatial_action_selected,
                axis=1)
            valid_non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_policy * self.valid_non_spatial_action,
                axis=1)
            valid_non_spatial_action_prob = tf.clip_by_value(
                valid_non_spatial_action_prob, 1e-10, 1.)
            non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
            non_spatial_action_log_prob = tf.log(
                tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
            self.summary.append(
                tf.summary.histogram('spatial_action_prob',
                                     spatial_action_prob))
            self.summary.append(
                tf.summary.histogram('non_spatial_action_prob',
                                     non_spatial_action_prob))
            self.summary.append(
                tf.summary.histogram('spatial_action_log_prob',
                                     spatial_action_log_prob))
            self.summary.append(
                tf.summary.histogram('non_spatial_action_log_prob',
                                     non_spatial_action_log_prob))
            # self.logger.info(f"non_spatial_action_log_prob: {non_spatial_action_log_prob}")
            # self.logger.info(f"spatial_action_selected: {self.spatial_action_selected}")  # how does spatial_action_selected look, probs?

            # Compute losses, more details in https://arxiv.org/abs/1602.01783
            # Policy loss and value loss
            action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
            advantage = tf.stop_gradient(self.value_target - self.value)
            policy_loss = -tf.reduce_mean(action_log_prob * advantage)
            value_loss = tf.reduce_mean(self.value * advantage)
            # entropy_loss = tf.reduce_mean(entropy) * self.entropy_regularisation
            # Negative entropy of the non-spatial policy; clip to avoid log(0)
            entropy_loss = tf.reduce_sum(
                self.non_spatial_policy *
                tf.log(tf.clip_by_value(self.non_spatial_policy, 1e-10, 1.)),
                name='entropy')
            self.summary.append(tf.summary.scalar("entropy_loss",
                                                  entropy_loss))

            self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
            self.summary.append(tf.summary.scalar('value_loss', value_loss))

            # TODO: policy penalty/entropy
            # loss = policy_loss + value_loss
            total_loss = (policy_loss
                          + value_loss * self.value_regularisation
                          - entropy_loss * self.entropy_regularisation)
            self.summary.append(tf.summary.scalar("total_loss", total_loss))
            # Build the optimizer
            self.learning_rate = tf.placeholder(tf.float32,
                                                None,
                                                name='learning_rate')
            opt = tf.train.RMSPropOptimizer(self.learning_rate,
                                            decay=0.99,
                                            epsilon=1e-10)
            grads = opt.compute_gradients(total_loss)
            clipped_grads = []
            for grad, var in grads:
                self.summary.append(tf.summary.histogram(var.op.name, var))
                self.summary.append(
                    tf.summary.histogram(var.op.name + '/grad', grad))
                # Clip each gradient to a max norm of 10 (a common but somewhat
                # arbitrary choice) before applying
                grad = tf.clip_by_norm(grad, 10.0)
                clipped_grads.append([grad, var])
            self.train_op = opt.apply_gradients(clipped_grads)
            self.summary_op = tf.summary.merge(self.summary)

            self.saver = tf.train.Saver(max_to_keep=10)

            self.argument_policy = dict()
            self.arguments = dict()
            for arg_type in actions.TYPES:
                # For spatial arguments, model each coordinate dimension with its
                # own independent softmax head.
                # (Open idea: use fewer output units and fall back to an
                # epsilon-greedy-style scheme, similar to the current exploration.)
                if len(arg_type.sizes) > 1:  # spatial argument (x, y)
                    if arg_type in SCREEN_TYPES:
                        units = self.ssize
                    elif arg_type in MINIMAP_TYPES:
                        units = self.msize

                    arg_policy_x = layers.fully_connected(
                        self.state_representation,
                        num_outputs=units,
                        activation_fn=tf.nn.softmax)

                    arg_policy_y = layers.fully_connected(
                        self.state_representation,
                        num_outputs=units,
                        activation_fn=tf.nn.softmax)

                    self.argument_policy[str(arg_type) + "x"] = arg_policy_x
                    self.argument_policy[str(arg_type) + "y"] = arg_policy_y

                    arg_placeholder_x = tf.placeholder(tf.float32,
                                                       shape=[None, units])

                    arg_placeholder_y = tf.placeholder(tf.float32,
                                                       shape=[None, units])

                    self.arguments[str(arg_type) + "x"] = arg_placeholder_x
                    self.arguments[str(arg_type) + "y"] = arg_placeholder_y

                else:  # if non spatial
                    arg_policy = layers.fully_connected(
                        self.state_representation,
                        num_outputs=arg_type.sizes[0],
                        activation_fn=tf.nn.softmax)

                    self.argument_policy[str(arg_type)] = arg_policy

                    arg_placeholder = tf.placeholder(
                        tf.float32, shape=[None, arg_type.sizes[0]])

                    self.arguments[str(arg_type)] = arg_placeholder
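A minimal sketch (an assumption, not part of the example above) of how the argument heads stored in self.argument_policy could be sampled into a pysc2 FunctionCall at act time, given an existing tf.Session `sess` and a `feed_dict` holding the network inputs; the helper name sample_arguments is hypothetical:

import numpy as np
from pysc2.lib import actions

def sample_arguments(self, sess, feed_dict, function_id):
    # Sample every argument required by the chosen function id.
    args = []
    for arg_type in actions.FUNCTIONS[function_id].args:
        if len(arg_type.sizes) > 1:  # spatial argument: sample x and y independently
            x_probs = sess.run(self.argument_policy[str(arg_type) + "x"], feed_dict)[0]
            y_probs = sess.run(self.argument_policy[str(arg_type) + "y"], feed_dict)[0]
            x_probs /= x_probs.sum()  # guard against float32 rounding
            y_probs /= y_probs.sum()
            x = np.random.choice(len(x_probs), p=x_probs)
            y = np.random.choice(len(y_probs), p=y_probs)
            args.append([x, y])
        else:                        # scalar argument: sample a single index
            probs = sess.run(self.argument_policy[str(arg_type)], feed_dict)[0]
            probs /= probs.sum()
            args.append([np.random.choice(len(probs), p=probs)])
    return actions.FunctionCall(function_id, args)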
Beispiel #17
0
    def buildNetwork(self):

        self.minimap = tf.placeholder(
            tf.float32,
            [None,
             minimap_channel(), self.minimap_size, self.minimap_size],
            name="minimap")
        self.screen = tf.placeholder(
            tf.float32,
            [None, screen_channel(), self.screen_size, self.screen_size],
            name="screen")
        self.info = tf.placeholder(tf.float32, [None, self.action_size],
                                   name="info")
        self.spatial_action, self.non_spatial_action, self.value = network(
            self.minimap, self.screen, self.info, self.minimap_size,
            self.screen_size, self.action_size)
        # Set targets and masks

        self.valid_spatial_action = tf.placeholder(tf.float32, [None],
                                                   name='valid_spatial_action')
        self.spatial_action_selected = tf.placeholder(
            tf.float32, [None, self.screen_size**2],
            name='spatial_action_selected')
        self.valid_non_spatial_action = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)],
            name='valid_non_spatial_action')
        self.non_spatial_action_selected = tf.placeholder(
            tf.float32, [None, len(actions.FUNCTIONS)],
            name='non_spatial_action_selected')
        self.value_target = tf.placeholder(tf.float32, [None],
                                           name='value_target')
        # Compute log probability
        spatial_action_prob = tf.reduce_sum(self.spatial_action *
                                            self.spatial_action_selected,
                                            axis=1)
        spatial_action_log_prob = tf.log(
            tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
        non_spatial_action_prob = tf.reduce_sum(
            self.non_spatial_action * self.non_spatial_action_selected, axis=1)
        valid_non_spatial_action_prob = tf.reduce_sum(
            self.non_spatial_action * self.valid_non_spatial_action, axis=1)
        valid_non_spatial_action_prob = tf.clip_by_value(
            valid_non_spatial_action_prob, 1e-10, 1.)
        non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
        non_spatial_action_log_prob = tf.log(
            tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
        tf.summary.histogram('spatial_action_prob', spatial_action_prob)
        tf.summary.histogram('non_spatial_action_prob',
                             non_spatial_action_prob)
        # Compute losses, more details in https://arxiv.org/abs/1602.01783
        # Policy loss and value loss
        action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
        advantage = tf.stop_gradient(self.value_target - self.value)
        policy_loss = -tf.reduce_mean(action_log_prob * advantage)
        value_loss = -tf.reduce_mean(self.value * advantage)
        tf.summary.scalar('policy_loss', policy_loss)
        tf.summary.scalar('value_loss', value_loss)
        # TODO: policy penalty
        # Negative-entropy regulariser on the non-spatial policy (clipped to avoid log(0))
        self.reg = tf.squeeze(self.non_spatial_action)
        self.reg = tf.reduce_sum(self.reg * tf.log(tf.clip_by_value(self.reg, 1e-10, 1.)))
        loss = policy_loss + value_loss + self.reg
        # Build the optimizer
        self.learning_rate = tf.placeholder(tf.float32,
                                            None,
                                            name='learning_rate')
        rmsprop = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate,
                                            decay=0.99,
                                            epsilon=1e-10)
        self.train_op = rmsprop.minimize(loss)
        self.merged = tf.summary.merge_all()
        self.saver = tf.train.Saver(max_to_keep=100)
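A minimal usage sketch for the graph built in buildNetwork (the names `agent`, `sess` and the numpy batches are assumptions; only the attribute names come from the example above):

feed = {
    agent.minimap: minimap_batch,                        # (N, minimap_channel, minimap_size, minimap_size)
    agent.screen: screen_batch,                          # (N, screen_channel, screen_size, screen_size)
    agent.info: info_batch,                              # (N, action_size)
    agent.valid_spatial_action: valid_spatial,           # (N,), 1.0 where a spatial argument was used
    agent.spatial_action_selected: spatial_selected,     # (N, screen_size**2), one-hot target pixel
    agent.valid_non_spatial_action: valid_mask,          # (N, len(actions.FUNCTIONS)), 0/1 mask
    agent.non_spatial_action_selected: action_selected,  # (N, len(actions.FUNCTIONS)), one-hot
    agent.value_target: returns,                         # (N,), discounted returns
    agent.learning_rate: 5e-4,
}
_, summary = sess.run([agent.train_op, agent.merged], feed_dict=feed)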
Beispiel #18
0
    def build_model(self, dev):
        with tf.variable_scope('a3c'), tf.device(dev):
            self.valid_spatial_action = tf.placeholder(
                tf.float32, [None], name='valid_spatial_action')
            self.spatial_action_selected = tf.placeholder(
                tf.float32, [None, self.ssize**2], name='spatial_action_selected')
            self.valid_action = tf.placeholder(
                tf.float32, [None, len(U.useful_actions)], name='valid_action')
            self.action_selected = tf.placeholder(
                tf.float32, [None, len(U.useful_actions)], name='action_selected')
            self.value_target = tf.placeholder(
                tf.float32, [None], name='value_target')
            self.entropy_rate = tf.placeholder(
                tf.float32, None, name='entropy_rate')
            self.advantage = tf.placeholder(tf.float32, [None], name='advantage')
            self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
            self.screen = tf.placeholder(
                tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
            self.info = tf.placeholder(
                tf.float32, [None, self.isize], name='info')
            self.custom_inputs = tf.placeholder(
                tf.float32, [None, self.custom_input_size], name='custom_input')
            self.hStateInput = tf.placeholder(
                tf.float32, [None, self.NUM_LSTM], name='h_state_input')
            self.cStateInput = tf.placeholder(
                tf.float32, [None, self.NUM_LSTM], name='c_state_input')
            self.roach_location = tf.placeholder(
                tf.float32, [None, self.ssize ** 2], name='roach_location'
            )

            (self.value, self.policy, self.spatial_policy, _, _,
             self.roachPrediction) = self.model([
                 self.screen, self.info, self.custom_inputs, self.hStateInput,
                 self.cStateInput
             ])
            # This gets the probability of the chosen action, given that we force
            # the agent to choose from the set of valid actions: the policy's
            # probability for the selected action divided by the total probability
            # mass placed on valid actions.
            valid_action_prob = tf.reduce_sum(
                self.valid_action * self.policy+1e-10, axis=1)
            action_prob = tf.reduce_sum(
                self.action_selected * self.policy+1e-10, axis=1) / valid_action_prob
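            # Worked example with illustrative numbers (not from this code):
            #   policy            = [0.5, 0.3, 0.2], valid_action = [1, 0, 1]
            #   valid_action_prob = 0.5 + 0.2 = 0.7
            #   action_selected   = [1, 0, 0]  ->  action_prob = 0.5 / 0.7 ≈ 0.714
            # i.e. the policy renormalised over the valid action subset.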

            # Here we compute the probability of the spatial action. If the selected
            # action was non-spatial, the probability is one.
            # TODO: Make this use vectorized things (using a constant "valid_spatial_action" seems fishy to me, but maybe it's fine)
            spatial_action_prob = (self.valid_spatial_action * tf.reduce_sum(
                self.spatial_policy * self.spatial_action_selected, axis=1)) + (1.0 - self.valid_spatial_action)+1e-10

            # The probability of the full action is the product of the non-spatial and the spatial probabilities
            combined_action_probability = action_prob * spatial_action_prob

            # The advantage (fed in via the `advantage` placeholder above) represents
            # how much better this action was than what was expected from this state

            policy_loss = self.getPolicyLoss(combined_action_probability, self.advantage)
            value_loss = self.getValueLoss(self.value_target - self.value)
            entropy = self.getEntropy(
                self.policy, self.spatial_policy, self.valid_spatial_action)
            roachLoss = self.getMinRoachLoss(
                self.roach_location, self.roachPrediction
            )


            loss = tf.reduce_mean(policy_loss + value_loss * .5 + entropy * .01 + .5 * roachLoss)
            # Build the optimizer
            global_step = tf.Variable(0, trainable=False)
            learning_rate_decayed = tf.train.exponential_decay(
                self.learning_rate, global_step, 10000, .95)

            opt = tf.train.AdamOptimizer(learning_rate_decayed)
            grads, variables = zip(*opt.compute_gradients(loss))
            grads, glob_norm = tf.clip_by_global_norm(grads, 40.0)
            self.train_op = opt.apply_gradients(
                zip(grads, variables), global_step=global_step)
            if self.flags.use_tensorboard:
                summary = []
                summary.append(tf.summary.scalar(
                    'policy_loss', tf.reduce_mean(policy_loss)))
                summary.append(tf.summary.scalar(
                    'glob_norm', glob_norm))
                summary.append(tf.summary.scalar(
                    'value_loss', tf.reduce_mean(value_loss)))
                summary.append(tf.summary.scalar(
                    'entropy_loss', tf.reduce_mean(entropy)))
                summary.append(tf.summary.scalar(
                    'advantage', tf.reduce_mean(self.advantage)))
                summary.append(tf.summary.scalar(
                    'loss', tf.reduce_mean(loss)))
                summary.append(tf.summary.scalar(
                    'roachLoss', tf.reduce_mean(roachLoss)))
                self.summary_op = tf.summary.merge(summary)
            else:
                self.summary_op = []
            self.saver = tf.train.Saver(max_to_keep=100)

            # debug graph:
            self.summary_writer.add_graph(self.session.graph)
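
The loss helpers referenced above (getPolicyLoss, getValueLoss, getEntropy, getMinRoachLoss) are not included in this example. A minimal sketch of what they might compute, assuming the standard A3C formulation plus a cross-entropy auxiliary loss for the roach-location prediction; these bodies are guesses, written as standalone defs that mirror the method signatures:

import tensorflow as tf

def getPolicyLoss(self, action_probability, advantage):
    # -log pi(a|s) * A, per sample
    return -tf.log(tf.clip_by_value(action_probability, 1e-10, 1.)) * advantage

def getValueLoss(self, difference):
    # squared TD error, per sample
    return tf.square(difference)

def getEntropy(self, policy, spatial_policy, valid_spatial_action):
    # Negative entropy of the non-spatial head, plus that of the spatial head
    # wherever a spatial argument was actually used
    non_spatial = tf.reduce_sum(policy * tf.log(policy + 1e-10), axis=1)
    spatial = valid_spatial_action * tf.reduce_sum(
        spatial_policy * tf.log(spatial_policy + 1e-10), axis=1)
    return non_spatial + spatial

def getMinRoachLoss(self, roach_location, roach_prediction):
    # Cross-entropy between the labelled roach location and the predicted map
    return -tf.reduce_sum(
        roach_location * tf.log(roach_prediction + 1e-10), axis=1)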