Example #1
  def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):
      if reuse:
        tf.get_variable_scope().reuse_variables()
        assert tf.get_variable_scope().reuse

      # Set inputs of networks
      self.minimap = tf.placeholder(tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
      self.screen = tf.placeholder(tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
      self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

      # Build networks
      net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)
      self.spatial_action, self.non_spatial_action, self.value = net

      # Set targets and masks
      self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
      self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
      self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
      self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
      self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

      # Compute log probability
      spatial_action_prob = tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1)
      spatial_action_log_prob = tf.log(tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
      non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected, axis=1)
      valid_non_spatial_action_prob = tf.reduce_sum(self.non_spatial_action * self.valid_non_spatial_action, axis=1)
      valid_non_spatial_action_prob = tf.clip_by_value(valid_non_spatial_action_prob, 1e-10, 1.)
      non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
      non_spatial_action_log_prob = tf.log(tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
      self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
      self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))

      # Compute losses, more details in https://arxiv.org/abs/1602.01783
      # Policy loss and value loss
      action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
      advantage = tf.stop_gradient(self.value_target - self.value)
      policy_loss = - tf.reduce_mean(action_log_prob * advantage)
      value_loss = - tf.reduce_mean(self.value * advantage)
      self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
      self.summary.append(tf.summary.scalar('value_loss', value_loss))

      # TODO: policy penalty
      loss = policy_loss + value_loss

      # Build the optimizer
      self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
      opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
      grads = opt.compute_gradients(loss)
      clipped_grad = []
      for grad, var in grads:
        self.summary.append(tf.summary.histogram(var.op.name, var))
        self.summary.append(tf.summary.histogram(var.op.name+'/grad', grad))
        grad = tf.clip_by_norm(grad, 10.0)
        clipped_grad.append([grad, var])
      self.train_op = opt.apply_gradients(clipped_grad)
      self.summary_op = tf.summary.merge(self.summary)

      self.saver = tf.train.Saver(max_to_keep=100)
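
The snippet above only builds the graph. Below is a minimal sketch, not part of the original source, of how one training update could feed these placeholders; agent and sess stand for an instance whose build_model() has been called and an active tf.Session, and all shapes and values are hypothetical stand-ins for real rollout data.

import numpy as np

# Hypothetical batch and observation sizes; the real values come from the agent's
# msize/ssize/isize attributes and the U.minimap_channel()/U.screen_channel() helpers.
N, msize, ssize, isize, num_fn, mc, sc = 8, 64, 64, 11, 524, 7, 17

feed = {
    agent.minimap: np.zeros((N, mc, msize, msize), np.float32),
    agent.screen: np.zeros((N, sc, ssize, ssize), np.float32),
    agent.info: np.zeros((N, isize), np.float32),
    # Masks and one-hot selections collected during the rollout
    # (zeros/ones here only to illustrate the expected shapes).
    agent.valid_spatial_action: np.ones((N,), np.float32),
    agent.spatial_action_selected: np.zeros((N, ssize**2), np.float32),
    agent.valid_non_spatial_action: np.ones((N, num_fn), np.float32),
    agent.non_spatial_action_selected: np.zeros((N, num_fn), np.float32),
    agent.value_target: np.zeros((N,), np.float32),  # discounted returns
    agent.learning_rate: 1e-4,
}
_, summary = sess.run([agent.train_op, agent.summary_op], feed_dict=feed)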
Example #2
  def build_model(self, reuse, dev, ntype):
    with tf.variable_scope(self.name), tf.device(dev):
      if reuse:
        tf.get_variable_scope().reuse_variables()
        assert tf.get_variable_scope().reuse

      # Set inputs of networks
      self.minimap = tf.placeholder(tf.float32, [None, U.minimap_channel(), self.msize, self.msize], name='minimap')
      self.screen = tf.placeholder(tf.float32, [None, U.screen_channel(), self.ssize, self.ssize], name='screen')
      self.info = tf.placeholder(tf.float32, [None, self.isize], name='info')

      # Build networks
      net = build_net(self.minimap, self.screen, self.info, self.msize, self.ssize, len(actions.FUNCTIONS), ntype)
      self.spatial_action, self.non_spatial_action, self.value = net

      # Set targets and masks
      self.valid_spatial_action = tf.placeholder(tf.float32, [None], name='valid_spatial_action')
      self.spatial_action_selected = tf.placeholder(tf.float32, [None, self.ssize**2], name='spatial_action_selected')
      self.valid_non_spatial_action = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='valid_non_spatial_action')
      self.non_spatial_action_selected = tf.placeholder(tf.float32, [None, len(actions.FUNCTIONS)], name='non_spatial_action_selected')
      self.value_target = tf.placeholder(tf.float32, [None], name='value_target')

      # Compute log probability
      spatial_action_prob = tf.clip_by_value(tf.reduce_sum(self.spatial_action * self.spatial_action_selected, axis=1), 1e-10, 1.)
      non_spatial_action_prob = tf.clip_by_value(tf.reduce_sum(self.non_spatial_action * self.non_spatial_action_selected * self.valid_non_spatial_action, axis=1), 1e-10, 1.)

      q_value = spatial_action_prob * self.valid_spatial_action * self.ispatial + non_spatial_action_prob
      self.delta = self.value_target - q_value
      #self.clipped_error = tf.where(tf.abs(self.delta) < 1.0, 0.5 * tf.square(self.delta), tf.abs(self.delta) - 0.5, name='clipped_error')
      #value_loss = tf.reduce_mean(self.clipped_error, name='value_loss')
      
      value_loss = tf.reduce_mean(tf.square(self.delta))

      self.summary.append(tf.summary.histogram('spatial_action_prob', spatial_action_prob))
      self.summary.append(tf.summary.histogram('non_spatial_action_prob', non_spatial_action_prob))
      self.summary.append(tf.summary.scalar('value_loss', value_loss))
      
      # Build the optimizer
      self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate')
      opt = tf.train.RMSPropOptimizer(self.learning_rate, decay=0.99, epsilon=1e-10)
      grads = opt.compute_gradients(value_loss)
      clipped_grad = []
      for grad, var in grads:
        self.summary.append(tf.summary.histogram(var.op.name, var))
        grad = grad if grad is not None else tf.zeros_like(var)
        self.summary.append(tf.summary.histogram(var.op.name+'/grad', grad))
        grad = tf.clip_by_norm(grad, 10.0)
        clipped_grad.append([grad, var])
      self.train_op = opt.apply_gradients(clipped_grad)
      self.summary_op = tf.summary.merge(self.summary)

      self.saver = tf.train.Saver(max_to_keep=100)
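
This variant replaces the actor-critic loss with a squared TD error against value_target (the commented-out lines are a Huber-style clipped alternative kept for reference). A small sketch, not from the original source, of how a bootstrapped discounted-return target for value_target is commonly computed from a rollout:

import numpy as np

def discounted_returns(rewards, bootstrap_value, gamma=0.99):
    # R_t = r_t + gamma * R_{t+1}, seeded with a bootstrap value for the final state.
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# e.g. feed_dict[agent.value_target] = discounted_returns(rewards, last_state_value)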
Example #3
    def build_model(self, reuse, dev, ntype):
        with tf.variable_scope(self.name), tf.device(dev):
            if reuse:
                tf.get_variable_scope().reuse_variables()
                assert tf.get_variable_scope().reuse

            # Set inputs of networks
            self.minimap = tf.placeholder(
                tf.float32,
                [None, U.minimap_channel(), self.msize, self.msize],
                name='minimap')
            self.screen = tf.placeholder(
                tf.float32,
                [None, U.screen_channel(), self.ssize, self.ssize],
                name='screen')
            self.info = tf.placeholder(tf.float32, [None, self.isize],
                                       name='info')

            # Build a3c base networks
            net = build_net(self.minimap,
                            self.screen,
                            self.info,
                            self.msize,
                            self.ssize,
                            len(actions.FUNCTIONS),
                            ntype,
                            reuse=False)
            self.spatial_action, self.non_spatial_action, self.value = net

            # Set targets and masks
            self.valid_spatial_action = tf.placeholder(
                tf.float32, [None], name='valid_spatial_action')
            self.spatial_action_selected = tf.placeholder(
                tf.float32, [None, self.ssize**2],
                name='spatial_action_selected')
            self.valid_non_spatial_action = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='valid_non_spatial_action')
            self.non_spatial_action_selected = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='non_spatial_action_selected')
            self.value_target = tf.placeholder(tf.float32, [None],
                                               name='value_target')

            # Compute log probability
            spatial_action_prob = tf.reduce_sum(self.spatial_action *
                                                self.spatial_action_selected,
                                                axis=1)
            spatial_action_log_prob = tf.log(
                tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
            non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.non_spatial_action_selected,
                axis=1)
            valid_non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.valid_non_spatial_action,
                axis=1)
            valid_non_spatial_action_prob = tf.clip_by_value(
                valid_non_spatial_action_prob, 1e-10, 1.)
            non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
            non_spatial_action_log_prob = tf.log(
                tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
            self.summary.append(
                tf.summary.histogram('spatial_action_prob',
                                     spatial_action_prob))
            self.summary.append(
                tf.summary.histogram('non_spatial_action_prob',
                                     non_spatial_action_prob))

            # Compute a3c losses, more details in https://arxiv.org/abs/1602.01783
            # Policy loss and value loss
            action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
            advantage = tf.stop_gradient(self.value_target - self.value)
            policy_loss = -tf.reduce_mean(action_log_prob * advantage)
            value_loss = -tf.reduce_mean(self.value * advantage)
            self.summary.append(tf.summary.scalar('policy_loss', policy_loss))
            self.summary.append(tf.summary.scalar('value_loss', value_loss))

            # TODO: policy penalty
            a3c_loss = policy_loss + value_loss

            #pc_part_start
            self.pc_minimap = tf.placeholder(
                tf.float32,
                [None, U.minimap_channel(), self.msize, self.msize],
                name='pc_minimap')
            self.pc_screen = tf.placeholder(
                tf.float32,
                [None, U.screen_channel(), self.ssize, self.ssize],
                name='pc_screen')
            self.pc_info = tf.placeholder(tf.float32, [None, self.isize],
                                          name='pc_info')
            self.pc_valid_non_spatial_action = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='pc_valid_non_spatial_action')

            pc_net = build_pc_net(self.pc_minimap, self.pc_screen,
                                  self.pc_info, self.msize, self.ssize,
                                  len(actions.FUNCTIONS),
                                  self.pc_valid_non_spatial_action)
            pc_q, pc_q_max = pc_net
            self.pc_a = tf.placeholder("float", [None, len(actions.FUNCTIONS)])
            pc_a_reshaped = tf.reshape(
                self.pc_a, [-1, 1, 1, len(actions.FUNCTIONS)])

            # Extract Q for the taken action
            pc_qa_ = tf.multiply(pc_q, pc_a_reshaped)
            pc_qa = tf.reduce_sum(pc_qa_, reduction_indices=3, keep_dims=False)
            # (-1, 20, 20)

            # TD target for Q
            self.pc_r = tf.placeholder("float", [None, 20, 20])
            pc_loss = self._pixel_change_lambda * tf.nn.l2_loss(self.pc_r -
                                                                pc_qa)

            # Build the optimizer
            loss = pc_loss + a3c_loss
            self.learning_rate = tf.placeholder(tf.float32,
                                                None,
                                                name='learning_rate')
            opt = tf.train.RMSPropOptimizer(self.learning_rate,
                                            decay=0.99,
                                            epsilon=1e-10)
            grads = opt.compute_gradients(loss)
            clipped_grad = []
            for grad, var in grads:
                self.summary.append(tf.summary.histogram(var.op.name, var))
                self.summary.append(
                    tf.summary.histogram(var.op.name + '/grad', grad))
                grad = tf.clip_by_norm(grad, 10.0)
                clipped_grad.append([grad, var])
            self.train_op = opt.apply_gradients(clipped_grad)
            self.summary_op = tf.summary.merge(self.summary)

            self.saver = tf.train.Saver(max_to_keep=100)
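
The pc_r placeholder holds a 20x20 TD target for the pixel-control head. Below is a sketch, under the assumption that this auxiliary task follows the usual UNREAL recipe, of the per-step pixel-change reward map such a target is built from (the full TD target would additionally discount and bootstrap with pc_q_max); the cell size and channel averaging are assumptions, not taken from the original source.

import numpy as np

def pixel_change(obs, next_obs, cells=20):
    # obs, next_obs: float arrays of shape (C, H, W) with H and W divisible by cells.
    diff = np.abs(next_obs - obs).mean(axis=0)  # (H, W), averaged over channels
    h, w = diff.shape
    # Average the absolute change over non-overlapping cells -> (cells, cells)
    return diff.reshape(cells, h // cells, cells, w // cells).mean(axis=(1, 3))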
Example #4
    def build_model(self, reuse, dev, ntype):
        with tf.variable_scope(self.name), tf.device(dev):
            if reuse:
                tf.get_variable_scope().reuse_variables()
                assert tf.get_variable_scope().reuse

            # Set inputs of networks
            self.minimap = tf.placeholder(
                tf.float32,
                [None, U.minimap_channel(), self.msize, self.msize],
                name='minimap')
            self.screen = tf.placeholder(
                tf.float32,
                [None, U.screen_channel(), self.ssize, self.ssize],
                name='screen')
            self.info = tf.placeholder(tf.float32, [None, self.isize],
                                       name='info')

            # create master and subpolicies
            self.subpolicy_Q = build_net(self.minimap, self.screen, self.info,
                                         self.msize, self.ssize, num_units + 2,
                                         'master_policy')

            # Set targets and masks for master policy update
            self.learning_rate = tf.placeholder(tf.float32,
                                                None,
                                                name='learning_rate')

            self.action_input = tf.placeholder("float", [None, num_units + 2])
            self.y_input = tf.placeholder("float", [None])
            self.Q_action = tf.reduce_sum(tf.multiply(self.subpolicy_Q,
                                                      self.action_input),
                                          reduction_indices=1)
            self.cost = tf.reduce_mean(tf.square(self.y_input - self.Q_action))
            self.master_train_op = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.cost)

            # Set targets and masks for subpolicies update
            self.valid_spatial_action = tf.placeholder(
                tf.float32, [None], name='valid_spatial_action_')
            self.spatial_action_selected = tf.placeholder(
                tf.float32, [None, self.ssize**2],
                name='spatial_action_selected')
            self.valid_non_spatial_action = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='valid_non_spatial_action_')
            self.non_spatial_action_selected = tf.placeholder(
                tf.float32, [None, len(actions.FUNCTIONS)],
                name='non_spatial_action_selected_')
            self.value_target = tf.placeholder(tf.float32, [None],
                                               name='value_target_')

            # Build the optimizer
            opt = tf.train.AdamOptimizer(self.learning_rate)

            self.subpolicy = build_net(self.minimap, self.screen, self.info,
                                       self.msize, self.ssize,
                                       len(actions.FUNCTIONS), 'fcn')
            self.spatial_action, self.non_spatial_action, self.value = self.subpolicy

            # Compute log probability
            spatial_action_prob = tf.reduce_sum(self.spatial_action *
                                                self.spatial_action_selected,
                                                axis=1)
            spatial_action_log_prob = tf.log(
                tf.clip_by_value(spatial_action_prob, 1e-10, 1.))
            non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.non_spatial_action_selected,
                axis=1)
            valid_non_spatial_action_prob = tf.reduce_sum(
                self.non_spatial_action * self.valid_non_spatial_action,
                axis=1)
            valid_non_spatial_action_prob = tf.clip_by_value(
                valid_non_spatial_action_prob, 1e-10, 1.)
            non_spatial_action_prob = non_spatial_action_prob / valid_non_spatial_action_prob
            non_spatial_action_log_prob = tf.log(
                tf.clip_by_value(non_spatial_action_prob, 1e-10, 1.))
            self.summary.append(
                tf.summary.histogram('spatial_action_prob_',
                                     spatial_action_prob))
            self.summary.append(
                tf.summary.histogram('non_spatial_action_prob_',
                                     non_spatial_action_prob))

            # Compute losses, more details in https://arxiv.org/abs/1602.01783
            # Policy loss and value loss
            action_log_prob = self.valid_spatial_action * spatial_action_log_prob + non_spatial_action_log_prob
            advantage = tf.stop_gradient(self.value_target - self.value)
            policy_loss = -tf.reduce_mean(action_log_prob * advantage)
            value_loss = -tf.reduce_mean(self.value * advantage)

            self.summary.append(tf.summary.scalar('policy_loss_', policy_loss))
            self.summary.append(tf.summary.scalar('value_loss_', value_loss))

            # TODO: policy penalty
            loss = policy_loss + value_loss

            grads = opt.compute_gradients(loss)
            clipped_grad = []
            for grad, var in grads:
                # Skip variables that receive no gradient from this loss
                # (e.g. the master policy head, which is trained separately above)
                if grad is None:
                    continue
                self.summary.append(tf.summary.histogram(var.op.name, var))
                self.summary.append(
                    tf.summary.histogram(var.op.name + '/grad', grad))
                grad = tf.clip_by_norm(grad, 10.0)
                clipped_grad.append([grad, var])
            self.train_op = opt.apply_gradients(clipped_grad)
            self.summary_op = tf.summary.merge(self.summary)

            self.saver = tf.train.Saver(max_to_keep=100)
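
The master-policy head above is trained with a DQN-style regression: cost is the mean squared error between y_input and the Q-value of the executed option. A sketch, not from the original source, of the corresponding update step; agent, sess, gamma, and the batch arrays (minimaps, screens, infos, their next-state counterparts, rewards, dones, and the one-hot chosen_options) are hypothetical stand-ins for the real training loop.

gamma = 0.99  # hypothetical discount factor

# One-step TD target from the next state's maximum option value.
q_next = sess.run(agent.subpolicy_Q, feed_dict={
    agent.minimap: next_minimaps, agent.screen: next_screens, agent.info: next_infos})
y = rewards + gamma * (1.0 - dones) * q_next.max(axis=1)

sess.run(agent.master_train_op, feed_dict={
    agent.minimap: minimaps, agent.screen: screens, agent.info: infos,
    agent.action_input: chosen_options,  # one-hot over the num_units + 2 options
    agent.y_input: y,
    agent.learning_rate: 1e-4,
})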