Ejemplo n.º 1
0
def predict(task: TaggingTask,
            params: cfg.DataConfig,
            model: tf.keras.Model) -> List[Tuple[int, int, List[int]]]:
  """Runs tagging inference over a dataset and gathers unmasked predictions.

  Args:
    task: The `TaggingTask` providing `inference_step` and `build_inputs`.
    params: The `cfg.DataConfig` forwarded to `task.build_inputs`.
    model: The keras model forwarded to `task.inference_step`.

  Returns:
    A list of `(sentence_id, sub_sentence_id, predicted_ids)` tuples, sorted by
      `(sentence_id, sub_sentence_id)`. `predicted_ids` only contains the
      positions whose label is greater than or equal to zero.
  """

  def predict_step(inputs):
    """Replicated prediction calculation for a single batch."""
    features, labels = inputs
    # The id features are only bookkeeping, so they are removed from the
    # feature dict before it is handed to the task.
    sentence_ids = features.pop('sentence_id')
    sub_sentence_ids = features.pop('sub_sentence_id')
    predict_ids = task.inference_step(features, model)['predict_ids']
    return {
        'predict_ids': predict_ids,
        # Positions with a negative label are masked out later on.
        'label_mask': tf.greater_equal(labels, 0),
        'sentence_ids': sentence_ids,
        'sub_sentence_ids': sub_sentence_ids,
    }

  def aggregate_fn(state, outputs):
    """Appends the per-example results of one step to the running list."""
    results = [] if state is None else state

    batches = zip(outputs['predict_ids'], outputs['label_mask'],
                  outputs['sentence_ids'], outputs['sub_sentence_ids'])
    for ids_batch, mask_batch, sent_batch, sub_sent_batch in batches:
      examples = zip(ids_batch.numpy(), mask_batch.numpy(),
                     sent_batch.numpy(), sub_sent_batch.numpy())
      for token_ids, token_mask, sentence_id, sub_sentence_id in examples:
        assert len(token_ids) == len(token_mask)
        # Skip the padding label.
        kept_ids = [
            token_id for token_id, keep in zip(token_ids, token_mask) if keep
        ]
        results.append((sentence_id, sub_sentence_id, kept_ids))

    return results

  dataset = orbit.utils.make_distributed_dataset(
      tf.distribute.get_strategy(), task.build_inputs, params)
  predictions = utils.predict(predict_step, aggregate_fn, dataset)
  return sorted(predictions, key=lambda item: (item[0], item[1]))
Ejemplo n.º 2
0
def predict(task: TaggingTask, params: cfg.DataConfig,
            model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
  """Runs tagging inference over a dataset and gathers unmasked predictions.

  Args:
    task: The `TaggingTask` providing `inference_step` and `build_inputs`.
    params: The `cfg.DataConfig` forwarded to `task.build_inputs`.
    model: The keras model forwarded to `task.inference_step`.

  Returns:
    A `(predict_ids, sentence_ids)` tuple of two parallel lists with one entry
      per example. Each element of `predict_ids` is the sequence of predicted
      ids at positions whose label is greater than or equal to zero, and the
      matching element of `sentence_ids` is that example's sentence id.
  """

  def predict_step(inputs):
    """Replicated prediction calculation for a single batch."""
    features, labels = inputs
    # The sentence id is only bookkeeping, so it is removed from the feature
    # dict before it is handed to the task.
    sentence_ids = features.pop('sentence_id')
    predict_ids = task.inference_step(features, model)['predict_ids']
    return {
        'predict_ids': predict_ids,
        # Positions with a negative label are masked out later on.
        'label_mask': tf.greater_equal(labels, 0),
        'sentence_ids': sentence_ids,
    }

  def aggregate_fn(state, outputs):
    """Appends the per-example results of one step to the running state."""
    if state is None:
      state = {'predict_ids': [], 'sentence_ids': []}
    all_predict_ids = state['predict_ids']
    all_sentence_ids = state['sentence_ids']

    batches = zip(outputs['predict_ids'], outputs['label_mask'],
                  outputs['sentence_ids'])
    for ids_batch, mask_batch, sent_batch in batches:
      examples = zip(ids_batch.numpy(), mask_batch.numpy(),
                     sent_batch.numpy())
      for token_ids, token_mask, sentence_id in examples:
        all_sentence_ids.append(sentence_id)
        kept_ids = []
        all_predict_ids.append(kept_ids)
        assert len(token_ids) == len(token_mask)
        # Skip the padding label.
        kept_ids.extend(
            token_id for token_id, keep in zip(token_ids, token_mask) if keep)
    return state

  dataset = orbit.utils.make_distributed_dataset(
      tf.distribute.get_strategy(), task.build_inputs, params)
  aggregated = utils.predict(predict_step, aggregate_fn, dataset)
  return aggregated['predict_ids'], aggregated['sentence_ids']
Ejemplo n.º 3
0
def predict(task: SentencePredictionTask, params: cfg.DataConfig,
            model: tf.keras.Model) -> List[Union[int, float]]:
    """Runs sentence-level inference and returns predictions by example id.

    Args:
        task: The `SentencePredictionTask` providing `inference_step`,
            `build_inputs` and `task_config`.
        params: The `cfg.DataConfig` forwarded to `task.build_inputs`.
        model: The keras model forwarded to `task.inference_step`.

    Returns:
        A list with one prediction per example, ordered by ascending
        `example_id`. When `task.task_config.model.num_classes` is 1 each
        element is the output of `task.inference_step`; otherwise it is the
        argmax of that output over the last axis.
    """
    is_regression = task.task_config.model.num_classes == 1

    def predict_step(inputs):
        """Replicated prediction calculation for a single batch."""
        features, _ = inputs
        # The example id is only needed for re-ordering, so it is removed
        # from the feature dict before it is handed to the task.
        example_id = features.pop('example_id')
        predictions = task.inference_step(features, model)
        if not is_regression:
            predictions = tf.argmax(predictions, axis=-1)
        return {'example_id': example_id, 'predictions': predictions}

    def aggregate_fn(state, outputs):
        """Appends `(example_id, prediction)` pairs to the running list."""
        pairs = [] if state is None else state
        for ids, preds in zip(outputs['example_id'], outputs['predictions']):
            pairs.extend(zip(ids, preds))
        return pairs

    dataset = orbit.utils.make_distributed_dataset(
        tf.distribute.get_strategy(), task.build_inputs, params)
    id_prediction_pairs = utils.predict(predict_step, aggregate_fn, dataset)

    # When running on TPU POD, the order of output cannot be maintained,
    # so we need to sort by example_id.
    ordered_pairs = sorted(id_prediction_pairs, key=lambda pair: pair[0])
    return [prediction for _, prediction in ordered_pairs]
Ejemplo n.º 4
0
def predict(task: QuestionAnsweringTask, params: cfg.DataConfig,
            model: tf.keras.Model):
    """Runs question-answering inference and post-processes the outputs.

    Args:
        task: The `QuestionAnsweringTask` providing preprocessing,
            `validation_step`, `aggregate_logs` and `squad_lib`.
        params: The `cfg.DataConfig` forwarded to preprocessing and to
            `task.build_inputs`.
        model: The keras model forwarded to `task.validation_step`.

    Returns:
        A tuple of `all_predictions`, `all_nbest` and `scores_diff` as
        produced by `task.squad_lib.postprocess_output`.
    """
    preprocessed = task._preprocess_eval_data(params)  # pylint: disable=protected-access
    record_path, examples, features = preprocessed

    # `record_path` will overwrite `params.input_path`,
    # when `task.build_inputs()` is called.
    task.set_preprocessed_eval_input_path(record_path)

    def predict_step(inputs):
        """Replicated prediction calculation for a single batch."""
        return task.validation_step(inputs, model)

    dataset = orbit.utils.make_distributed_dataset(
        tf.distribute.get_strategy(), task.build_inputs, params)
    aggregated = utils.predict(predict_step, task.aggregate_logs, dataset)

    all_predictions, all_nbest, scores_diff = (
        task.squad_lib.postprocess_output(
            examples,
            features,
            aggregated,
            task.task_config.n_best_size,
            task.task_config.max_answer_length,
            task.task_config.validation_data.do_lower_case,
            version_2_with_negative=params.version_2_with_negative,
            null_score_diff_threshold=(
                task.task_config.null_score_diff_threshold),
            xlnet_format=task.task_config.validation_data.xlnet_format,
            verbose=False))
    return all_predictions, all_nbest, scores_diff
Ejemplo n.º 5
0
def predict(task: SentencePredictionTask, params: cfg.DataConfig,
            model: tf.keras.Model) -> List[Union[int, float]]:
  """Runs sentence-level inference and returns predictions in arrival order.

  Args:
    task: The `SentencePredictionTask` providing `inference_step`,
      `build_inputs` and `task_config`.
    params: The `cfg.DataConfig` forwarded to `task.build_inputs`.
    model: The keras model forwarded to `task.inference_step`.

  Returns:
    A list with one prediction per example, in the order the aggregation
      received them. When `task.task_config.model.num_classes` is 1 each
      element is the output of `task.inference_step`; otherwise it is the
      argmax of that output over the last axis.
  """
  is_regression = task.task_config.model.num_classes == 1

  def predict_step(inputs):
    """Replicated prediction calculation for a single batch."""
    features, _ = inputs
    predictions = task.inference_step(features, model)
    if not is_regression:
      predictions = tf.argmax(predictions, axis=-1)
    return predictions

  def aggregate_fn(state, outputs):
    """Extends the running prediction list with one step's outputs."""
    if state is None:
      state = {'predictions': []}

    collected = state['predictions']
    for batch_predictions in outputs:
      collected.extend(batch_predictions)
    return state

  dataset = orbit.utils.make_distributed_dataset(
      tf.distribute.get_strategy(), task.build_inputs, params)
  aggregated = utils.predict(predict_step, aggregate_fn, dataset)
  return aggregated['predictions']
Ejemplo n.º 6
0
        outputs['predict_ids'], outputs['label_mask'],
        outputs['sentence_ids']):
      for tmp_predict_ids, tmp_label_mask, tmp_sentence_id in zip(
          batch_predict_ids.numpy(), batch_label_mask.numpy(),
          batch_sentence_ids.numpy()):
        cur_sentence_ids.append(tmp_sentence_id)
        cur_predict_ids.append([])
        assert len(tmp_predict_ids) == len(tmp_label_mask)
        for i in range(len(tmp_predict_ids)):
          # Skip the padding label.
          if tmp_label_mask[i]:
            cur_predict_ids[-1].append(tmp_predict_ids[i])
<<<<<<< HEAD
    return cur_predict_ids, cur_sentence_ids

  loop_fn = orbit.utils.create_loop_fn(predict_step)
  dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                                 task.build_inputs, params)
  # Set `num_steps` to -1 to exhaust the dataset.
  predict_ids, sentence_ids = loop_fn(
      iter(dataset), num_steps=-1, state=([], []), reduce_fn=reduce_fn)
  return predict_ids, sentence_ids
=======
    return state

  dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
                                                 task.build_inputs, params)
  outputs = utils.predict(predict_step, aggregate_fn, dataset)
  return outputs['predict_ids'], outputs['sentence_ids']
>>>>>>> a811a3b7e640722318ad868c99feddf3f3063e36
Ejemplo n.º 7
0
def predict(task: SentencePredictionTask,
            params: cfg.DataConfig,
            model: tf.keras.Model,
            params_aug: Optional[cfg.DataConfig] = None,
            test_time_aug_wgt: float = 0.3) -> List[Union[int, float]]:
  """Runs sentence-level inference, optionally blended with augmented data.

  Args:
    task: The `SentencePredictionTask` providing `inference_step`,
      `build_inputs` and `task_config`.
    params: The `cfg.DataConfig` forwarded to `task.build_inputs`.
    model: The keras model forwarded to `task.inference_step`.
    params_aug: Optional `cfg.DataConfig` for augmented data. When given, a
      second prediction pass is run on it and blended with the first one.
    test_time_aug_wgt: Weight of the augmented pass. The blended score is
      `(1. - test_time_aug_wgt)` times the original output plus
      `test_time_aug_wgt` times the augmented output.

  Returns:
    A list with one prediction per example, ordered by ascending
      `example_id`. When `task.task_config.model.num_classes` is 1 each
      element is the (possibly blended) model output; otherwise it is the
      argmax of that output over the last axis.
  """

  def predict_step(inputs):
    """Replicated prediction calculation for a single batch."""
    features, _ = inputs
    # The example id is only needed for re-ordering, so it is removed from
    # the feature dict before it is handed to the task.
    example_id = features.pop('example_id')
    return {
        'example_id': example_id,
        'predictions': task.inference_step(features, model),
    }

  def aggregate_fn(state, outputs):
    """Appends `(example_id, prediction)` pairs to the running list."""
    pairs = [] if state is None else state
    for ids, preds in zip(outputs['example_id'], outputs['predictions']):
      pairs.extend(zip(ids, preds))
    return pairs

  def run_sorted(data_params):
    """Predicts on `data_params` and returns the pairs sorted by example id."""
    dataset = orbit.utils.make_distributed_dataset(
        tf.distribute.get_strategy(), task.build_inputs, data_params)
    pairs = utils.predict(predict_step, aggregate_fn, dataset)
    # When running on TPU POD, the order of output cannot be maintained,
    # so we need to sort by example_id.
    return sorted(pairs, key=lambda pair: pair[0])

  base_pairs = run_sorted(params)
  is_regression = task.task_config.model.num_classes == 1

  if params_aug is None:
    scores = [pair[1] for pair in base_pairs]
  else:
    aug_pairs = run_sorted(params_aug)
    # Pairs are matched purely by position after sorting both passes.
    scores = [
        (1. - test_time_aug_wgt) * base[1] + test_time_aug_wgt * aug[1]
        for base, aug in zip(base_pairs, aug_pairs)
    ]

  if is_regression:
    return scores
  return [tf.argmax(score, axis=-1) for score in scores]
Ejemplo n.º 8
0
def _predict(task, params, model):
    """Runs tagging inference and returns per-position softmax outputs.

    Similar to official.nlp.tasks.tagging.predict, but each example carries
    the softmax of the model's logits instead of the final label ids.

    Args:
        task: A `TaggingTask` object.
        params: A `cfg.DataConfig` object forwarded to `task.build_inputs`.
        model: A keras.Model forwarded to `task.inference_step`.

    Returns:
        A list of `(sentence_id, sub_sentence_id, probs)` tuples sorted by
            `(sentence_id, sub_sentence_id)`. `probs` holds the softmax
            output for every position whose label is greater than or equal
            to zero.
    """
    def _predict_step(inputs):
        """Replicated prediction calculation for a single batch."""
        features, labels = inputs
        # The id features are only bookkeeping, so they are removed from the
        # feature dict before it is handed to the task.
        sentence_ids = features.pop("sentence_id")
        sub_sentence_ids = features.pop("sub_sentence_id")
        logits = task.inference_step(features, model)["logits"]
        return {
            "logits": logits,
            # Positions with a negative label are masked out later on.
            "label_mask": tf.greater_equal(labels, 0),
            "sentence_ids": sentence_ids,
            "sub_sentence_ids": sub_sentence_ids,
        }

    def _aggregate_fn(state, outputs):
        """Appends the per-example results of one step to the running list."""
        results = [] if state is None else state

        batches = zip(outputs["logits"], outputs["label_mask"],
                      outputs["sentence_ids"], outputs["sub_sentence_ids"])
        for logits_batch, mask_batch, sent_batch, sub_sent_batch in batches:
            probs_batch = tf.keras.activations.softmax(logits_batch)
            examples = zip(probs_batch.numpy(), mask_batch.numpy(),
                           sent_batch.numpy(), sub_sent_batch.numpy())
            for probs, token_mask, sentence_id, sub_sentence_id in examples:
                assert len(probs) == len(token_mask)
                _assert_same_length([probs, token_mask], sentence_id)
                # Skip the padding label.
                kept_probs = [
                    prob for prob, keep in zip(probs, token_mask) if keep
                ]
                results.append((sentence_id, sub_sentence_id, kept_probs))

        return results

    dataset = orbit.utils.make_distributed_dataset(
        tf.distribute.get_strategy(), task.build_inputs, params)
    predictions = utils.predict(_predict_step, _aggregate_fn, dataset)
    return sorted(predictions, key=lambda item: (item[0], item[1]))