Example #1
 def test_ids2str(self):
     encoder = public_parsing_ops.create_text_encoder(
         "sentencepiece", _SPM_VOCAB)
     text = "the quick brown fox jumps over the lazy dog"
     ids = np.array([
         367, 1910, 3619, 1660, 8068, 664, 604, 1154, 684, 367, 648, 8090,
         8047, 3576, 1, 0, 0, 0
     ])
     decode_text = text_eval.ids2str(encoder, ids, None)
     self.assertEqual(text, decode_text)
     decode_text = text_eval.ids2str(encoder, ids, 100)
     self.assertEqual(text, decode_text)
     ids = np.array([
         367, 1910, 3619, 4, 1660, 8068, 664, 604, 1154, 684, 96, 367, 648,
         8090, 8047, 3576, 25, 1, 0, 0, 0
     ])
     decode_text = text_eval.ids2str(encoder, ids, 100)
     self.assertEqual(
         "the quick brown <4> fox jumps over <96> the lazy dog <25> ",
         decode_text)
Example #2
    def run(self):
        checkpoint_path = tf.train.latest_checkpoint(self.model_dir)
        params = registry.get_params(self.params_transformer)(
            self.param_overrides)
        parser, shapes = params.parser(mode=tf.estimator.ModeKeys.PREDICT)
        estimator = estimator_utils.create_estimator(self.master,
                                                     self.model_dir,
                                                     self.use_tpu,
                                                     self.iterations_per_loop,
                                                     self.num_shards, params)
        encoder = public_parsing_ops.create_text_encoder(
            self.encoder_type, self.vocab_filename)

        def input_function(params):
            input_text1 = "hello this is a first text"
            target1 = "first text"
            input_text2 = "Eighteen sailors were injured after an explosion and fire on board a ship at the US Naval Base in San Diego, US Navy officials said.The sailors on the USS Bonhomme Richard had 'minor injuries' from the fire and were taken to a hospital, Lt. Cmdr. Patricia Kreuzberger told CNN."
            target2 = "18 sailors injured after an explosion and fire on a naval ship in San Diego"
            read_dictionary_data = np.load(self.test_dict_dataset_path,
                                           allow_pickle=True).item()
            # dataset = tf.data.Dataset.from_tensor_slices({"inputs":[input_text1, input_text2],"targets":[target1, target2]}).map(parser)

            dataset = tf.data.Dataset.from_tensor_slices(
                read_dictionary_data).map(parser)
            dataset = dataset.unbatch()
            dataset = dataset.padded_batch(params["batch_size"],
                                           padded_shapes=shapes,
                                           drop_remainder=True)
            return dataset

        predictions = estimator.predict(input_fn=input_function,
                                        checkpoint_path=checkpoint_path)
        for i in predictions:
            print(
                "======================================================================================================================================="
            )
            print("inputs: " + text_eval.ids2str(encoder, i['inputs'], None))
            print("targets: " + text_eval.ids2str(encoder, i['targets'], None))
            print("outputs: " + text_eval.ids2str(encoder, i['outputs'], None))
Example #3
def transformer_params(patterns, param_overrides):
    """Params for TransformerEncoderDecoderMLModel.

  Args:
    patterns: a dict include train_pattern, dev_pattern, test_pattern
    param_overrides: a string, comma separated list of name=value

  Returns:
    A instance of HParams
  """

    hparams = contrib_training.HParams(
        train_pattern=patterns["train_pattern"],
        dev_pattern=patterns["dev_pattern"],
        test_pattern=patterns["test_pattern"],
        vocab_filename="pegasus/ops/testdata/sp_test.model",
        encoder_type="sentencepiece_newline",
        length_bucket_size=0,
        add_task_id=False,
        batch_size=patterns["batch_size"],
        max_input_len=patterns["max_input_len"],
        max_target_len=patterns["max_output_len"],
        max_decode_len=patterns["max_output_len"],
        hidden_size=1024,
        filter_size=4096,
        num_heads=16,
        num_encoder_layers=16,
        num_decoder_layers=16,
        beam_size=1,
        beam_start=5,
        beam_alpha=0.8,
        beam_min=0,
        beam_max=-1,
        temperature=0.0,
        top_k=0,
        top_p=0.0,
        optimizer_name="adafactor",
        train_steps=patterns["train_steps"],
        learning_rate=patterns["learning_rate"],
        label_smoothing=0.1,
        dropout=0.1,
        eval_max_predictions=patterns.get("eval_steps", 1000),
        use_bfloat16=False,
        model=None,
        parser=None,
        encoder=None,
        estimator_prediction_fn=None,
        eval=None,
        estimator_eval_metrics_fn=estimator_metrics.gen_eval_metrics_fn,
    )

    if param_overrides:
        hparams.parse(param_overrides)

    hparams.parser = functools.partial(
        parsers.supervised_strings_parser,
        hparams.vocab_filename,
        hparams.encoder_type,
        hparams.max_input_len,
        hparams.max_target_len,
        length_bucket_size=hparams.length_bucket_size,
        length_bucket_start_id=pegasus_params.LENGTH_BUCKET_START_ID,
        length_bucket_max_id=pegasus_params.TASK_START_ID - 1,
        add_task_id=hparams.add_task_id,
        task_start_id=pegasus_params.TASK_START_ID)

    hparams.encoder = public_parsing_ops.create_text_encoder(
        hparams.encoder_type, hparams.vocab_filename)

    hparams.model = functools.partial(
        transformer.TransformerEncoderDecoderModel, hparams.encoder.vocab_size,
        hparams.hidden_size, hparams.filter_size, hparams.num_heads,
        hparams.num_encoder_layers, hparams.num_decoder_layers,
        hparams.label_smoothing, hparams.dropout)

    beam_keys = ("beam_start", "beam_alpha", "beam_min", "beam_max",
                 "temperature", "top_k", "top_p")
    beam_kwargs = {
        k: hparams.get(k)
        for k in beam_keys if k in hparams.values()
    }

    def decode_fn(features):
        return hparams.model().predict(features, hparams.max_decode_len,
                                       hparams.beam_size, **beam_kwargs)

    hparams.estimator_prediction_fn = decode_fn
    hparams.eval = functools.partial(
        text_eval.text_eval,
        hparams.encoder,
        num_reserved=pegasus_params.NUM_RESERVED_TOKENS)

    return hparams
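For context, a minimal sketch of how transformer_params might be called, based on the keys it reads from patterns and the comma-separated name=value override format described in the docstring. The dataset patterns and numeric values below are placeholders, not settings from the source:

patterns = {
    "train_pattern": "tfds_transformed:my_dataset-train",      # placeholder
    "dev_pattern": "tfds_transformed:my_dataset-validation",   # placeholder
    "test_pattern": "tfds_transformed:my_dataset-test",        # placeholder
    "batch_size": 8,
    "max_input_len": 512,
    "max_output_len": 256,
    "train_steps": 100000,
    "learning_rate": 0.001,
}
hparams = transformer_params(patterns, "beam_size=8,beam_alpha=0.8")
# As in Example #2, the configured parser is expected to yield a parse
# function plus the padded shapes used for batching:
parser_fn, shapes = hparams.parser(mode=tf.estimator.ModeKeys.PREDICT)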
Example #4
def pegasus_large_params(param_overrides):
  """Params for PegasusLarge."""

  hparams = contrib_training.HParams(
      train_pattern="tfds_transformed:common_crawl-train",
      dev_pattern="tfds_transformed:common_crawl-validation",
      test_pattern="tfds_transformed:common_crawl-test",
      vocab_filename="pegasus/ops/testdata/sp_test.model",
      encoder_type="sentencepiece_newline",
      parser_strategy="dynamic_rouge",
      parser_masked_sentence_ratio=0.45,
      parser_masked_words_ratio=0.0,
      # Configure the word masking options.
      # The sum of the three probs below (mask a word by <MSK>, by a random
      # token, or leave it intact) should be 1.
      # The defaults follow the word masking procedure of BERT: 80% by <MSK>,
      # 10% by random tokens, 10% left unchanged.
      parser_mask_word_by_msk_token_prob=0.8,
      parser_mask_word_by_random_token_prob=0.1,
      parser_mask_word_by_intact_prob=0.1,
      # Configure the sentence masking options.
      # The sum of the four probs below (mask a sentence by <MSK>, random,
      # intact, or remove) should be 1.
      # The four sentence masking options:
      #   1. Mask selected sentences with <MSK>. In practice, the <MSK> token
      #      for sentences differs from the <MSK> token for words, so that
      #      sentence masking and word masking can be distinguished.
      #   2. Replace selected sentences with other sentences randomly picked
      #      from the same document.
      #   3. Leave selected sentences unchanged.
      #   4. Remove selected sentences from the inputs.
      parser_mask_sentence_by_msk_token_prob=0.9,
      parser_mask_sentence_by_random_sentence_prob=0.,
      parser_mask_sentence_by_intact_prob=0.1,
      parser_mask_sentence_by_remove_prob=0.,
      # rouge_ngrams_size: a positive integer
      parser_rouge_ngrams_size=1,
      # rouge_metric_type: precision, recall, F
      parser_rouge_metric_type="F",
      # rouge_compute_option: standard, deduplicate, log
      #   standard: each ngram is counted as many times as it appears
      #   deduplicate: each ngram is counted only once
      #   log: apply log(1 + n) when computing the count of each ngram
      parser_rouge_compute_option="standard",
      parser_rouge_stopwords_filename="pegasus/ops/testdata/english_stopwords",
      parser_rouge_noise_ratio=0.20,
      parser_dynamic_mask_min_ratio=0.33,
      # If greater than zero, assign targets to buckets by
      # length // bucket_size; the bucket id is prepended to the inputs.
      # Bucket ids use the reserved bucket ids, starting from the start id and
      # going up to the maximum number of reserved tokens.
      length_bucket_size=0,
      add_task_id=False,
      batch_size=16,
      max_input_len=512,
      max_target_len=256,
      max_decode_len=256,
      max_total_words=0,
      pretrain_target_filter_min=0,
      hidden_size=1024,
      filter_size=4096,
      num_heads=16,
      num_encoder_layers=16,
      num_decoder_layers=16,
      optimizer_name="adafactor",
      learning_rate=0.01,
      label_smoothing=0.0,
      dropout=0.1,
      train_steps=1500000,
      beam_size=1,
      eval_max_predictions=1000,
      use_bfloat16=False,
      model=None,
      encoder=None,
      parser=None,
      estimator_prediction_fn=None,
      eval=None,
      estimator_eval_metrics_fn=estimator_metrics.pretrain_eval_metrics_fn,
  )

  if param_overrides:
    hparams.parse(param_overrides)

  # Check values
  if (hparams.parser_mask_word_by_msk_token_prob +
      hparams.parser_mask_word_by_random_token_prob +
      hparams.parser_mask_word_by_intact_prob) != 1.:
    raise ValueError("The sum of rates of the three word masking options "
                     "(MSK, random, intact) does not equal to 1.")
  if (hparams.parser_mask_sentence_by_msk_token_prob +
      hparams.parser_mask_sentence_by_random_sentence_prob +
      hparams.parser_mask_sentence_by_intact_prob +
      hparams.parser_mask_sentence_by_remove_prob) != 1.:
    raise ValueError("The sum of rates of the four sentence masking options "
                     "(MSK, random, intact, skip) does not equal to 1.")
  hparams.encoder = public_parsing_ops.create_text_encoder(
      hparams.encoder_type, hparams.vocab_filename)
  hparams.parser = functools.partial(
      parsers.string_features_for_pretraining_parser,
      hparams.vocab_filename,
      hparams.encoder_type,
      hparams.max_input_len,
      hparams.max_target_len,
      hparams.max_total_words,
      hparams.parser_strategy,
      hparams.parser_masked_sentence_ratio,
      hparams.parser_masked_words_ratio, [
          hparams.parser_mask_word_by_msk_token_prob,
          hparams.parser_mask_word_by_random_token_prob,
          hparams.parser_mask_word_by_intact_prob
      ], [
          hparams.parser_mask_sentence_by_msk_token_prob,
          hparams.parser_mask_sentence_by_random_sentence_prob,
          hparams.parser_mask_sentence_by_intact_prob,
          hparams.parser_mask_sentence_by_remove_prob
      ],
      hparams.parser_rouge_ngrams_size,
      hparams.parser_rouge_metric_type,
      hparams.parser_rouge_compute_option,
      hparams.parser_rouge_stopwords_filename,
      NUM_RESERVED_TOKENS,
      parser_rouge_noise_ratio=hparams.parser_rouge_noise_ratio,
      parser_dynamic_mask_min_ratio=hparams.parser_dynamic_mask_min_ratio,
      input_feature="inputs",
      pretrain_target_filter_min=hparams.pretrain_target_filter_min,
      length_bucket_size=hparams.length_bucket_size,
      length_bucket_start_id=LENGTH_BUCKET_START_ID,
      length_bucket_max_id=TASK_START_ID - 1,
      add_task_id=hparams.add_task_id,
      task_start_id=TASK_START_ID)
  hparams.model = functools.partial(
      transformer.TransformerEncoderDecoderModel, hparams.encoder.vocab_size,
      hparams.hidden_size, hparams.filter_size, hparams.num_heads,
      hparams.num_encoder_layers, hparams.num_decoder_layers,
      hparams.label_smoothing, hparams.dropout)

  def decode_fn(features):
    return hparams.model().predict(features, hparams.max_decode_len,
                                   hparams.beam_size)

  hparams.estimator_prediction_fn = decode_fn
  hparams.eval = functools.partial(
      text_eval.text_eval, hparams.encoder, num_reserved=NUM_RESERVED_TOKENS)
  return hparams
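pegasus_large_params takes only the override string; any override of the masking probabilities has to keep each group summing to 1, or the checks above raise ValueError. A minimal usage sketch with placeholder values:

hparams = pegasus_large_params(
    "batch_size=8,max_input_len=768,train_steps=500000")
# The returned HParams carries the constructed encoder, parser, and model:
print(hparams.encoder.vocab_size)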
Example #5
 def test_py_decode(self, encoder_type):
   text = "the quick brown fox jumps \n over the lazy dog."
   e1 = text_encoder_utils.create_text_encoder(encoder_type, _SPM_VOCAB)
   e2 = public_parsing_ops.create_text_encoder(encoder_type, _SPM_VOCAB)
   ids = e1.encode(text)
   self.assertEqual(e1.decode(ids), e2.decode(ids))
Example #6
 def test_py_encode(self, encoder_type):
   text = "the quick brown fox\n jumps over the lazy dog.\n"
   e1 = text_encoder_utils.create_text_encoder(encoder_type, _SPM_VOCAB)
   e2 = public_parsing_ops.create_text_encoder(encoder_type, _SPM_VOCAB)
   self.assertEqual(e1.encode(text), e2.encode(text))
Example #7
 def test_vocab(self, encoder_type):
   e1 = text_encoder_utils.create_text_encoder(encoder_type, _SPM_VOCAB)
   e2 = public_parsing_ops.create_text_encoder(encoder_type, _SPM_VOCAB)
   self.assertEqual(e1.vocab_size, e2.vocab_size)
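Examples #5 through #7 compare the Python-level encoder from text_encoder_utils with the one from public_parsing_ops on encode, decode, and vocab_size. A minimal round-trip sketch using only the public encoder, assuming _SPM_VOCAB is the path to a SentencePiece model file as in the tests:

encoder = public_parsing_ops.create_text_encoder("sentencepiece", _SPM_VOCAB)
ids = encoder.encode("the quick brown fox jumps over the lazy dog")
text = encoder.decode(ids)   # should reproduce the original lowercase sentence
print(encoder.vocab_size, ids, text)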