def generate_retrieval_dataset():
    """Generate retrieval test and dev dataset and returns input meta data."""
    assert (FLAGS.input_data_dir and FLAGS.retrieval_task_name)
    if FLAGS.tokenization == "WordPiece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenization == "SentencePiece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    processors = {
        "bucc": sentence_retrieval_lib.BuccProcessor,
        "tatoeba": sentence_retrieval_lib.TatoebaProcessor,
    }

    task_name = FLAGS.retrieval_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name](process_text_fn=processor_text_fn)

    return sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
        processor, FLAGS.input_data_dir, tokenizer,
        FLAGS.eval_data_output_path, FLAGS.test_data_output_path,
        FLAGS.max_seq_length)
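
# The WordPiece/SentencePiece branch above is repeated by several generators
# in this script. A minimal refactoring sketch, assuming only the flags and
# modules already used above; the helper name `_tokenizer_and_text_fn` is
# hypothetical and not part of the original code.
def _tokenizer_and_text_fn():
    """Builds (tokenizer, processor_text_fn) from the tokenization flags."""
    if FLAGS.tokenization == "WordPiece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    elif FLAGS.tokenization == "SentencePiece":
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)
    else:
        raise ValueError("Unsupported tokenization: %s" % FLAGS.tokenization)
    return tokenizer, processor_text_fn
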
def generate_tagging_dataset():
    """Generates tagging dataset."""
    processors = {
        "panx":
        functools.partial(tagging_data_lib.PanxProcessor,
                          only_use_en_train=FLAGS.tagging_only_use_en_train,
                          only_use_en_dev=FLAGS.only_use_en_dev),
        "udpos":
        functools.partial(tagging_data_lib.UdposProcessor,
                          only_use_en_train=FLAGS.tagging_only_use_en_train,
                          only_use_en_dev=FLAGS.only_use_en_dev),
    }
    task_name = FLAGS.tagging_task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    if FLAGS.tokenization == "WordPiece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    elif FLAGS.tokenization == "SentencePiece":
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)
    else:
        raise ValueError("Unsupported tokenization: %s" % FLAGS.tokenization)

    processor = processors[task_name]()
    return tagging_data_lib.generate_tf_record_from_data_file(
        processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
        FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
        FLAGS.test_data_output_path, processor_text_fn)
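
# A hedged usage sketch for generate_tagging_dataset() above, assumed to run
# alongside the definitions in this script. The flag names are the ones read
# by the function; the data paths are hypothetical.
from absl import flags

FLAGS = flags.FLAGS
FLAGS(["create_finetuning_data"])  # parse defaults; program name is arbitrary
FLAGS.tagging_task_name = "panx"
FLAGS.tokenization = "WordPiece"
FLAGS.vocab_file = "/path/to/vocab.txt"              # hypothetical path
FLAGS.do_lower_case = True
FLAGS.input_data_dir = "/path/to/panx_data"          # hypothetical path
FLAGS.max_seq_length = 128
FLAGS.train_data_output_path = "/tmp/panx_train.tf_record"
FLAGS.eval_data_output_path = "/tmp/panx_eval.tf_record"
FLAGS.test_data_output_path = "/tmp/panx_test_{}.tf_record"

input_meta_data = generate_tagging_dataset()
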
def generate_regression_dataset():
    """Generates regression dataset and returns input meta data."""
    if FLAGS.tokenization == "WordPiece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenization == "SentencePiece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    if FLAGS.tfds_params:
        processor = classifier_data_lib.TfdsProcessor(
            tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
        return classifier_data_lib.generate_tf_record_from_data_file(
            processor,
            None,
            tokenizer,
            train_data_output_path=FLAGS.train_data_output_path,
            eval_data_output_path=FLAGS.eval_data_output_path,
            test_data_output_path=FLAGS.test_data_output_path,
            max_seq_length=FLAGS.max_seq_length)
    else:
        raise ValueError(
            "No data processor found for the given regression task.")
def main(_):
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.io.gfile.glob(input_pattern))

    logging.info("*** Reading from input files ***")
    for input_file in input_files:
        logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq, rng, FLAGS.do_whole_word_mask,
        FLAGS.max_ngram_size)

    output_files = FLAGS.output_file.split(",")
    logging.info("*** Writing to output files ***")
    for output_file in output_files:
        logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq,
                                    output_files, FLAGS.gzip_compress,
                                    FLAGS.use_v2_feature_names)
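
# A hedged invocation sketch for the pretraining-data entry point above. The
# flag names mirror those read inside main(); the paths are hypothetical, and
# the script's own flag definitions must be imported for app.run to resolve
# them. Note: app.run() exits the process when main() returns.
from absl import app

app.run(main, argv=[
    "create_pretraining_data",
    "--input_file=/path/to/corpus-*.txt",
    "--output_file=/tmp/pretrain.tf_record",
    "--vocab_file=/path/to/vocab.txt",
    "--do_lower_case=true",
    "--max_seq_length=128",
    "--max_predictions_per_seq=20",
    "--dupe_factor=5",
])
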
def predict_squad(strategy, input_meta_data):
    """Makes predictions for the squad dataset."""
    bert_config = bert_configs.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    run_squad_helper.predict_squad(strategy, input_meta_data, tokenizer,
                                   bert_config, squad_lib_wp)
    def _preprocess_eval_data(self, params):
        eval_examples = self.squad_lib.read_squad_examples(
            input_file=params.input_path,
            is_training=False,
            version_2_with_negative=params.version_2_with_negative)

        temp_file_path = params.input_preprocessed_data_path or self.logging_dir
        if not temp_file_path:
            raise ValueError(
                'You must specify a temporary directory, either in '
                'params.input_preprocessed_data_path or logging_dir to '
                'store intermediate evaluation TFRecord data.')
        eval_writer = self.squad_lib.FeatureWriter(
            filename=os.path.join(temp_file_path, 'eval.tf_record'),
            is_training=False)
        eval_features = []

        def _append_feature(feature, is_padding):
            if not is_padding:
                eval_features.append(feature)
            eval_writer.process_feature(feature)

        # XLNet preprocesses SQuAD examples in a P, Q, class order whereas
        # BERT preprocesses in a class, Q, P order.
        xlnet_ordering = self.task_config.model.encoder.type == 'xlnet'
        kwargs = dict(examples=eval_examples,
                      max_seq_length=params.seq_length,
                      doc_stride=params.doc_stride,
                      max_query_length=params.query_length,
                      is_training=False,
                      output_fn=_append_feature,
                      batch_size=params.global_batch_size,
                      xlnet_format=xlnet_ordering)

        if params.tokenization == 'SentencePiece':
            # squad_lib_sp requires one more argument 'do_lower_case'.
            kwargs['do_lower_case'] = params.do_lower_case
            kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer(
                sp_model_file=params.vocab_file)
        elif params.tokenization == 'WordPiece':
            kwargs['tokenizer'] = tokenization.FullTokenizer(
                vocab_file=params.vocab_file,
                do_lower_case=params.do_lower_case)
        else:
            raise ValueError('Unexpected tokenization: %s' %
                             params.tokenization)

        eval_dataset_size = self.squad_lib.convert_examples_to_features(
            **kwargs)
        eval_writer.close()

        logging.info('***** Evaluation input stats *****')
        logging.info('  Num orig examples = %d', len(eval_examples))
        logging.info('  Num split examples = %d', len(eval_features))
        logging.info('  Batch size = %d', params.global_batch_size)
        logging.info('  Dataset size = %d', eval_dataset_size)

        return eval_writer.filename, eval_examples, eval_features
def eval_squad(strategy, input_meta_data):
    """Evaluate on the squad dataset."""
    bert_config = bert_configs.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    eval_metrics = run_squad_helper.eval_squad(strategy, input_meta_data,
                                               tokenizer, bert_config,
                                               squad_lib_wp)
    return eval_metrics
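
# A hedged sketch of wiring up eval_squad() above. The meta-data JSON path,
# the MirroredStrategy choice, and the flag values are illustrative only;
# run_squad_helper reads further flags (prediction and model settings) that
# are not shown here.
import json

import tensorflow as tf
from absl import flags

FLAGS = flags.FLAGS
FLAGS(["run_squad"])  # parse defaults; program name is arbitrary
FLAGS.bert_config_file = "/path/to/bert_config.json"   # hypothetical path
FLAGS.vocab_file = "/path/to/vocab.txt"                # hypothetical path
FLAGS.do_lower_case = True

with tf.io.gfile.GFile("/path/to/squad_meta_data.json", "rb") as reader:
    input_meta_data = json.loads(reader.read().decode("utf-8"))

strategy = tf.distribute.MirroredStrategy()
eval_metrics = eval_squad(strategy, input_meta_data)
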
  def test_generate_tf_record(self, task_type):
    processor = self.processors[task_type]()
    input_data_dir = os.path.join(self.get_temp_dir(), task_type)
    tf.io.gfile.mkdir(input_data_dir)
    # Write fake train file.
    _create_fake_file(
        os.path.join(input_data_dir, "train-en.tsv"),
        processor.get_labels(),
        is_test=False)

    # Write fake dev file.
    _create_fake_file(
        os.path.join(input_data_dir, "dev-en.tsv"),
        processor.get_labels(),
        is_test=False)

    # Write fake test files.
    for lang in processor.supported_languages:
      _create_fake_file(
          os.path.join(input_data_dir, "test-%s.tsv" % lang),
          processor.get_labels(),
          is_test=True)

    output_path = os.path.join(self.get_temp_dir(), task_type, "output")
    tokenizer = tokenization.FullTokenizer(
        vocab_file=self.vocab_file, do_lower_case=True)
    metadata = tagging_data_lib.generate_tf_record_from_data_file(
        processor,
        input_data_dir,
        tokenizer,
        max_seq_length=8,
        train_data_output_path=os.path.join(output_path, "train.tfrecord"),
        eval_data_output_path=os.path.join(output_path, "eval.tfrecord"),
        test_data_output_path=os.path.join(output_path, "test_{}.tfrecord"),
        text_preprocessing=tokenization.convert_to_unicode)

    self.assertEqual(metadata["train_data_size"], 5)
    files = tf.io.gfile.glob(output_path + "/*")
    expected_files = []
    expected_files.append(os.path.join(output_path, "train.tfrecord"))
    expected_files.append(os.path.join(output_path, "eval.tfrecord"))
    for lang in processor.supported_languages:
      expected_files.append(
          os.path.join(output_path, "test_%s.tfrecord" % lang))

    self.assertCountEqual(files, expected_files)
  def setUp(self):
    super(BertClassifierLibTest, self).setUp()
    self.model_dir = self.get_temp_dir()
    self.processors = {
        "CB": classifier_data_lib.CBProcessor,
        "SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
        "BOOLQ": classifier_data_lib.BoolQProcessor,
        "WIC": classifier_data_lib.WiCProcessor,
    }

    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
      vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                 ]).encode("utf-8"))
    vocab_file = vocab_writer.name
    self.tokenizer = tokenization.FullTokenizer(vocab_file)
def generate_tf_record_from_json_file(input_file_path,
                                      vocab_file_path,
                                      output_path,
                                      translated_input_folder=None,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
                                      version_2_with_negative=False,
                                      xlnet_format=False):
  """Generates and saves training data into a tf record file."""
  train_examples = read_squad_examples(
      input_file=input_file_path,
      is_training=True,
      version_2_with_negative=version_2_with_negative,
      translated_input_folder=translated_input_folder)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file_path, do_lower_case=do_lower_case)
  train_writer = FeatureWriter(filename=output_path, is_training=True)
  number_of_examples = convert_examples_to_features(
      examples=train_examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=True,
      output_fn=train_writer.process_feature,
      xlnet_format=xlnet_format)
  train_writer.close()

  meta_data = {
      "task_type": "bert_squad",
      "train_data_size": number_of_examples,
      "max_seq_length": max_seq_length,
      "max_query_length": max_query_length,
      "doc_stride": doc_stride,
      "version_2_with_negative": version_2_with_negative,
  }

  return meta_data
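
# A hedged usage sketch for generate_tf_record_from_json_file() above; the
# SQuAD JSON, vocabulary, and output paths are hypothetical.
meta_data = generate_tf_record_from_json_file(
    input_file_path="/path/to/train-v1.1.json",
    vocab_file_path="/path/to/vocab.txt",
    output_path="/tmp/squad_train.tf_record",
    max_seq_length=384,
    do_lower_case=True,
    max_query_length=64,
    doc_stride=128,
    version_2_with_negative=False)
print("train_data_size:", meta_data["train_data_size"])
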
  def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
      if six.PY2:
        vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
      else:
        vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                   ]).encode("utf-8"))

      vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
  def __init__(self,
               vocab: str,
               do_lower_case: bool,
               len_title: int = 15,
               len_passage: int = 200,
               max_num_articles: int = 5,
               include_article_title_in_passage: bool = False,
               include_text_snippet_in_example: bool = False):
    """Constructs a RawDataProcessor.

    Args:
      vocab: Filepath of the BERT vocabulary.
      do_lower_case: Whether the vocabulary is uncased or not.
      len_title: Maximum number of tokens in story headline.
      len_passage: Maximum number of tokens in article passage.
      max_num_articles: Maximum number of articles in a story.
      include_article_title_in_passage: Whether to include article title in
        article passage.
      include_text_snippet_in_example: Whether to include the text snippet
        (headline and article content) in the generated TensorFlow Examples,
        for debugging. If include_article_title_in_passage=True, the title and
        body are separated by [SEP].
    """
    self.articles = dict()
    self.tokenizer = tokenization.FullTokenizer(
        vocab, do_lower_case=do_lower_case, split_on_punc=False)
    self.len_title = len_title
    self.len_passage = len_passage
    self.max_num_articles = max_num_articles
    self.include_article_title_in_passage = include_article_title_in_passage
    self.include_text_snippet_in_example = include_text_snippet_in_example
    # ex_index=5 deactivates printing inside convert_single_example.
    self.ex_index = 5
    # Parameters used in InputExample, not used in NHNet.
    self.label = 0
    self.guid = 0
    self.num_generated_examples = 0
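
# A hedged construction sketch for RawDataProcessor above. The vocabulary path
# is hypothetical; the remaining arguments simply echo the defaults documented
# in the constructor.
processor = RawDataProcessor(
    vocab="/path/to/bert_vocab.txt",
    do_lower_case=True,
    len_title=15,
    len_passage=200,
    max_num_articles=5,
    include_article_title_in_passage=False,
    include_text_snippet_in_example=False)
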
def generate_classifier_dataset():
    """Generates classifier dataset and returns input meta data."""
    if FLAGS.classification_task_name in [
            "COLA",
            "WNLI",
            "SST-2",
            "MRPC",
            "QQP",
            "STS-B",
            "MNLI",
            "QNLI",
            "RTE",
            "AX",
            "SUPERGLUE-RTE",
            "CB",
            "BoolQ",
            "WIC",
    ]:
        assert not FLAGS.input_data_dir or FLAGS.tfds_params
    else:
        assert (FLAGS.input_data_dir and FLAGS.classification_task_name
                or FLAGS.tfds_params)

    if FLAGS.tokenization == "WordPiece":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        processor_text_fn = tokenization.convert_to_unicode
    else:
        assert FLAGS.tokenization == "SentencePiece"
        tokenizer = tokenization.FullSentencePieceTokenizer(
            FLAGS.sp_model_file)
        processor_text_fn = functools.partial(tokenization.preprocess_text,
                                              lower=FLAGS.do_lower_case)

    if FLAGS.tfds_params:
        processor = classifier_data_lib.TfdsProcessor(
            tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
        return classifier_data_lib.generate_tf_record_from_data_file(
            processor,
            None,
            tokenizer,
            train_data_output_path=FLAGS.train_data_output_path,
            eval_data_output_path=FLAGS.eval_data_output_path,
            test_data_output_path=FLAGS.test_data_output_path,
            max_seq_length=FLAGS.max_seq_length)
    else:
        processors = {
            "ax":
            classifier_data_lib.AxProcessor,
            "cola":
            classifier_data_lib.ColaProcessor,
            "imdb":
            classifier_data_lib.ImdbProcessor,
            "mnli":
            functools.partial(classifier_data_lib.MnliProcessor,
                              mnli_type=FLAGS.mnli_type),
            "mrpc":
            classifier_data_lib.MrpcProcessor,
            "qnli":
            classifier_data_lib.QnliProcessor,
            "qqp":
            classifier_data_lib.QqpProcessor,
            "rte":
            classifier_data_lib.RteProcessor,
            "sst-2":
            classifier_data_lib.SstProcessor,
            "sts-b":
            classifier_data_lib.StsBProcessor,
            "xnli":
            functools.partial(classifier_data_lib.XnliProcessor,
                              language=FLAGS.xnli_language),
            "paws-x":
            functools.partial(classifier_data_lib.PawsxProcessor,
                              language=FLAGS.pawsx_language),
            "wnli":
            classifier_data_lib.WnliProcessor,
            "xtreme-xnli":
            functools.partial(
                classifier_data_lib.XtremeXnliProcessor,
                translated_data_dir=FLAGS.translated_input_data_dir,
                only_use_en_dev=FLAGS.only_use_en_dev),
            "xtreme-paws-x":
            functools.partial(
                classifier_data_lib.XtremePawsxProcessor,
                translated_data_dir=FLAGS.translated_input_data_dir,
                only_use_en_dev=FLAGS.only_use_en_dev),
            "ax-g":
            classifier_data_lib.AXgProcessor,
            "superglue-rte":
            classifier_data_lib.SuperGLUERTEProcessor,
            "cb":
            classifier_data_lib.CBProcessor,
            "boolq":
            classifier_data_lib.BoolQProcessor,
            "wic":
            classifier_data_lib.WiCProcessor,
        }
        task_name = FLAGS.classification_task_name.lower()
        if task_name not in processors:
            raise ValueError("Task not found: %s" % (task_name))

        processor = processors[task_name](process_text_fn=processor_text_fn)
        return classifier_data_lib.generate_tf_record_from_data_file(
            processor,
            FLAGS.input_data_dir,
            tokenizer,
            train_data_output_path=FLAGS.train_data_output_path,
            eval_data_output_path=FLAGS.eval_data_output_path,
            test_data_output_path=FLAGS.test_data_output_path,
            max_seq_length=FLAGS.max_seq_length)
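
# A hedged usage sketch for generate_classifier_dataset() above, using the
# SentencePiece branch and the XNLI processor. The paths are hypothetical, and
# the task name must be one accepted by the script's classification_task_name
# flag definition.
from absl import flags

FLAGS = flags.FLAGS
FLAGS(["create_finetuning_data"])  # parse defaults; program name is arbitrary
FLAGS.classification_task_name = "XNLI"
FLAGS.xnli_language = "en"
FLAGS.tokenization = "SentencePiece"
FLAGS.sp_model_file = "/path/to/sentencepiece.model"   # hypothetical path
FLAGS.do_lower_case = True
FLAGS.input_data_dir = "/path/to/XNLI"                 # hypothetical path
FLAGS.max_seq_length = 128
FLAGS.train_data_output_path = "/tmp/xnli_train.tf_record"
FLAGS.eval_data_output_path = "/tmp/xnli_eval.tf_record"
FLAGS.test_data_output_path = "/tmp/xnli_test.tf_record"

input_meta_data = generate_classifier_dataset()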