Example no. 1
0
 def convert_examples_to_features(self, examples, tfrecord_file, label_names):
     """Converts `examples` to features and writes them to `tfrecord_file`.

     Ensures this object is built first (via `self.build()`, which is
     expected to populate `self.tokenizer` and `self.seq_len`), then
     delegates the conversion and TFRecord serialization to
     `classifier_data_lib.file_based_convert_examples_to_features`.

     Args:
       examples: Input examples to convert.
       tfrecord_file: Path of the TFRecord file to write.
       label_names: Label names passed through to the converter.
     """
     # Lazy one-time initialization guard.
     if not self.is_built:
         self.build()
     classifier_data_lib.file_based_convert_examples_to_features(
         examples, label_names, self.seq_len, self.tokenizer, tfrecord_file)
def _write_retrieval_split(examples, output_dir, lang_a, lang_b, split,
                           max_seq_length, tokenizer):
    """Writes one language pair's examples for `split`; returns the count."""
    num_examples = len(examples)
    logging.info("Processing %d %s examples of %s-en.%s", num_examples, split,
                 lang_a, lang_b)
    output_file = os.path.join(
        output_dir, "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, split))
    classifier_data_lib.file_based_convert_examples_to_features(
        examples, None, max_seq_length, tokenizer, output_file, None)
    return num_examples


def generate_sentence_retrevial_tf_record(processor,
                                          data_dir,
                                          tokenizer,
                                          eval_data_output_path=None,
                                          test_data_output_path=None,
                                          max_seq_length=128):
    """Generates the tf records for retrieval tasks.

    Args:
      processor: Input processor object to be used for generating data.
        Subclass of `DataProcessor`.
      data_dir: Directory that contains train/eval data to process.
      tokenizer: The tokenizer to be applied on the data.
      eval_data_output_path: Output to which processed tf record for evaluation
        will be saved.
      test_data_output_path: Output to which processed tf record for testing
        will be saved. Must be a pattern template with {} if processor has
        language specific test data.
      max_seq_length: Maximum sequence length of the to be generated
        training/eval data.

    Returns:
      A dictionary containing input meta data.

    Raises:
      ValueError: If the processor is not one of the supported retrieval
        tasks ("BUCC" or "TATOEBA").
    """
    assert eval_data_output_path or test_data_output_path

    processor_name = processor.get_processor_name()
    # Per-task file-name pattern for the raw data of each language pair. The
    # BUCC pattern keeps a spare "{}" placeholder for the split name.
    if processor_name == "BUCC":
        path_pattern = "{}-en.{{}}.{}"
    elif processor_name == "TATOEBA":
        path_pattern = "{}-en.{}"
    else:
        # Previously an unknown processor left `path_pattern` unbound and
        # crashed later with a NameError; fail fast with a clear message.
        raise ValueError(
            "Unsupported processor for retrieval tasks: {}".format(
                processor_name))

    meta_data = {
        "processor_type": processor_name,
        "max_seq_length": max_seq_length,
        "number_eval_data": {},
        "number_test_data": {},
    }
    logging.info("Start to process %s task data", processor_name)

    for lang_a in processor.languages:
        for lang_b in [lang_a, "en"]:
            pair_file = os.path.join(path_pattern.format(lang_a, lang_b))
            if eval_data_output_path:
                examples = processor.get_dev_examples(data_dir, pair_file)
                meta_data["number_eval_data"][
                    f"{lang_a}-en.{lang_b}"] = _write_retrieval_split(
                        examples, eval_data_output_path, lang_a, lang_b, "dev",
                        max_seq_length, tokenizer)

            if test_data_output_path:
                examples = processor.get_test_examples(data_dir, pair_file)
                meta_data["number_test_data"][
                    f"{lang_a}-en.{lang_b}"] = _write_retrieval_split(
                        examples, test_data_output_path, lang_a, lang_b,
                        "test", max_seq_length, tokenizer)

    return meta_data