Example #1
def main(_):
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.io.gfile.glob(input_pattern))

  logging.info("*** Reading from input files ***")
  for input_file in input_files:
    logging.info("  %s", input_file)

  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
      rng)

  output_files = FLAGS.output_file.split(",")
  logging.info("*** Writing to output files ***")
  for output_file in output_files:
    logging.info("  %s", output_file)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files)
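A minimal sketch of the flag definitions this `main` appears to assume (the flag names come from the snippet above; absl is assumed as the flag library, and the defaults and help strings are only illustrative):

from absl import app
from absl import flags
from absl import logging

# Names mirror the FLAGS attributes referenced in main(); defaults are illustrative.
flags.DEFINE_string("input_file", None, "Comma-separated list of input file glob patterns.")
flags.DEFINE_string("output_file", None, "Comma-separated list of output TFRecord paths.")
flags.DEFINE_string("vocab_file", None, "Vocabulary file used by FullTokenizer.")
flags.DEFINE_bool("do_lower_case", True, "Whether to lower-case the input text.")
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
flags.DEFINE_integer("max_predictions_per_seq", 20, "Maximum masked LM predictions per sequence.")
flags.DEFINE_integer("dupe_factor", 10, "How many times to duplicate the input with different masks.")
flags.DEFINE_float("masked_lm_prob", 0.15, "Fraction of tokens to mask.")
flags.DEFINE_float("short_seq_prob", 0.1, "Probability of creating shorter-than-maximum sequences.")
flags.DEFINE_integer("random_seed", 12345, "Seed for the data-generation RNG.")

FLAGS = flags.FLAGS

if __name__ == "__main__":
  flags.mark_flag_as_required("input_file")
  flags.mark_flag_as_required("output_file")
  flags.mark_flag_as_required("vocab_file")
  app.run(main)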
Example #2
    def gen_tf_records(self,
                       processor,
                       vocab_file,
                       max_seq_length,
                       do_lower_case=True):
        """Converts each data split to tf records and writes the meta data."""
        tokenizer = bert_tokenization.FullTokenizer(
            vocab_file=vocab_file, do_lower_case=do_lower_case)

        meta_data = {
            "labels": processor.get_labels(),
            "num_labels": len(processor.get_labels()),
            "max_seq_length": max_seq_length,
        }
        for set_type in processor.get_set_types():
            tf_record_path = self._get_tf_record_path(set_type)
            input_data_examples = processor.get_examples(
                self.data_dir, set_type)
            file_based_convert_examples_to_features(input_data_examples,
                                                    processor.get_labels(),
                                                    max_seq_length, tokenizer,
                                                    tf_record_path)
            meta_data['{}_data_size'.format(set_type)] = len(
                input_data_examples)
            if set_type == 'dev':
                meta_data['eval_data_size'] = len(input_data_examples)

        with tf.io.gfile.GFile(self._get_metadata_path(), "w") as writer:
            writer.write(json.dumps(meta_data, indent=4) + "\n")
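A hypothetical call site for `gen_tf_records`; the `TfRecordBuilder` and `MrpcProcessor` names and the paths are illustrative stand-ins for whatever class defines `self.data_dir`, `_get_tf_record_path`, and `_get_metadata_path`:

# Illustrative only: assumes a builder object exposing data_dir plus the
# _get_tf_record_path/_get_metadata_path helpers used above, and a
# DataProcessor-style object with get_labels/get_set_types/get_examples.
builder = TfRecordBuilder(data_dir="/tmp/glue/MRPC")
builder.gen_tf_records(
    processor=MrpcProcessor(),
    vocab_file="/tmp/bert/vocab.txt",
    max_seq_length=128,
    do_lower_case=True)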
Example #3
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      train_data_output_path=None,
                                      eval_data_output_path=None,
                                      max_seq_length=128,
                                      do_lower_case=True):
  """Generates and saves training data into a tf record file.

  Arguments:
      processor: Input processor object to be used for generating data. Subclass
        of `DataProcessor`.
      data_dir: Directory that contains train/eval data to process. Data files
        should be named "dev.tsv", "test.tsv", or "train.tsv".
      vocab_file: Text file with words to be used for training/evaluation.
      train_data_output_path: Output to which processed tf record for training
        will be saved.
      eval_data_output_path: Output to which processed tf record for evaluation
        will be saved.
      max_seq_length: Maximum sequence length of the generated training/eval
        data.
      do_lower_case: Whether to lower case input text.

  Returns:
      A dictionary containing input meta data.
  """
  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

  meta_data = {
      "processor_type": processor.get_processor_name(),
      "num_labels": len(processor.get_labels()),
      "max_seq_length": max_seq_length,
  }
  for set_type in ['train', 'dev', 'test']:
    data_output_path = os.path.join(data_dir, '{}.tf_record'.format(set_type))
    input_data_examples = processor.get_examples(data_dir, set_type)
    file_based_convert_examples_to_features(input_data_examples,
                                            processor.get_labels(),
                                            max_seq_length, tokenizer,
                                            data_output_path)
    meta_data['{}_data_size'.format(set_type)] = len(input_data_examples)
    if set_type == 'dev':
      meta_data['eval_data_size'] = len(input_data_examples)
  return meta_data
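A short usage sketch for the function above, assuming `tf`, `json`, and `os` are imported as in the surrounding examples; `ColaProcessor` and the paths are illustrative:

input_meta_data = generate_tf_record_from_data_file(
    processor=ColaProcessor(),          # any DataProcessor subclass
    data_dir="/tmp/glue/CoLA",
    vocab_file="/tmp/bert/vocab.txt",
    max_seq_length=128,
    do_lower_case=True)

# Persist the returned meta data next to the generated train/dev/test tf records.
with tf.io.gfile.GFile(os.path.join("/tmp/glue/CoLA", "input_meta_data"), "w") as writer:
  writer.write(json.dumps(input_meta_data, indent=4) + "\n")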
Example #4
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      token_prob,
                                      index,
                                      max_seq_length=128,
                                      do_lower_case=True):
    """Processes the train split and saves unsupervised (tf_idf) data under data_dir."""
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    assert data_dir
    train_input_data_examples = processor.get_examples(data_dir, 'train')

    data_stats_dir = os.path.join(data_dir, "data_stats")
    unsup_out_dir = os.path.join(data_dir, "unsup",
                                 "tf_idf-{}".format(token_prob), str(index))
    _proc_and_save_unsup_data(train_input_data_examples, label_list,
                              data_stats_dir, unsup_out_dir, tokenizer,
                              max_seq_length, token_prob)
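A hypothetical invocation of this variant; `ImdbProcessor`, the paths, and the parameter values are illustrative. `token_prob` ends up in the "tf_idf-{token_prob}" output directory name and `index` names the run subfolder:

# Illustrative values only.
generate_tf_record_from_data_file(
    processor=ImdbProcessor(),
    data_dir="/tmp/imdb",
    vocab_file="/tmp/bert/vocab.txt",
    token_prob=0.7,
    index=0,
    max_seq_length=128,
    do_lower_case=True)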