def main(_):
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # Expand the (possibly comma-separated) input patterns into concrete files.
  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.io.gfile.glob(input_pattern))

  logging.info("*** Reading from input files ***")
  for input_file in input_files:
    logging.info(" %s", input_file)

  # Build masked-LM pretraining instances with a fixed random seed.
  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
      FLAGS.max_predictions_per_seq, rng)

  output_files = FLAGS.output_file.split(",")
  logging.info("*** Writing to output files ***")
  for output_file in output_files:
    logging.info(" %s", output_file)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files)
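# Usage sketch for main() above: the function is flag-driven, so it is meant
# to be run as a script entry point (in BERT this lives in
# create_pretraining_data.py). The command below is illustrative only; the
# flag names mirror the FLAGS read above, while every path is a placeholder.
#
#   python create_pretraining_data.py \
#     --input_file=./sample_text.txt \
#     --output_file=./tf_examples.tfrecord \
#     --vocab_file=./vocab.txt \
#     --do_lower_case=True \
#     --max_seq_length=128 \
#     --max_predictions_per_seq=20 \
#     --masked_lm_prob=0.15 \
#     --random_seed=12345 \
#     --dupe_factor=5
#
# A typical wiring with absl, assuming the flags are defined at module level:
#
#   if __name__ == "__main__":
#     flags.mark_flag_as_required("input_file")
#     flags.mark_flag_as_required("output_file")
#     flags.mark_flag_as_required("vocab_file")
#     app.run(main)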
def gen_tf_records(self, processor, vocab_file, max_seq_length,
                   do_lower_case=True):
  tokenizer = bert_tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

  meta_data = {
      "labels": processor.get_labels(),
      "num_labels": len(processor.get_labels()),
      "max_seq_length": max_seq_length,
  }

  for set_type in processor.get_set_types():
    tf_record_path = self._get_tf_record_path(set_type)
    input_data_examples = processor.get_examples(self.data_dir, set_type)
    file_based_convert_examples_to_features(input_data_examples,
                                            processor.get_labels(),
                                            max_seq_length, tokenizer,
                                            tf_record_path)
    meta_data['{}_data_size'.format(set_type)] = len(input_data_examples)
    if set_type == 'dev':
      # The 'dev' split doubles as evaluation data.
      meta_data['eval_data_size'] = len(input_data_examples)

  # Persist the label/size metadata as JSON next to the tf record files.
  with tf.io.gfile.GFile(self._get_metadata_path(), "w") as writer:
    writer.write(json.dumps(meta_data, indent=4) + "\n")
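# Usage sketch for gen_tf_records() above. The method assumes an enclosing
# class that provides self.data_dir, self._get_tf_record_path() and
# self._get_metadata_path(), plus a DataProcessor-style object exposing
# get_labels(), get_set_types() and get_examples(). The names below
# (TfRecordBuilder, MyTaskProcessor) are hypothetical placeholders for those
# pieces, not part of this code.
#
#   builder = TfRecordBuilder(data_dir="/tmp/my_task")
#   builder.gen_tf_records(
#       processor=MyTaskProcessor(),
#       vocab_file="/tmp/uncased_L-12_H-768_A-12/vocab.txt",
#       max_seq_length=128,
#       do_lower_case=True)
#
# After the call, each set type returned by processor.get_set_types() has a
# tf record at the path from _get_tf_record_path(set_type), and the label/size
# metadata is written as JSON to _get_metadata_path().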
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      train_data_output_path=None,
                                      eval_data_output_path=None,
                                      max_seq_length=128,
                                      do_lower_case=True):
  """Generates and saves training data into a tf record file.

  Arguments:
    processor: Input processor object to be used for generating data. Subclass
      of `DataProcessor`.
    data_dir: Directory that contains train/eval data to process. Data files
      should be named "dev.tsv", "test.tsv", or "train.tsv".
    vocab_file: Text file with words to be used for training/evaluation.
    train_data_output_path: Output path to which the processed tf record for
      training will be saved (unused here; records are written into
      `data_dir`).
    eval_data_output_path: Output path to which the processed tf record for
      evaluation will be saved (unused here; records are written into
      `data_dir`).
    max_seq_length: Maximum sequence length of the training/eval data to be
      generated.
    do_lower_case: Whether to lower case input text.

  Returns:
    A dictionary containing input meta data.
  """
  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

  meta_data = {
      "processor_type": processor.get_processor_name(),
      "num_labels": len(processor.get_labels()),
      "max_seq_length": max_seq_length,
  }

  for set_type in ['train', 'dev', 'test']:
    data_output_path = os.path.join(data_dir,
                                    '{}.tf_record'.format(set_type))
    input_data_examples = processor.get_examples(data_dir, set_type)
    file_based_convert_examples_to_features(input_data_examples,
                                            processor.get_labels(),
                                            max_seq_length, tokenizer,
                                            data_output_path)
    meta_data['{}_data_size'.format(set_type)] = len(input_data_examples)
    if set_type == 'dev':
      # The 'dev' split doubles as evaluation data.
      meta_data['eval_data_size'] = len(input_data_examples)

  return meta_data
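# Usage sketch for generate_tf_record_from_data_file() above. The processor is
# assumed to be a DataProcessor subclass with get_processor_name(),
# get_labels() and get_examples(); "MyTaskProcessor" and the paths are
# hypothetical placeholders.
#
#   meta_data = generate_tf_record_from_data_file(
#       processor=MyTaskProcessor(),
#       data_dir="/tmp/my_task",
#       vocab_file="/tmp/uncased_L-12_H-768_A-12/vocab.txt",
#       max_seq_length=128,
#       do_lower_case=True)
#
#   # train.tf_record, dev.tf_record and test.tf_record now sit in data_dir;
#   # the returned meta_data can be dumped alongside them, mirroring the JSON
#   # write in gen_tf_records() above:
#   with tf.io.gfile.GFile("/tmp/my_task/meta_data.json", "w") as writer:
#     writer.write(json.dumps(meta_data, indent=4) + "\n")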
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      token_prob,
                                      index,
                                      max_seq_length=128,
                                      do_lower_case=True):
  """Generates and saves unsupervised (augmented) tf record data."""
  label_list = processor.get_labels()
  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

  assert data_dir
  train_input_data_examples = processor.get_examples(data_dir, 'train')
  data_stats_dir = os.path.join(data_dir, "data_stats")
  unsup_out_dir = os.path.join(data_dir, "unsup",
                               "tf_idf-{}".format(token_prob), str(index))
  _proc_and_save_unsup_data(train_input_data_examples, label_list,
                            data_stats_dir, unsup_out_dir, tokenizer,
                            max_seq_length, token_prob)
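# Usage sketch for the unsupervised variant above. Judging from the output
# path ("unsup/tf_idf-<token_prob>/<index>"), this generates TF-IDF style
# augmented data from the 'train' split, with token_prob as the replacement
# probability and index distinguishing independent augmentation runs. The
# _proc_and_save_unsup_data helper and the "data_stats" directory it reads are
# assumed to exist already; "MyTaskProcessor" is a hypothetical placeholder.
#
#   generate_tf_record_from_data_file(
#       processor=MyTaskProcessor(),
#       data_dir="/tmp/my_task",
#       vocab_file="/tmp/uncased_L-12_H-768_A-12/vocab.txt",
#       token_prob=0.7,
#       index=0,
#       max_seq_length=128)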