def main():
    """Prepares data.

    Reads the training examples found under ``./data``, produces TFRecord
    files in that same directory, and finally passes the maximum sequence
    length and the number of training examples to ``_modify_config_data``.
    """
    print("Loading data")

    # The TFRecord files are written into the directory holding the raw data.
    source_dir = './data'
    record_dir = source_dir
    tx.utils.maybe_create_dir(record_dir)

    tsv_processor = data_utils.TsvProcessor()
    train_examples = tsv_processor.get_train_examples(source_dir)
    num_train_data = len(train_examples)
    print('num_train_data:%d' % num_train_data)

    # The tokenizer is selected by the model name stored in the model config.
    bert_model_name = config_model.bert['pretrained_model_name']
    bert_tokenizer = tx.data.BERTTokenizer(
        pretrained_model_name=bert_model_name)

    # Produces TFRecord files
    data_utils.prepare_TFRecord_data(
        processor=tsv_processor,
        tokenizer=bert_tokenizer,
        data_dir=source_dir,
        max_seq_length=MAX_SEQ_LENGTH,
        output_dir=record_dir)

    _modify_config_data(MAX_SEQ_LENGTH, num_train_data)
def main():
    """Prepares data.

    Produces TFRecord files for the task selected by ``FLAGS.task`` and then
    passes the maximum sequence length, the number of training examples and
    the number of classes to ``_modify_config_data``.

    The task name is matched case-insensitively against the supported tasks
    (COLA, MNLI, MRPC, XNLI, SST). TFRecord files are written to
    ``FLAGS.tfrecord_output_dir``, or to the data directory when that flag
    is ``None``.

    Raises:
        KeyError: If ``FLAGS.task`` is not one of the supported tasks.
    """
    # Loads data
    tf.logging.info("Loading data")

    # Both lookup tables below are keyed by upper-case task names, so the
    # task name is normalised once and the same key is used for every
    # lookup. (Previously the membership test used the upper-cased name
    # while the lookup used the raw one, which raised KeyError for
    # e.g. "cola".)
    task_key = FLAGS.task.upper()

    # Tasks whose dataset directory is named differently from the task.
    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }
    data_dir = 'data/{}'.format(
        task_datasets_rename.get(task_key, FLAGS.task))

    if FLAGS.tfrecord_output_dir is None:
        tfrecord_output_dir = data_dir
    else:
        tfrecord_output_dir = FLAGS.tfrecord_output_dir
    tx.utils.maybe_create_dir(tfrecord_output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        'SST': data_utils.SSTProcessor,
    }
    processor = processors[task_key]()

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    tf.logging.info(
        'num_classes:%d; num_train_data:%d' % (num_classes, num_train_data))

    tokenizer = tx.data.BERTTokenizer(
        pretrained_model_name=FLAGS.pretrained_model_name)

    # Produces TFRecord files
    data_utils.prepare_TFRecord_data(
        processor=processor,
        tokenizer=tokenizer,
        data_dir=data_dir,
        max_seq_length=FLAGS.max_seq_length,
        output_dir=tfrecord_output_dir)

    _modify_config_data(FLAGS.max_seq_length, num_train_data, num_classes)
def prepare_data():
    """Produces TFRecord files from the data in ``FLAGS.data_dir``.

    The files are written to ``FLAGS.tfrecord_output_dir``; when that flag
    is ``None`` they are written into the data directory itself. The encoder
    handed to the TFRecord writer is obtained from
    ``processor.get_encoder(FLAGS.pretrain_model_dir)``.
    """
    source_dir = FLAGS.data_dir

    # Fall back to the input directory when no output directory was given.
    target_dir = FLAGS.tfrecord_output_dir
    if target_dir is None:
        target_dir = source_dir
    tx.utils.maybe_create_dir(target_dir)

    # Creates a data pre-processor for, e.g., BPE encoding
    encoder = processor.get_encoder(FLAGS.pretrain_model_dir)

    # Produces TFRecord files
    data_utils.prepare_TFRecord_data(
        data_dir=source_dir,
        max_seq_length=FLAGS.max_seq_length,
        encoder=encoder,
        output_dir=target_dir)