Beispiel #1
0
def main():
    """Converts the dataset under ``./data`` into TFRecord files.

    The TFRecord files are written to the same directory the input data is
    read from. Once the files are produced, ``_modify_config_data`` is called
    with the maximum sequence length and the number of training examples.
    """
    print("Loading data")

    # Input data and generated TFRecord files share a single directory.
    data_dir = './data'
    output_dir = data_dir
    tx.utils.maybe_create_dir(output_dir)

    tsv_processor = data_utils.TsvProcessor()

    # Only the number of training examples is needed here, so the example
    # list itself is not kept around.
    num_train_data = len(tsv_processor.get_train_examples(data_dir))
    print('num_train_data:%d' % num_train_data)

    model_name = config_model.bert['pretrained_model_name']
    bert_tokenizer = tx.data.BERTTokenizer(pretrained_model_name=model_name)

    # Write the TFRecord files.
    data_utils.prepare_TFRecord_data(
        processor=tsv_processor,
        tokenizer=bert_tokenizer,
        data_dir=data_dir,
        max_seq_length=MAX_SEQ_LENGTH,
        output_dir=output_dir)

    _modify_config_data(MAX_SEQ_LENGTH, num_train_data)
Beispiel #2
0
def main():
    """Prepares TFRecord data for the task selected by ``FLAGS.task``.

    The task name is matched case-insensitively against the supported tasks
    (``COLA``, ``MNLI``, ``MRPC``, ``XNLI``, ``SST``). Input data is read from
    ``data/<dataset>``, where ``<dataset>`` is the renamed dataset directory
    for ``COLA``/``SST`` and ``FLAGS.task`` exactly as given otherwise.
    TFRecord files are written to ``FLAGS.tfrecord_output_dir``, or to the
    input directory when that flag is ``None``. Finally
    ``_modify_config_data`` is called with the maximum sequence length, the
    number of training examples and the number of classes.

    Raises:
        KeyError: If ``FLAGS.task`` is not one of the supported tasks.
    """
    # Loads data
    tf.logging.info("Loading data")

    # Normalize once so that both lookup tables below are queried with the
    # same key. Previously the membership test used the upper-cased name but
    # the lookups used the raw flag value, so e.g. "cola" raised KeyError.
    task = FLAGS.task.upper()

    # Tasks whose dataset directory name differs from the task name.
    task_datasets_rename = {
        "COLA": "CoLA",
        "SST": "SST-2",
    }
    data_dir = 'data/{}'.format(task_datasets_rename.get(task, FLAGS.task))

    if FLAGS.tfrecord_output_dir is None:
        tfrecord_output_dir = data_dir
    else:
        tfrecord_output_dir = FLAGS.tfrecord_output_dir
    tx.utils.maybe_create_dir(tfrecord_output_dir)

    processors = {
        "COLA": data_utils.ColaProcessor,
        "MNLI": data_utils.MnliProcessor,
        "MRPC": data_utils.MrpcProcessor,
        "XNLI": data_utils.XnliProcessor,
        "SST": data_utils.SSTProcessor,
    }
    processor = processors[task]()

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(data_dir))
    tf.logging.info('num_classes:%d; num_train_data:%d',
                    num_classes, num_train_data)

    tokenizer = tx.data.BERTTokenizer(
        pretrained_model_name=FLAGS.pretrained_model_name)

    # Produces TFRecord files
    data_utils.prepare_TFRecord_data(processor=processor,
                                     tokenizer=tokenizer,
                                     data_dir=data_dir,
                                     max_seq_length=FLAGS.max_seq_length,
                                     output_dir=tfrecord_output_dir)

    _modify_config_data(FLAGS.max_seq_length, num_train_data, num_classes)
def prepare_data():
    """Encodes the data under ``FLAGS.data_dir`` into TFRecord files.

    The files are written to ``FLAGS.tfrecord_output_dir``; when that flag is
    ``None`` they are written next to the input data instead.
    """
    data_dir = FLAGS.data_dir

    # Fall back to the input directory when no output directory was given.
    output_dir = FLAGS.tfrecord_output_dir
    if output_dir is None:
        output_dir = data_dir
    tx.utils.maybe_create_dir(output_dir)

    # Encoder used to pre-process the text (e.g., BPE encoding).
    encoder = processor.get_encoder(FLAGS.pretrain_model_dir)

    # Write the TFRecord files.
    data_utils.prepare_TFRecord_data(
        data_dir=data_dir,
        max_seq_length=FLAGS.max_seq_length,
        encoder=encoder,
        output_dir=output_dir)