Ejemplo n.º 1
0
    # Log the selected device, GPU count, and 16-bit (mixed precision) flag.
    print("device %s n_gpu %d" % (device, n_gpu))
    print("device: {} n_gpu: {} 16-bits training: {}".format(
        device, n_gpu, args.float16))

    # load the bert setting
    # Read the BERT hyperparameters (incl. max_position_embeddings, used
    # below as the max sequence length) from the JSON config file.
    bert_config = BertConfig.from_json_file(args.bert_config_file)

    # load data
    print('loading data...')
    tokenizer = tokenization.BertTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=True)
    # Sanity-check that the configured vocab size matches the vocabulary
    # actually loaded from disk before any features are generated.
    assert args.vocab_size == len(tokenizer.vocab)
    # One-time preprocessing: convert the raw training JSON into cached
    # files -- json2features writes both an examples file (path derived by
    # replacing '_features_' with '_examples_') and the features file.
    if not os.path.exists(args.train_dir):
        json2features(args.train_file, [
            args.train_dir.replace('_features_', '_examples_'), args.train_dir
        ],
                      tokenizer,
                      is_training=True,
                      max_seq_length=bert_config.max_position_embeddings)

    # Same one-time conversion for the dev set (examples + features caches).
    if not os.path.exists(args.dev_dir1) or not os.path.exists(args.dev_dir2):
        json2features(args.dev_file, [args.dev_dir1, args.dev_dir2],
                      tokenizer,
                      is_training=False,
                      max_seq_length=bert_config.max_position_embeddings)

    # Load the cached feature/example files produced above.
    # NOTE(review): files are opened without a context manager or explicit
    # encoding; the handles are left for the GC to close.
    train_features = json.load(open(args.train_dir, 'r'))
    dev_examples = json.load(open(args.dev_dir1, 'r'))
    dev_features = json.load(open(args.dev_dir2, 'r'))
    # Start each run with a fresh log file.
    if os.path.exists(args.log_file):
        os.remove(args.log_file)
Ejemplo n.º 2
0
    # Encode the key hyperparameters into the checkpoint directory name so
    # runs with different settings write to distinct directories.
    args.checkpoint_dir += (
        '/epoch{}_batch{}_lr{}_warmup{}_anslen{}_tf/'.format(
            args.train_epochs, args.n_batch, args.lr, args.warmup_rate,
            args.max_ans_length))
    args = utils.check_args(args, mpi_rank)
    print_rank0('######## generating data ########')

    # Only MPI rank 0 generates the feature caches; all ranks read the same
    # files below. NOTE(review): no barrier is visible between generation
    # and the json.load calls -- presumably utils.check_args or the MPI
    # launcher synchronises ranks; confirm before relying on this.
    if mpi_rank == 0:
        tokenizer = BertTokenizer(vocab_file=args.vocab_file,
                                  do_lower_case=True)
        # Guard against a stale config: vocab size must match the file.
        assert args.vocab_size == len(tokenizer.vocab)
        # One-time conversion of raw training JSON into examples + features
        # caches (examples path derived via the '_features_' substitution).
        if not os.path.exists(args.train_dir):
            json2features(args.train_file, [
                args.train_dir.replace('_features_', '_examples_'),
                args.train_dir
            ],
                          tokenizer,
                          is_training=True)

        # Same one-time conversion for the dev set.
        if not os.path.exists(args.dev_dir1) or not os.path.exists(
                args.dev_dir2):
            json2features(args.dev_file, [args.dev_dir1, args.dev_dir2],
                          tokenizer,
                          is_training=False)

    # Every rank loads the cached features/examples generated above.
    # NOTE(review): files opened without a context manager or encoding.
    train_data = json.load(open(args.train_dir, 'r'))
    dev_examples = json.load(open(args.dev_dir1, 'r'))
    dev_data = json.load(open(args.dev_dir2, 'r'))

    # Rank 0 resets the log file (the if-body is truncated in this excerpt;
    # by analogy with the sibling snippet it presumably removes the file).
    if mpi_rank == 0:
        if os.path.exists(args.log_file):
Ejemplo n.º 3
0
    # Restrict CUDA to the user-selected GPU ids and derive the GPU count
    # from the comma-separated list.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
    n_gpu = len(args.gpu_ids.split(','))

    # Encode the key hyperparameters into the checkpoint directory name
    # (same naming scheme as the training entry point).
    args.checkpoint_dir += (
        '/epoch{}_batch{}_lr{}_warmup{}_anslen{}_tf/'.format(
            args.train_epochs, args.n_batch, args.lr, args.warmup_rate,
            args.max_ans_length))
    # Single-process evaluation: act as MPI rank 0 for check_args.
    mpi_rank = 0
    args = utils.check_args(args, mpi_rank)

    tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=True)

    # One-time preprocessing: build the eval examples + features caches if
    # either file is missing.
    if not os.path.exists(args.eval_dir1) or not os.path.exists(
            args.eval_dir2):
        json2features(args.eval_file, [args.eval_dir1, args.eval_dir2],
                      tokenizer,
                      is_training=False)

    # Load cached examples/features and size the evaluation loop.
    # NOTE(review): files opened without a context manager or encoding.
    eval_examples = json.load(open(args.eval_dir1, 'r'))
    eval_data = json.load(open(args.eval_dir2, 'r'))
    eval_steps_per_epoch = len(eval_data) // (args.n_batch * n_gpu)

    # Deterministic pass over the eval set: no shuffling, and the final
    # partial batch is kept (drop_last=False).
    eval_gen = data_generator(eval_data,
                              args.n_batch * n_gpu,
                              shuffle=False,
                              drop_last=False)

    # Account for the final partial batch kept by drop_last=False above.
    if len(eval_data) % (args.n_batch * n_gpu) != 0:
        eval_steps_per_epoch += 1

    # Place graph construction on the first visible GPU (the with-body is
    # truncated in this excerpt).
    with tf.device("/gpu:0"):