Example #1
        pass
    else:
        args.init_restore_dir = glob(args.init_restore_dir + '*.pth')
        assert len(args.init_restore_dir) == 1
        args.init_restore_dir = args.init_restore_dir[0]

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    print("device %s n_gpu %d" % (device, n_gpu))
    print("device: {} n_gpu: {} 16-bits training: {}".format(
        device, n_gpu, args.float16))

    # load the bert setting
    if 'albert' not in args.bert_config_file:
        bert_config = BertConfig.from_json_file(args.bert_config_file)
    else:
        if 'google' in args.bert_config_file:
            bert_config = AlbertConfig.from_json_file(args.bert_config_file)
        else:
            bert_config = ALBertConfig.from_json_file(args.bert_config_file)

    # load data
    print('loading data...')
    tokenizer = tokenization.BertTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=True)
    assert args.vocab_size == len(tokenizer.vocab)

    if not os.path.exists(args.test_dir1) or not os.path.exists(
            args.test_dir2):
        json2features(args.test_file, [args.test_dir1, args.test_dir2],
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu_ids", default='', required=True, type=str)
    parser.add_argument("--bert_config_file", required=True,
                        default='check_points/pretrain_models/roberta_wwm_ext_large/bert_config.json')
    parser.add_argument("--vocab_file", required=True,
                        default='check_points/pretrain_models/roberta_wwm_ext_large/vocab.txt')
    parser.add_argument("--init_restore_dir", required=True,
                        default='check_points/pretrain_models/roberta_wwm_ext_large/pytorch_model.pth')
    parser.add_argument("--input_dir", required=True, default='dataset/CHID')
    parser.add_argument("--output_dir", required=True, default='check_points/CHID')

    ## Other parameters
    parser.add_argument("--train_file", default='./origin_data/CHID/train.json', type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--train_ans_file", default='./origin_data/CHID/train_answer.json', type=str,
                        help="SQuAD answer for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default='./origin_data/CHID/dev.json', type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--predict_ans_file", default='origin_data/CHID/dev_answer.json', type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--max_seq_length", default=64, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--max_num_choices", default=10, type=int,
                        help="The maximum number of cadicate answer,  shorter than this will be padded.")
    parser.add_argument("--train_batch_size", default=20, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=16, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.06, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
                             "of training.")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", default=True,
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")

    args = parser.parse_args()
    print(args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device: {} n_gpu: {}, 16-bits training: {}".format(device, n_gpu, args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    os.makedirs(args.input_dir, exist_ok=True)
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    print('ready for train dataset')

    train_example_file = os.path.join(args.input_dir, 'train_examples_{}.pkl'.format(str(args.max_seq_length)))
    train_feature_file = os.path.join(args.input_dir, 'train_features_{}.pkl'.format(str(args.max_seq_length)))

    train_features = generate_input(args.train_file, args.train_ans_file, train_example_file, train_feature_file,
                                    tokenizer, max_seq_length=args.max_seq_length,
                                    max_num_choices=args.max_num_choices,
                                    is_training=True)

    dev_example_file = os.path.join(args.input_dir, 'dev_examples_{}.pkl'.format(str(args.max_seq_length)))
    dev_feature_file = os.path.join(args.input_dir, 'dev_features_{}.pkl'.format(str(args.max_seq_length)))

    eval_features = generate_input(args.predict_file, None, dev_example_file, dev_feature_file, tokenizer,
                                   max_seq_length=args.max_seq_length, max_num_choices=args.max_num_choices,
                                   is_training=False)

    print("train features {}".format(len(train_features)))
    num_train_steps = int(
        len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    print("loaded train dataset")
    print("Num generate examples = {}".format(len(train_features)))
    print("Batch size = {}".format(args.train_batch_size))
    print("Num steps for a epoch = {}".format(num_train_steps))

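    # Pack the cached training features into tensors and build a shuffled DataLoader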
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_masks = torch.tensor([f.input_masks for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_choice_masks = torch.tensor([f.choice_masks for f in train_features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in train_features], dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_masks, all_segment_ids, all_choice_masks, all_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size,
                                  drop_last=True)

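    # Build the dev-set tensors the same way; example indices map predictions back to eval_features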
    all_example_ids = [f.example_id for f in eval_features]
    all_tags = [f.tag for f in eval_features]
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_masks = torch.tensor([f.input_masks for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_choice_masks = torch.tensor([f.choice_masks for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_masks, all_segment_ids, all_choice_masks,
                              all_example_index)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

    # Prepare model
    if 'albert' in args.bert_config_file:
        if 'google' in args.bert_config_file:
            bert_config = AlbertConfig.from_json_file(args.bert_config_file)
            model = reset_model(args, bert_config, AlbertForMultipleChoice)
        else:
            bert_config = ALBertConfig.from_json_file(args.bert_config_file)
            model = reset_model(args, bert_config, ALBertForMultipleChoice)
    else:
        bert_config = BertConfig.from_json_file(args.bert_config_file)
        model = reset_model(args, bert_config, BertForMultipleChoice)
    model = model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    optimizer = get_optimization(model,
                                 float16=args.fp16,
                                 learning_rate=args.learning_rate,
                                 total_steps=num_train_steps,
                                 schedule='warmup_linear',
                                 warmup_rate=args.warmup_proportion,
                                 weight_decay_rate=0.01,
                                 max_grad_norm=1.0,
                                 opt_pooler=True)

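    # Training loop: one pass per epoch, evaluating on the dev set and keeping the best checkpoint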
    global_step = 0
    best_acc = 0
    acc = 0
    for i in range(int(args.num_train_epochs)):
        num_step = 0
        average_loss = 0
        model.train()
        model.zero_grad()  # equivalent to optimizer.zero_grad()
        steps_per_epoch = num_train_steps // args.num_train_epochs
        with tqdm(total=int(steps_per_epoch), desc='Epoch %d' % (i + 1)) as pbar:
            for step, batch in enumerate(train_dataloader):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
                input_ids, input_masks, segment_ids, choice_masks, labels = batch
                if step == 0 and i == 0:
                    print('shape of input_ids: {}'.format(input_ids.shape))
                    print('shape of labels: {}'.format(labels.shape))
                loss = model(input_ids=input_ids,
                             token_type_ids=segment_ids,
                             attention_mask=input_masks,
                             labels=labels)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used and handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_steps,
                                                                      args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                average_loss += loss.item()
                num_step += 1

                pbar.set_postfix({'loss': '{0:1.5f}'.format(average_loss / (num_step + 1e-5))})
                pbar.update(1)

        print("***** Running predictions *****")
        print("Num split examples = {}".format(len(eval_features)))
        print("Batch size = {}".format(args.predict_batch_size))

        model.eval()
        all_results = []
        print("Start evaluating")
        for input_ids, input_masks, segment_ids, choice_masks, example_indices in tqdm(eval_dataloader,
                                                                                       desc="Evaluating",
                                                                                       disable=None):
            if len(all_results) == 0:
                print('shape of input_ids: {}'.format(input_ids.shape))
            input_ids = input_ids.to(device)
            input_masks = input_masks.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_logits = model(input_ids=input_ids,
                                     token_type_ids=segment_ids,
                                     attention_mask=input_masks,
                                     labels=None)
            for idx, example_index in enumerate(example_indices):
                logits = batch_logits[idx].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             example_id=all_example_ids[unique_id],
                                             tag=all_tags[unique_id],
                                             logit=logits))

        predict_file = 'dev_predictions.json'
        print('decoding raw results')
        tmp_predict_file = os.path.join(args.output_dir, "raw_predictions.pkl")
        output_prediction_file = os.path.join(args.output_dir, predict_file)
        results = get_final_predictions(all_results, tmp_predict_file, g=True)
        write_predictions(results, output_prediction_file)
        print('predictions saved to {}'.format(output_prediction_file))

        if args.predict_ans_file:
            acc = evaluate(args.predict_ans_file, output_prediction_file)
            print(f'{args.predict_file} prediction accuracy: {acc}')

        # Save the checkpoint from this epoch if it achieves the best dev accuracy so far
        if acc > best_acc:
            best_acc = acc
            output_model_file = os.path.join(args.output_dir, "best_checkpoint.bin")
            print('saving trained model to {}'.format(output_model_file))
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
            torch.save(model_to_save.state_dict(), output_model_file)
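
The warmup_linear schedule referenced in the fp16 branch above is not part of this excerpt; in the pytorch-pretrained-bert style optimization code such scripts are usually built on, it is a simple linear ramp followed by a linear decay, roughly (a sketch under that assumption):

def warmup_linear(x, warmup=0.002):
    # ramp the multiplier up linearly until the warmup fraction is reached,
    # then decay it linearly towards zero over the rest of training
    if x < warmup:
        return x / warmup
    return 1.0 - x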

def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path, is_albert):
    config_path = os.path.abspath(bert_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    if is_albert:
        config = ALBertConfig.from_json_file(bert_config_file)
        print("Building PyTorch model from configuration: {}".format(str(config)))
        model = ALBertForPreTraining(config)
    else:
        config = BertConfig.from_json_file(bert_config_file)
        print("Building PyTorch model from configuration: {}".format(str(config)))
        model = BertForPreTraining(config)

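    # Walk each TF variable name and copy its weights onto the matching PyTorch parameter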
    for name, array in zip(names, arrays):
        name = name.split('/')
        if name[0] == 'global_step':
            continue
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required when only using the pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name[-13:] == '_embeddings_2':
            pointer = getattr(pointer, 'weight')
            array = np.transpose(array)
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
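
For reference, a minimal invocation sketch for this converter; the paths below are hypothetical placeholders rather than files named in the excerpt:

# Hypothetical example paths; point these at your own TF checkpoint, its config,
# and the desired .pth output location.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="pretrain_models/albert_large/model.ckpt-best",
    bert_config_file="pretrain_models/albert_large/albert_config.json",
    pytorch_dump_path="pretrain_models/albert_large/pytorch_model.pth",
    is_albert=True)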