Example #1
def train(train_iter, val_iter, model):
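    # AdamW paired with a schedule that warms the LR up linearly over warmup_steps,
    # then decays it linearly to zero by t_total updates.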
    opt = AdamW(model.parameters(), lr=1e-4, eps=1e-8)
    scheduler = WarmupLinearSchedule(opt, warmup_steps=20, t_total=2500)
    model.train()
    losses = []
    for i, ex in enumerate(train_iter):
        opt.zero_grad()
        words, mapper, _ = ex.word
        label, lengths = ex.head
        batch, _ = label.shape

        # Model
        final = model(words.cuda(), mapper)
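        # Zero the arc scores that involve positions past each sentence's true length.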
        for b in range(batch):
            final[b, lengths[b]-1:, :] = 0
            final[b, :, lengths[b]-1:] = 0

        if not lengths.max() <= final.shape[1] + 1:
            print("fail")
            continue
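        # Dependency-tree CRF over the masked arc scores; training maximizes the
        # log-probability of the gold tree.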
        dist = DependencyCRF(final, lengths=lengths)

        labels = dist.struct.to_parts(label, lengths=lengths).type_as(final)
        log_prob = dist.log_prob(labels)

        loss = log_prob.sum()
        (-loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        opt.step()
        scheduler.step()
        losses.append(loss.detach())
        if i % 50 == 1:
            print(-torch.tensor(losses).mean(), words.shape)
            losses = []
        if i % 600 == 500:
            validate(val_iter)
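
A note on the optimizer/schedule pairing used throughout these examples: WarmupLinearSchedule is the older pytorch_transformers API; recent transformers releases expose the same behavior as get_linear_schedule_with_warmup. Below is a minimal sketch of an equivalent setup, not taken from any of these projects, assuming a current transformers install and torch.optim.AdamW (the helper name make_optimizer is just for illustration):

import torch
from transformers import get_linear_schedule_with_warmup

def make_optimizer(model, lr=1e-4, warmup_steps=20, total_steps=2500):
    # AdamW plus a linear warmup followed by linear decay to zero.
    opt = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8)
    sched = get_linear_schedule_with_warmup(
        opt, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    return opt, sched

As in the examples on this page, call opt.step() and then sched.step() once per optimization step.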
Example #2
    def train(self):
        '''
        Training starts here.
        '''
        optimizer = AdamW(
            [p for p in self.model.parameters() if p.requires_grad], lr=1e-5)
        for round_num in range(0, self.ROUND):
            # Validate once per round.
            print(f'**** round {round_num}: running validation')
            self._eval(self.valid_loader)
            # Train for one pass over the training data.
            for step, batch in enumerate(self.train_loader):
                self.model.train()
                deep_apply_dict(batch, lambda _, v: v.to(self.DEVICE))
                y = batch.pop('y').view(-1)
                res = self.model.forward(**batch)
                res = res.view(-1, res.size(-1))
                loss = F.cross_entropy(res, y)
                print(
                    f'[round: {round_num}]: {step}/{len(self.train_loader)} end. loss: {loss}'
                )
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
Example #3
def main():
    parser = argparse.ArgumentParser(description='openGPT-2 analysis')

    parser.add_argument(
        '--mode',
        choices=['train', 'eval-singletoken', 'eval-completion', 'eval-both'],
        default='eval-singletoken')
    parser.add_argument('--eval-split', choices=['train', 'valid', 'test'])
    parser.add_argument('--model-name',
                        choices=['gpt2', 'gpt2-medium', 'gpt2-large'],
                        default='gpt2-medium')
    parser.add_argument('--model-load-dir', type=str, default=None)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--data-base', type=str)
    parser.add_argument('--num-train-epochs', type=int, default=1)
    parser.add_argument('--batch-size-singletoken', type=int, default=1024)
    parser.add_argument('--batch-size-completion', type=int, default=300)
    parser.add_argument(
        "--output-dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    # eval-completion
    parser.add_argument('--prefix-length', type=int, default=50)
    parser.add_argument('--continuation-length', type=int, default=100)
    parser.add_argument('--top-k', type=int, default=1)
    parser.add_argument('--top-p', type=float, default=0.0)

    # custom training
    parser.add_argument('--sequence-tune-rate', type=float, default=0.5)
    parser.add_argument('--train-batch-size', type=int, default=300)
    parser.add_argument('--report-metrics-every', type=int, default=10)
    parser.add_argument('--save-every', type=int, default=1000)
    parser.add_argument('--sequence-ngram-n', type=int, default=4)
    parser.add_argument('--train-n-steps', type=int, default=10000)
    parser.add_argument('--validate-every', type=int, default=10000)

    # training loop
    parser.add_argument("--adam-epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max-grad-norm', type=int, default=1)
    parser.add_argument("--max-steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                            steps to perform. Override num_train_epochs.")
    parser.add_argument('--gradient-accumulation-steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                            performing a backward/update pass.")
    parser.add_argument('--learning-rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup-steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr-schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight-decay', type=float, default=0.01)
    parser.add_argument('--lm-coef', type=float, default=0.9)

    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    dataset_paths = {
        'train': os.path.join(args.data_base, 'train_tokens_bpe_gpt2.pt'),
        'valid': os.path.join(args.data_base, 'valid_tokens_bpe_gpt2.pt'),
        'test': os.path.join(args.data_base, 'test_tokens_bpe_gpt2.pt'),
    }

    if args.model_load_dir:
        model = GPT2LMHeadModel.from_pretrained(args.model_load_dir)
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
    model.to(device)

    if args.mode == 'eval-singletoken' or args.mode == 'eval-both':
        eval_singletoken(model, args, dataset_paths)

    if args.mode == 'eval-completion' or args.mode == 'eval-both':
        datasets = get_datasets(dataset_paths,
                                max_len=args.batch_size_completion)
        eval_sampler = SequentialSampler(datasets[args.eval_split])
        eval_dataloader = DataLoader(datasets[args.eval_split],
                                     sampler=eval_sampler,
                                     batch_size=1)

        model.eval()

        with torch.no_grad():
            all_text_completions = []

            bpe_ngram_metrics = Metrics(pad=-1)
            word_ngram_metrics = Metrics(pad=-1)

            for i, batch in tqdm(enumerate(eval_dataloader),
                                 desc="Evaluating",
                                 total=len(eval_dataloader)):
                input_sequence = batch[0].cuda()
                if input_sequence.size(1) < args.prefix_length:
                    continue

                # Predict the completions.
                batch = batch_input_sequence_by_prefix_length(
                    input_sequence, args.prefix_length)
                bpe_completions, _ = sample_sequence(model, batch,
                                                     args.prefix_length,
                                                     args.continuation_length,
                                                     args.top_k, args.top_p)
                bpe_completions = bpe_completions.tolist()

                # Extract continuations from the predicted completions.
                bpe_continuations = []
                text_continuations = []
                for bpe_completion in bpe_completions:
                    bpe_continuations.append(
                        bpe_completion[args.prefix_length:])
                    text_continuations.append(
                        get_text_continuation(bpe_completion, tokenizer, args))
                    all_text_completions.append(
                        tokenizer.decode(bpe_completion))

                # Only keep continuations with at least one 4-gram
                # (A continuation can come out short after the predicted whitespace is
                #  re-tokenized, even though it has normal length in BPE tokens.)
                text_continuations = [
                    c for c in text_continuations if len(c) > 3
                ]

                # Update metrics with this batch of continuations.
                bpe_ngram_metrics.update(bpe_continuations)
                word_ngram_metrics.update(text_continuations)

                # Save the (possibly intermediate) metrics.
                save_completion_metrics(bpe_metrics=bpe_ngram_metrics.report(
                    'bpe_%s' % args.eval_split),
                                        word_metrics=word_ngram_metrics.report(
                                            'word_%s' % args.eval_split),
                                        text_completions=all_text_completions,
                                        config=model.config.to_dict(),
                                        args=args)

    if args.mode == 'train':
        if not os.path.exists(os.path.join(args.output_dir, 'best')):
            os.makedirs(os.path.join(args.output_dir, 'best'))

        token_loss = mle_loss
        datasets = get_datasets(dataset_paths, max_len=args.train_batch_size)
        train_sampler = RandomSampler(datasets['train'])
        train_seq_dataloader = DataLoader(datasets['train'],
                                          sampler=train_sampler,
                                          batch_size=1)

        # Setup optimizer
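        # t_total: the total number of optimizer updates the linear warmup schedule decays over.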
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(
                train_seq_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(
                train_seq_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
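        # Apply weight decay to every parameter except biases and LayerNorm weights.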
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

        total_steps = 0
        best_ppl = 1e20
        for _ in trange(args.num_train_epochs, desc="Epoch"):
            logging_outputs = []
            epoch_loss = 0
            epoch_steps = 0
            tqdm_bar = tqdm(train_seq_dataloader,
                            desc="Training",
                            total=args.train_n_steps)
            for step, batch in enumerate(tqdm_bar):
                optimizer.zero_grad()

                # Sequence loss
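                # With probability sequence_tune_rate, train on the sequence-level
                # unlikelihood objective (ul_seq); otherwise use the token-level loss.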
                if torch.rand(1).item() < args.sequence_tune_rate:
                    if batch[0].size(1) < args.prefix_length:
                        continue
                    loss, batch_metrics = ul_seq(model, batch, args)

                # Token loss
                else:
                    loss, batch_metrics = token_loss(model, batch, args)

                loss.backward()
                optimizer.step()
                scheduler.step()
                epoch_loss += loss.item()
                epoch_steps += 1
                total_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    epoch_loss / epoch_steps,
                    scheduler.get_lr()[0])

                logging_outputs.append(batch_metrics)

                if epoch_steps % args.report_metrics_every == 0:
                    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(
                        logging_outputs)
                    temp = SequencePenaltyCriterion.aggregate_logging_outputs(
                        logging_outputs)
                    for k, v in temp.items():
                        logging_average[k] = v
                    logging_average['ppl'] = 2**logging_average['loss']
                    print(logging_average)
                    logging_outputs = []

                if step == args.train_n_steps:
                    break

                if epoch_steps % args.save_every == 0:
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir,
                                                      CONFIG_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)

                if total_steps % args.validate_every == 0:
                    print("Validating...")
                    validation_outputs = eval_singletoken(
                        model, args, dataset_paths, train_iter=total_steps)
                    if validation_outputs['ppl'] < best_ppl:
                        best_ppl = validation_outputs['ppl']
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            args.output_dir, 'best', WEIGHTS_NAME)
                        output_config_file = os.path.join(
                            args.output_dir, 'best', CONFIG_NAME)
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(
                            os.path.join(args.output_dir, 'best'))
                        save_singletoken_metrics(validation_outputs,
                                                 model.config.to_dict(),
                                                 args,
                                                 train_iter=total_steps,
                                                 best=True)
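
The bias/LayerNorm weight-decay split used above reappears nearly verbatim in several of the later examples. A small helper like the following (a sketch only — grouped_parameters is a name invented here, not from any of these projects) captures the pattern once:

def grouped_parameters(model, weight_decay=0.01,
                       no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight')):
    # Split parameters into a decayed group and an undecayed group for AdamW.
    decayed = [p for n, p in model.named_parameters()
               if not any(nd in n for nd in no_decay)]
    undecayed = [p for n, p in model.named_parameters()
                 if any(nd in n for nd in no_decay)]
    return [{'params': decayed, 'weight_decay': weight_decay},
            {'params': undecayed, 'weight_decay': 0.0}]

AdamW(grouped_parameters(model), lr=args.learning_rate, eps=args.adam_epsilon) then behaves the same as the grouped setup above.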
Example #4
    def train(self,
              model,
              model_name,
              B, N_for_train, N_for_eval, K, Q,
              na_rate=0,
              learning_rate=1e-1,
              lr_step_size=20000,
              weight_decay=1e-5,
              train_iter=30000,
              val_iter=1000,
              val_step=2000,
              test_iter=3000,
              load_ckpt=None,
              save_ckpt=None,
              pytorch_optim=optim.SGD,
              bert_optim=False,
              warmup=True,
              warmup_step=300,
              grad_iter=1,
              fp16=False,
              pair=False,
              adv_dis_lr=1e-1,
              adv_enc_lr=1e-1):
        '''
        model: a FewShotREModel instance
        model_name: Name of the model
        B: Batch size
        N: Num of classes for each batch
        K: Num of instances for each class in the support set
        Q: Num of instances for each class in the query set
        load_ckpt / save_ckpt: Checkpoint paths to load from / save to
        learning_rate: Initial learning rate
        lr_step_size: Decay learning rate every lr_step_size steps
        weight_decay: Rate of decaying weight
        train_iter: Num of iterations of training
        val_iter: Num of iterations of validating
        val_step: Validate every val_step steps
        test_iter: Num of iterations of testing
        '''
        print("Start training...")
    
        # Init
        if bert_optim:
            print('Use bert optim!')
            parameters_to_optimize = list(model.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            parameters_to_optimize = [
                {'params': [p for n, p in parameters_to_optimize 
                    if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                {'params': [p for n, p in parameters_to_optimize
                    if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
                ]
            optimizer = AdamW(parameters_to_optimize, lr=2e-5, correct_bias=False)
            if self.adv:
                optimizer_encoder = AdamW(parameters_to_optimize, lr=1e-5, correct_bias=False)
            scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=train_iter) 
        else:
            optimizer = pytorch_optim(model.parameters(),
                    learning_rate, weight_decay=weight_decay)
            if self.adv:
                optimizer_encoder = pytorch_optim(model.parameters(), lr=adv_enc_lr)
            scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=lr_step_size)

        if self.adv:
            optimizer_dis = pytorch_optim(self.d.parameters(), lr=adv_dis_lr)

        start_iter = 0
        if load_ckpt:
            state_dict = self.__load_model__(load_ckpt)['state_dict']
            own_state = model.state_dict()
            for name, param in state_dict.items():
                if name not in own_state:
                    continue
                own_state[name].copy_(param)

        if fp16:
            from apex import amp
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

        model.train()
        if self.adv:
            self.d.train()

        # Training
        best_acc = 0
        not_best_count = 0 # Stop training after several epochs without improvement.
        iter_loss = 0.0
        iter_loss_dis = 0.0
        iter_right = 0.0
        iter_right_dis = 0.0
        iter_sample = 0.0
        for it in range(start_iter, start_iter + train_iter):
            if pair:
                batch, label = next(self.train_data_loader)
                if torch.cuda.is_available():
                    for k in batch:
                        batch[k] = batch[k].cuda()
                    label = label.cuda()
                logits, pred = model(batch, N_for_train, K, 
                        Q * N_for_train + na_rate * Q)
            else:
                support, query, label = next(self.train_data_loader)
                if torch.cuda.is_available():
                    for k in support:
                        support[k] = support[k].cuda()
                    for k in query:
                        query[k] = query[k].cuda()
                    label = label.cuda()

                logits, pred  = model(support, query, 
                        N_for_train, K, Q * N_for_train + na_rate * Q)
            loss = model.loss(logits, label) / float(grad_iter)
            right = model.accuracy(pred, label)
            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 10)
            else:
                loss.backward()
                # torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
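            # Gradient accumulation: the optimizer steps once every grad_iter iterations.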
            
            if it % grad_iter == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            
            # Adv part
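            # Adversarial step: the discriminator learns to separate original from
            # adversarial support features, then the encoder is updated to fool it.
            # Note: it reads `support`, so this path assumes pair=False.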
            if self.adv:
                support_adv = next(self.adv_data_loader)
                if torch.cuda.is_available():
                    for k in support_adv:
                        support_adv[k] = support_adv[k].cuda()

                features_ori = model.sentence_encoder(support)
                features_adv = model.sentence_encoder(support_adv)
                features = torch.cat([features_ori, features_adv], 0) 
                total = features.size(0)
                dis_labels = torch.cat([torch.zeros((total//2)).long().cuda(),
                    torch.ones((total//2)).long().cuda()], 0)
                dis_logits = self.d(features)
                loss_dis = self.adv_cost(dis_logits, dis_labels)
                _, pred = dis_logits.max(-1)
                right_dis = float((pred == dis_labels).long().sum()) / float(total)
                
                loss_dis.backward(retain_graph=True)
                optimizer_dis.step()
                optimizer_dis.zero_grad()
                optimizer_encoder.zero_grad()

                loss_encoder = self.adv_cost(dis_logits, 1 - dis_labels)
    
                loss_encoder.backward(retain_graph=True)
                optimizer_encoder.step()
                optimizer_dis.zero_grad()
                optimizer_encoder.zero_grad()

                iter_loss_dis += self.item(loss_dis.data)
                iter_right_dis += right_dis

            iter_loss += self.item(loss.data)
            iter_right += self.item(right.data)
            iter_sample += 1
            if self.adv:
                sys.stdout.write('step: {0:4} | loss: {1:2.6f}, accuracy: {2:3.2f}%, dis_loss: {3:2.6f}, dis_acc: {4:2.6f}'
                    .format(it + 1, iter_loss / iter_sample, 
                        100 * iter_right / iter_sample,
                        iter_loss_dis / iter_sample,
                        100 * iter_right_dis / iter_sample) +'\r')
            else:
                sys.stdout.write('step: {0:4} | loss: {1:2.6f}, accuracy: {2:3.2f}%'.format(it + 1, iter_loss / iter_sample, 100 * iter_right / iter_sample) +'\r')
            sys.stdout.flush()

            if (it + 1) % val_step == 0:
                acc = self.eval(model, B, N_for_eval, K, Q, val_iter, 
                        na_rate=na_rate, pair=pair)
                model.train()
                if acc > best_acc:
                    print('Best checkpoint')
                    torch.save({'state_dict': model.state_dict()}, save_ckpt)
                    best_acc = acc
                iter_loss = 0.
                iter_loss_dis = 0.
                iter_right = 0.
                iter_right_dis = 0.
                iter_sample = 0.
                
        print("\n####################\n")
        print("Finish training " + model_name)
Example #5
class Trainer(object):
    def __init__(self, args):
        self.args = args
        self.train_batch_num = args['train_batch']
        self.Dataloader = Dataloader(args)
        print("preparing the train_data")
        self.train_data = self.Dataloader.load_train_batches()
        print("preparing the val_data")

        print("train data len:", len(self.train_data) * self.train_batch_num)
        self.cuda_gpu = (torch.cuda.is_available() and args['use_gpu'])

        print("build modeling:")
        self.global_model = Global_Model(args)

        if (self.cuda_gpu):
            # torch.nn.DataParallel (self.global_model, device_ids=gpus).cuda ()
            self.global_model = self.global_model.cuda()

        self.global_optimer = AdamW(self.global_model.parameters(),
                                    lr=args['global_lr'])

        num_total_steps = len(self.train_data) * args['global_epoch']
        num_warmup_steps = int(args['global_warmup_rate'] * num_total_steps)
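        # Linear warmup over the first global_warmup_rate fraction of updates, then linear decay.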

        self.global_scheduler = WarmupLinearSchedule(
            self.global_optimer,
            warmup_steps=num_warmup_steps,
            t_total=num_total_steps)

    def train_global(self):
        epoches = self.args['global_epoch']
        times = time()

        #self.writeglobal_features ()
        max_acc = 0
        for epoch in range(epoches):
            train_datas = self.Dataloader.load_train_batches()

            loss_list = []
            acc_list = []
            times = time()
            for batch in range(len(train_datas)):

                text_feature = torch.tensor(train_datas[batch]["text"])
                text_pos_feature = torch.tensor(train_datas[batch]["pos_text"])
                text_neg_feature = torch.tensor(train_datas[batch]["neg_text"])

                if self.cuda_gpu:
                    text_feature = text_feature.cuda()
                    text_pos_feature = text_pos_feature.cuda()
                    text_neg_feature = text_neg_feature.cuda()

                text_emb = self.global_model(text_feature)
                pos_emb = self.global_model(text_pos_feature)
                neg_emb = self.global_model(text_neg_feature)

                #print(text_emb.shape)
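                # Triplet objective on Euclidean distances; the cosine similarities below are only logged.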
                pos_dis, neg_dis = euclidean_distance(
                    text_emb, pos_emb), euclidean_distance(text_emb, neg_emb)
                pos_origin_dis, neg_origin_dis = torch.cosine_similarity(
                    text_emb, pos_emb,
                    dim=1), torch.cosine_similarity(text_emb, neg_emb, dim=1)
                #pos_dis,neg_dis = torch.cosine_similarity(text_emb,pos_emb,dim=1),torch.cosine_similarity(text_emb,neg_emb,dim=1)
                mean_pos_dis = torch.mean(pos_origin_dis).detach().cpu().numpy()
                mean_neg_dis = torch.mean(neg_origin_dis).detach().cpu().numpy()
                acc = torch.mean(dis_acc(pos_dis, neg_dis)).detach().cpu().numpy()
                acc_list.append(acc)
                loss = triplet_loss(pos_dis, neg_dis)
                loss_np = loss
                if self.cuda_gpu:
                    loss_np = loss_np.cpu()
                loss_np = loss_np.detach().numpy()
                loss_list.append(loss_np)
                self.global_optimer.zero_grad()

                loss.backward()
                self.global_optimer.step()
                self.global_scheduler.step()

                if batch % 200 == 0:

                    print(
                        "batch: %d loss:%.4f acc:%.4f pos_dis:%f neg_dis:%f " %
                        (batch, loss_np, acc, mean_pos_dis, mean_neg_dis))

            mean_acc = np.mean(acc_list)
            print("epoch:%d loss:%.4f acc:%.4f time:[%.2fs]" %
                  (epoch, np.mean(loss_list), mean_acc, time() - times))

        torch.save(self.global_model.state_dict(),
                   self.args['global_model_save_path'])
        self.writeglobal_features()
        print("training_complete!")

    def writeglobal_features(self):
        print("writing_train_features:")

        def _write_features(feature_data, out_path):
            # Encode every stored feature vector with the global model and pickle the result.
            feature_dict = {}
            for key in feature_data:
                input_tensor = torch.tensor([np.array(feature_data[key].values)])
                if self.cuda_gpu:
                    input_tensor = input_tensor.cuda()
                output_tensor = self.global_model(input_tensor)
                feature_dict[key] = output_tensor[0].detach().cpu().numpy()
            pd.to_pickle(feature_dict, out_path)

        _write_features(self.Dataloader.train_features,
                        self.args['feature_global_train_path'])
        _write_features(self.Dataloader.val_features,
                        self.args['feature_global_val_path'])
        _write_features(self.Dataloader.test_features,
                        self.args['feature_global_test_path'])
Example #6
def train(args, train_iter, dev, test, src_field, tgt_field, tag_field,
          checkpoint):
    # srcpadid = src_field.vocab.stoi['<pad>']
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    model = Classify_Extractor(args, tgt_field)

    if torch.cuda.is_available():
        model.cuda()

    print_params(model)

    decay = args.decay

    if args.optimizer == 'bert':
        weight_decay = 0.0
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        opt = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8)
        totalnum = 0
        for i in train_iter:
            totalnum += 1
        #print(args.lr)
        #print(args.maximum_steps)
        #exit()
        t_total = totalnum // decay * args.maximum_steps
        scheduler = WarmupLinearSchedule(opt, warmup_steps=0, t_total=t_total)
    else:
        opt = torch.optim.Adadelta(model.parameters(), lr=args.lr)

    best_e = 0.0
    best_c = 0.0
    best_epoch_for_c = 0
    best_epoch_for_e = 0
    offset = 0.0
    pre_epoch = 0
    patience_c = 0
    patience_e = 0

    if checkpoint is not None:
        print('model.load_state_dict(checkpoint[model])')
        model.load_state_dict(checkpoint['model'])
        if args.resume:
            opt.load_state_dict(checkpoint['optim'])

            best_f = checkpoint['f']
            offset = checkpoint['iters']
            pre_epoch = checkpoint['epoch']

            print('*************************************')
            print('resume from {} epoch {} iters and best_f {}'.format(
                pre_epoch, offset, best_f))
            print('*************************************')

    print("**************start training****************")
    start = time.time()

    for epoch in range(args.maxepoch):
        train_iter.init_epoch()
        epoch += pre_epoch

        for iters, train_batch in enumerate(train_iter):
            iters += offset
            model.train()
            # model.zero_grad()
            # model.constrain_transition()
            t1 = time.time()
            batch_src = train_batch.src
            #print(batch_src)
            #exit()
            src = [tokenizer.convert_tokens_to_ids(s) for s in batch_src]
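            # Right-pad each tokenized sentence to the batch maximum and build a 0/1 attention mask.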
            maxlen = max([len(s) for s in batch_src])

            src_mask = []
            padded_sents = []
            for s in src:
                new_s = s + [0] * (maxlen - len(s))
                padded_sents.append(new_s)
                mask = [1] * len(s) + [0] * (maxlen - len(s))
                src_mask.append(mask)
            # B T
            src = torch.tensor(padded_sents).long().cuda()
            # B T
            src_mask = torch.tensor(src_mask).byte().cuda()
            # src, src_mask = prepare_src(train_batch.src, srcpadid)
            tgt = prepare_tgt(train_batch.tgt)
            tag = train_batch.tag

            loss = model(src, src_mask, tgt, tag)

            # "update parameters"

            if decay > 1:
                loss = loss / decay

            loss.backward()

            # if args.grad_clip:
            #     torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)

            if (iters + 1) % decay == 0:
                opt.step()
                if args.optimizer == 'bert':
                    scheduler.step()  # Update learning rate schedule (only defined on the BERT path)
                opt.zero_grad()

            # opt.step()

            t2 = time.time()

            loss = loss.item()

            print("epoch:{} iters:{} src:({},{}) tgt:({},{}) "
                  "loss:{:.2f} t:{:.2f}".format(epoch + 1, iters + 1,
                                                *src.size(), *tgt.size(), loss,
                                                t2 - t1))

        # if torch.cuda.is_available():
        #     torch.cuda.empty_cache()

        if (epoch + 1) % 1 == 0:
            print("=============validate model==============")
            with torch.no_grad():
                dev.init_epoch()
                model.eval()
                # model.constrain_transition()
                sents = []
                cy_true = []
                cy_pred = []
                for j, dev_batch in enumerate(dev):
                    t1 = time.time()
                    # src, src_mask = prepare_src(dev_batch.src, srcpadid)
                    batch_src = dev_batch.src
                    src = [
                        tokenizer.convert_tokens_to_ids(s) for s in batch_src
                    ]
                    maxlen = max([len(s) for s in batch_src])

                    src_mask = []
                    padded_sents = []
                    for s in src:
                        new_s = s + [0] * (maxlen - len(s))
                        padded_sents.append(new_s)
                        mask = [1] * len(s) + [0] * (maxlen - len(s))
                        src_mask.append(mask)
                    # B T
                    src = torch.tensor(padded_sents).long().cuda()
                    # B T
                    src_mask = torch.tensor(src_mask).byte().cuda()

                    tgt = prepare_tgt(dev_batch.tgt)
                    tag = dev_batch.tag.squeeze(-1)
                    _, pre_tag = model.component_extraction(src, src_mask)
                    pre_ctag = model.simile_classify(src, src_mask)
                    cy_true.extend(tag.tolist())
                    cy_pred.extend(pre_ctag.tolist())

                    for sen, tags, p_tags, c_tags in zip(
                            src, tgt, pre_tag, tag):
                        sen = sen[:len(p_tags)].tolist()
                        tags = tags[:len(p_tags)].tolist()
                        if c_tags == 1:
                            sents.append([
                                sen, [tgt_field.vocab.itos[t] for t in tags],
                                [tgt_field.vocab.itos[t] for t in p_tags]
                            ])
                    print('dev iters: {}, t:{}'.format(j, time.time() - t1))

                _, eprecision, erecall, ef1 = evaluate(sents)

                cprecision = precision_score(cy_true, cy_pred)
                crecall = recall_score(cy_true, cy_pred)
                cf1 = f1_score(cy_true, cy_pred)

                print(
                    'epoch: {} classify--> precision: {} recall: {} f1: {} best:{}'
                    .format(epoch + 1, cprecision, crecall, cf1, best_c))
                print('extractor--> precision: {} recall: {} f1: {} best: {}'.
                      format(eprecision, erecall, ef1, best_e))

                if cf1 > best_c:
                    best_c = cf1
                    best_epoch_for_c = epoch + 1

                    print(
                        'save best classifier model at epoch={}'.format(epoch +
                                                                        1))
                    checkpoint = {
                        'model': model.state_dict(),
                        'optim': opt.state_dict(),
                        'args': args
                    }
                    torch.save(
                        checkpoint, '{}/{}.classify.best.pt'.format(
                            args.model_path, args.model))
                    patience_c = 0
                else:
                    patience_c += 1

                if ef1 > best_e:
                    best_e = ef1
                    best_epoch_for_e = epoch + 1

                    print(
                        'save best extractor model at epoch={}'.format(epoch +
                                                                       1))
                    checkpoint = {
                        'model': model.state_dict(),
                        'optim': opt.state_dict(),
                        'args': args
                    }
                    torch.save(
                        checkpoint, '{}/{}.extractor.best.pt'.format(
                            args.model_path, args.model))
                    patience_e = 0
                else:
                    patience_e += 1

        if patience_c > args.patience and patience_e > args.patience:
            print("early stop at {}".format(epoch))
            break

        if args.decay:
            opt.param_groups[0]['lr'] = opt.param_groups[0]['lr'] * args.decay

    print('*******Done********{}'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    minutes = (time.time() - start) // 60
    if minutes < 60:
        print(
            'best_c:{}, best_e:{} best_epoch_c:{}, best_epoch_e:{}, time:{} mins'
            .format(best_c, best_e, best_epoch_for_c, best_epoch_for_e,
                    minutes))
    else:
        hours = minutes / 60
        print(
            'best_c:{}, best_e:{} best_epoch_c:{}, best_epoch_e:{}, time:{:.1f} hours'
            .format(best_c, best_e, best_epoch_for_c, best_epoch_for_e, hours))

    print('*******Testing************')
    model1 = Classify_Extractor(args, tgt_field)
    model1.cuda()
    load_from = '{}/{}.classify.best.pt'.format(args.model_path, args.model)
    print('load the best model {}'.format(load_from))
    checkpoint = torch.load(load_from, map_location='cpu')
    print('load parameters')
    model1.load_state_dict(checkpoint['model'])

    model2 = Classify_Extractor(args, tgt_field)
    model2.cuda()
    load_from = '{}/{}.extractor.best.pt'.format(args.model_path, args.model)
    print('load the best model {}'.format(load_from))
    checkpoint = torch.load(load_from, map_location='cpu')
    print('load parameters')
    model2.load_state_dict(checkpoint['model'])
    with torch.no_grad():
        test.init_epoch()
        model1.eval()
        model2.eval()
        sents = []
        cy_true = []
        cy_pred = []
        for j, test_batch in enumerate(test):
            t1 = time.time()
            # src, src_mask = prepare_src(test_batch.src, srcpadid)
            batch_src = test_batch.src
            src = [tokenizer.convert_tokens_to_ids(s) for s in batch_src]
            maxlen = max([len(s) for s in batch_src])

            src_mask = []
            padded_sents = []
            for s in src:
                new_s = s + [0] * (maxlen - len(s))
                padded_sents.append(new_s)
                mask = [1] * len(s) + [0] * (maxlen - len(s))
                src_mask.append(mask)
            # B T
            src = torch.tensor(padded_sents).long().cuda()
            # B T
            src_mask = torch.tensor(src_mask).byte().cuda()

            tgt = prepare_tgt(test_batch.tgt)
            tag = test_batch.tag.squeeze(-1)
            _, pre_tag = model2.component_extraction(src, src_mask)
            pre_ctag = model1.simile_classify(src, src_mask)
            cy_true.extend(tag.tolist())
            cy_pred.extend(pre_ctag.tolist())

            # for sen, tags, p_tags in zip(src, tgt, pre_tag):
            #     sen = sen[:len(p_tags)].tolist()
            #     tags = tags[:len(p_tags)].tolist()
            #     sents.append([sen, [tgt_field.vocab.itos[t] for t in tags],
            #                  [tgt_field.vocab.itos[t] for t in p_tags]])
            for sen, tags, p_tags, c_tags in zip(src, tgt, pre_tag, pre_ctag):
                sen = sen[:len(p_tags)].tolist()
                tags = tags[:len(p_tags)].tolist()
                if c_tags == 1:
                    sents.append([
                        sen, [tgt_field.vocab.itos[t] for t in tags],
                        [tgt_field.vocab.itos[t] for t in p_tags]
                    ])
                elif c_tags == 0:
                    sents.append([
                        sen, [tgt_field.vocab.itos[t] for t in tags],
                        ['O' for t in p_tags]
                    ])

            print('test iters: {}, t:{}'.format(j, time.time() - t1))

        _, eprecision, erecall, ef1 = evaluate(sents)

        cprecision = precision_score(cy_true, cy_pred)
        crecall = recall_score(cy_true, cy_pred)
        cf1 = f1_score(cy_true, cy_pred)

        print('Testing classify--> precision: {} recall: {} f1: {}'.format(
            cprecision, crecall, cf1))
        print('extractor--> precision: {} recall: {} f1: {}'.format(
            eprecision, erecall, ef1))
Example #7
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # logger.info(f'Fold {split_index + 1}')
        train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader(
        )

        num_train_optimization_steps = self.train_steps

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path,
                                            num_labels=self.num_labels)
        model = BertForSequenceClassification.from_pretrained(
            self.model_name_or_path, self.args, config=config)
        model.to(self.device)
        model.train()
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.learning_rate,
                          eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.warmup_steps,
                                         t_total=self.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        best_MRR = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        train_dataloader = cycle(train_dataloader)
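        # Cycle the dataloader so training runs for a fixed number of optimization steps rather than epochs.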

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, utterance_mask, response_mask, history_mask, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         utterance_mask=utterance_mask,
                         response_mask=response_mask,
                         history_mask=history_mask,
                         labels=label_ids)
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

            if (step + 1) % (self.eval_steps *
                             self.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if self.do_eval and (step + 1) % (
                    self.eval_steps * self.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    scores = []
                    ID = [x.guid for x in eval_examples]

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", self.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, utterance_mask, response_mask, history_mask, label_ids in eval_dataloader:
                        input_ids = input_ids.to(self.device)
                        input_mask = input_mask.to(self.device)
                        segment_ids = segment_ids.to(self.device)
                        utterance_mask = utterance_mask.to(self.device)
                        response_mask = response_mask.to(self.device)
                        history_mask = history_mask.to(self.device)
                        label_ids = label_ids.to(self.device)

                        with torch.no_grad():
                            tmp_eval_loss = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask=utterance_mask,
                                response_mask=response_mask,
                                history_mask=history_mask,
                                labels=label_ids)
                            logits = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask=utterance_mask,
                                response_mask=response_mask,
                                history_mask=history_mask,
                            )

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        scores.append(logits)
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    scores = np.concatenate(scores, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracyCQA(inference_logits, gold_labels)
                    eval_DOUBAN_MRR, eval_DOUBAN_mrr, eval_DOUBAN_MAP, eval_Precision1 = compute_DOUBAN(
                        ID, scores, gold_labels)
                    r_at_1 = r_at_k(ID, scores, gold_labels, 1)
                    r_at_2 = r_at_k(ID, scores, gold_labels, 2)
                    r_at_5 = r_at_k(ID, scores, gold_labels, 5)
                    # print('eval_mrr',eval_mrr)
                    print('eval_F1', eval_accuracy, 'eval_MRR',
                          eval_DOUBAN_MRR, 'eval_MAP', eval_DOUBAN_MAP,
                          'eval_Precision1', eval_Precision1, 'r10@1', r_at_1,
                          'r10@2', r_at_2, 'r10@5', r_at_5, 'global_step',
                          global_step, 'loss', train_loss)
                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'eval_MRR': eval_DOUBAN_MRR,
                        'eval_MAP': eval_DOUBAN_MAP,
                        'eval_Precision1': eval_Precision1,
                        'r10@1': r_at_1,
                        'r10@2': r_at_2,
                        'r10@5': r_at_5,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(self.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    # if eval_accuracy > best_acc :
                    if eval_DOUBAN_MRR > best_MRR:
                        print("=" * 80)
                        print("Best MRR", eval_DOUBAN_MRR)
                        print("Saving Model......")
                        # best_acc = eval_accuracy
                        best_MRR = eval_DOUBAN_MRR
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            self.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_special_tokens({
        'cls_token': '<CLS>',
        'sep_token': '<SEP>',
        'pad_token': '<PAD>',
        'eos_token': '<EOS>'
    })
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    special_tokens_ids = [
        tokenizer.convert_tokens_to_ids(special_token)
        for special_token in ['<PAD>', '<CLS>', '<SEP>', '<EOS>']
    ]
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
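    # Budget per choice: story + the longer continuation + 3 special tokens, capped at the model's position limit.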
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps //\
                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader)\
                // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()  # update the LR schedule after the optimizer step
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (loss.item() if exp_average_loss is None
                                    else 0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids,
                                                 lm_labels, mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
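
The optimizer setup above, and in most of the examples below, repeats the same grouping that exempts biases and LayerNorm parameters from weight decay. A minimal reusable sketch of that idiom, assuming the pytorch_transformers package these snippets use; the helper name build_optimizer is ours:

from torch import nn
from pytorch_transformers import AdamW  # assumption: same library as the examples


def build_optimizer(model: nn.Module, lr: float, eps: float, weight_decay: float) -> AdamW:
    """Exclude biases and LayerNorm parameters from weight decay, as in the examples."""
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    grouped = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    return AdamW(grouped, lr=lr, eps=eps)
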
Example #9
0
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        data_splitList = DATACQA.load_data(os.path.join(self.data_dir, 'train.csv'), n_splits=5)
        for split_index, each_data in enumerate(data_splitList):
            # Prepare model
            config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels)
            model = BertForSequenceClassification.from_pretrained(self.model_name_or_path, self.args, config=config)
            model.to(self.device)

            logger.info(f'Fold {split_index + 1}')
            train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader(each_data)

            num_train_optimization_steps = self.train_steps

            # Prepare optimizer

            param_optimizer = list(model.named_parameters())

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': self.weight_decay},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

            optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
            scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

            global_step = 0

            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples))
            logger.info("  Batch size = %d", self.train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)

            best_acc = 0
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            train_dataloader = cycle(train_dataloader)

            for step in range(num_train_optimization_steps):
                batch = next(train_dataloader)
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                tr_loss += loss.item()
                train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                loss.backward()

                if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                    optimizer.step()
                    scheduler.step()  # step the schedule after the optimizer update
                    optimizer.zero_grad()
                    global_step += 1

                if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                    tr_loss = 0
                    nb_tr_examples, nb_tr_steps = 0, 0
                    logger.info("***** Report result *****")
                    logger.info("  %s = %s", 'global_step', str(global_step))
                    logger.info("  %s = %s", 'train loss', str(train_loss))

                if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                    for file in ['dev.csv']:
                        inference_labels = []
                        gold_labels = []
                        inference_logits = []
                        scores = []
                        questions = [x.text_a for x in eval_examples]

                        logger.info("***** Running evaluation *****")
                        logger.info("  Num examples = %d", len(eval_examples))
                        logger.info("  Batch size = %d", self.eval_batch_size)

                        # Run prediction for full data

                        model.eval()
                        eval_loss, eval_accuracy = 0, 0
                        nb_eval_steps, nb_eval_examples = 0, 0
                        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                            input_ids = input_ids.to(self.device)
                            input_mask = input_mask.to(self.device)
                            segment_ids = segment_ids.to(self.device)
                            label_ids = label_ids.to(self.device)

                            with torch.no_grad():
                                tmp_eval_loss = model(
                                    input_ids=input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
                                logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                            logits = logits.detach().cpu().numpy()
                            label_ids = label_ids.to('cpu').numpy()
                            inference_labels.append(np.argmax(logits, axis=1))
                            scores.append(logits)
                            gold_labels.append(label_ids)
                            inference_logits.append(logits)
                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_examples += input_ids.size(0)
                            nb_eval_steps += 1

                        gold_labels = np.concatenate(gold_labels, 0)
                        inference_logits = np.concatenate(inference_logits, 0)
                        scores = np.concatenate(scores, 0)
                        model.train()
                        eval_loss = eval_loss / nb_eval_steps
                        eval_accuracy = accuracyCQA(inference_logits, gold_labels)
                        eval_mrr = compute_MRR_CQA(scores, gold_labels, questions)
                        eval_5R20 = compute_5R20(scores, gold_labels, questions)

                        result = {'eval_loss': eval_loss,
                                  'eval_F1': eval_accuracy,
                                  'eval_MRR': eval_mrr,
                                  'eval_5R20': eval_5R20,
                                  'global_step': global_step,
                                  'loss': train_loss}

                        output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                        with open(output_eval_file, "a") as writer:
                            for key in sorted(result.keys()):
                                logger.info("  %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" % (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                        if eval_accuracy > best_acc:
                            print("=" * 80)
                            print("Best F1", eval_accuracy)
                            print("Saving Model......")
                            best_acc = eval_accuracy
                            # Save a trained model
                            model_to_save = model.module if hasattr(model,'module') else model
                            output_model_file = os.path.join(self.output_dir, "pytorch_model_{}.bin".format(split_index))
                            torch.save(model_to_save.state_dict(), output_model_file)
                            print("=" * 80)
                        else:
                            print("=" * 80)

            del model
            gc.collect()
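
Example #9 trains for a fixed number of optimization steps by wrapping the DataLoader in itertools.cycle so it never runs out of batches. A minimal sketch of that step-budgeted loop; compute_loss is a placeholder for the model forward pass and loss used above:

from itertools import cycle


def train_fixed_steps(compute_loss, optimizer, scheduler, dataloader,
                      num_steps, accumulation_steps=1):
    """Run a fixed number of optimization steps; cycle() restarts the dataloader indefinitely."""
    data_iter = cycle(dataloader)
    for step in range(num_steps):
        loss = compute_loss(next(data_iter)) / accumulation_steps
        loss.backward()
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
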
Example #10
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type",
                        type=str,
                        default="openai-gpt",
                        help="model type: openai-gpt/gpt2/xlnet/...")
    parser.add_argument("--model_name_or_path",
                        type=str,
                        default="openai-gpt",
                        help="pretrained model path")
    parser.add_argument("--toy", action="store_true", help="test code")

    parser.add_argument("--do_train", action="store_true", help="do training")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="do evaluation in the end")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--train_dataset",
                        type=str,
                        nargs="+",
                        default=["data/conceptnet/train100k_CN.txt"])
    parser.add_argument(
        "--eval_dataset",
        type=str,
        nargs="+",
        default=["data/conceptnet/dev1_CN.txt", "data/conceptnet/dev2_CN.txt"])
    parser.add_argument("--test_dataset",
                        type=str,
                        nargs="+",
                        default=["data/conceptnet/test_CN.txt"])

    parser.add_argument(
        "--add_prefix",
        action="store_true",
        help=
        "add a prefix at the beginning of each input when training with multiple datasets"
    )
    parser.add_argument("--add_separator",
                        action="store_true",
                        help="add <sep> between sub/rel/obj")
    parser.add_argument("--predict_part",
                        type=str,
                        default="obj",
                        choices=["sub", "rel", "obj", "all"],
                        help="predict which part of the triples")
    parser.add_argument("--max_e1", type=int, default=10)
    parser.add_argument("--max_r", type=int, default=5)
    parser.add_argument("--max_e2", type=int, default=15)

    parser.add_argument("--seed", type=int, default=123)
    parser.add_argument("--no_pretrain",
                        action="store_true",
                        help="w/o pretrained parameters initialized")
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=16)
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of update steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument('--logging_steps', type=int, default=250)
    parser.add_argument("--eval_per_steps", type=int, default=500)
    parser.add_argument("--num_train_epochs", type=int, default=-1)
    parser.add_argument(
        "--max_steps",
        default=100000,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Overrides num_train_epochs."
    )

    parser.add_argument("--max_grad_norm", type=int, default=1)
    parser.add_argument("--learning_rate", type=float, default=1e-5)
    parser.add_argument("--warmup_proportion", type=float, default=0.002)
    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
    parser.add_argument("--weight_decay", type=float, default=0.0)
    parser.add_argument("--adam_epsilon", type=float, default=1e-8)

    args = parser.parse_args()
    print(args)

    assert (args.predict_part == "obj" or args.model_type == "xlnet")

    set_seed(args.seed)

    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    MODEL_CLASSES = {
        "gpt2": (GPT2LMHeadModel, GPT2Tokenizer, GPT2Config),
        "openai-gpt":
        (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, OpenAIGPTConfig),
        "xlnet": (XLNetLMHeadModel, XLNetTokenizer, XLNetConfig),
    }
    Model, Tokenizer, Config = MODEL_CLASSES[args.model_type]

    # load pretrained model
    tokenizer = Tokenizer.from_pretrained(args.model_name_or_path)
    # add special tokens
    # TODO: something feels not so right
    print("\nspecial tokens:", tokenizer.special_tokens_map)
    if not tokenizer.eos_token:
        tokenizer.add_special_tokens({"eos_token": "<eos>"})
    if not tokenizer.sep_token:
        tokenizer.add_special_tokens({"sep_token": "<sep>"})

    tokenizer.add_tokens(["<from_CN>", "<from_VG>", "<from_FB>"])

    if args.no_pretrain:
        # from scratch
        config = Config.from_pretrained(args.model_type)
        model = Model(config)
    else:
        model = Model.from_pretrained(args.model_name_or_path)

    print("vocab size:", len(tokenizer))
    model.resize_token_embeddings(len(tokenizer))
    # Workaround for a bug: the original HuggingFace code resizes the LMHead weight but not the
    # LMHead bias, which causes a runtime error, so we manually resize the LMHead bias here.
    if args.model_type == "xlnet":
        from torch.nn.parameter import Parameter
        model.lm_loss.bias = Parameter(torch.Tensor(len(tokenizer)))
        fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(
            model.lm_loss.weight)
        bound = 1 / math.sqrt(fan_in)
        torch.nn.init.uniform_(model.lm_loss.bias, -bound, bound)
        print("weight size:", model.lm_loss.weight.size())
        print("bias size:", model.lm_loss.bias.size())
    model.to(device)

    print("\nspecial tokens:", tokenizer.special_tokens_map)

    # Load and encode the datasets
    logger.info("Loading datasets ...")

    def prefix_mapping(filename):
        if "vg" in filename.lower():
            return "<from_VG>"
        elif "cn" in filename.lower():
            return "<from_CN>"
        elif "fb" in filename.lower():
            return "<from_FB>"

    def rel_lang(filename):
        if "vg" in filename.lower():
            return False
        elif "cn" in filename.lower():
            return True
        elif "easyfb" in filename.lower():
            return False
        elif "fb" in filename.lower():
            return True

    train_datasets = [
        load_comet_dataset(
            dataset_path=train_dataset,
            eos_token=tokenizer.eos_token,
            sep_token=tokenizer.sep_token,
            rel_lang=rel_lang(train_dataset),
            toy=args.toy,
            discard_negative=True,
            add_sep=args.add_separator,
            prefix=prefix_mapping(train_dataset) if args.add_prefix else None)
        for train_dataset in args.train_dataset
    ]
    eval_datasets = [
        load_comet_dataset(
            dataset_path=eval_dataset,
            eos_token=tokenizer.eos_token,
            sep_token=tokenizer.sep_token,
            rel_lang=rel_lang(eval_dataset),
            toy=args.toy,
            discard_negative=True,
            add_sep=args.add_separator,
            prefix=prefix_mapping(eval_dataset) if args.add_prefix else None)
        for eval_dataset in args.eval_dataset
    ]
    test_datasets = [
        load_comet_dataset(
            dataset_path=test_dataset,
            eos_token=tokenizer.eos_token,
            sep_token=tokenizer.sep_token,
            rel_lang=rel_lang(test_dataset),
            toy=args.toy,
            discard_negative=True,
            add_sep=args.add_separator,
            prefix=prefix_mapping(test_dataset) if args.add_prefix else None)
        for test_dataset in args.test_dataset
    ]
    train_datasets = [
        data for train_dataset in train_datasets for data in train_dataset
    ]
    eval_datasets = [
        data for eval_dataset in eval_datasets for data in eval_dataset
    ]
    test_datasets = [
        data for test_dataset in test_datasets for data in test_dataset
    ]
    datasets = (train_datasets, eval_datasets, test_datasets)
    logger.info("Encoding datasets ...")
    encoded_datasets = tokenize_and_encode(datasets, tokenizer)
    max_e1 = args.max_e1 if not args.add_separator else (args.max_e1 + 1)
    max_r = args.max_r if not args.add_separator else (args.max_r + 1)
    max_e2 = args.max_e2 + 1  # always add <eos>
    best_loss = 1e10

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets,
                                           max_e1,
                                           max_r,
                                           max_e2,
                                           predict_part=args.predict_part)
    train_tensor_dataset, eval_tensor_dataset, test_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1], tensor_datasets[2]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    test_data = TensorDataset(*test_tensor_dataset)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data,
                                 sampler=test_sampler,
                                 batch_size=args.eval_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_datasets))
        logger.info("  Num Epochs = %d", args.num_train_epochs)
        logger.info(
            "  Each Epoch has %d steps, and %d actual steps w/ accumulation",
            len(train_dataloader),
            len(train_dataloader) // args.gradient_accumulation_steps)
        logger.info("  Total train batch size (w. accumulation) = %d",
                    args.train_batch_size * args.gradient_accumulation_steps)
        logger.info("  Gradient Accumulation steps = %d",
                    args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [{
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay
        }, {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0
        }]
        print("total steps:", t_total)
        num_warmup_steps = args.warmup_proportion * t_total
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=num_warmup_steps,
                                         t_total=t_total)

        global_steps = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.train()
        for cur_epoch_num in range(int(args.num_train_epochs)):
            print("Epoch:", cur_epoch_num)
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                batch_size = len(batch)
                loss, logits = batch_step(model, args.model_type, batch,
                                          args.predict_part, max_e1, max_r,
                                          max_e2, args.add_prefix)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # step the schedule after the optimizer update
                    optimizer.zero_grad()
                    global_steps += 1
                    if global_steps % args.logging_steps == 0:
                        loss = (tr_loss - logging_loss) / args.logging_steps
                        PPL = np.exp(loss) if loss < 300 else np.inf
                        print("Step", global_steps, "Training Loss:", loss,
                              "ppl:", PPL)
                        logging_loss = tr_loss

                    if global_steps % args.eval_per_steps == 0:
                        model.eval()
                        # evaluate
                        eval_loss = evaluate(model, args.model_type,
                                             args.predict_part,
                                             eval_dataloader, tokenizer,
                                             max_e1, max_r, max_e2,
                                             args.add_prefix)
                        print("\n\nevaluating\neval loss:", eval_loss, "ppl",
                              np.exp(eval_loss) if eval_loss < 300 else np.inf)
                        # decide to save
                        if eval_loss < best_loss:
                            # save
                            save_model(model, tokenizer, args.output_dir)
                            print("model saved at step", global_steps)
                            print(str(datetime.datetime.now()))
                            print("prev loss:", best_loss, "cur loss:",
                                  eval_loss)
                            best_loss = eval_loss
                        # test
                        test_loss = evaluate(model, args.model_type,
                                             args.predict_part,
                                             test_dataloader, tokenizer,
                                             max_e1, max_r, max_e2,
                                             args.add_prefix)
                        print("\n\ntesting\ntest loss:", test_loss, "ppl:",
                              np.exp(test_loss) if test_loss < 300 else np.inf)
                        model.train()

    if args.do_eval:
        model.eval()
        eval_loss = evaluate(model, args.model_type, args.predict_part,
                             eval_dataloader, tokenizer, max_e1, max_r, max_e2,
                             args.add_prefix)
        print("\n\nevaluating\neval loss:", eval_loss, "ppl",
              np.exp(eval_loss) if eval_loss < 300 else np.inf)
        test_loss = evaluate(model, args.model_type, args.predict_part,
                             test_dataloader, tokenizer, max_e1, max_r, max_e2,
                             args.add_prefix)
        print("\n\ntesting\ntest loss:", test_loss, "ppl:",
              np.exp(test_loss) if test_loss < 300 else np.inf)
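
Example #10 derives the number of epochs from --max_steps (or the total step count from the epoch count) and turns --warmup_proportion into warmup steps for WarmupLinearSchedule. The same bookkeeping as a small standalone helper; the function name is illustrative:

def schedule_lengths(num_batches, grad_accum_steps, num_epochs, max_steps, warmup_proportion):
    """Return (total optimization steps, warmup steps, epochs), mirroring the logic above."""
    updates_per_epoch = num_batches // grad_accum_steps
    if max_steps > 0:
        t_total = max_steps
        num_epochs = max_steps // updates_per_epoch + 1
    else:
        t_total = updates_per_epoch * num_epochs
    warmup_steps = int(warmup_proportion * t_total)
    return t_total, warmup_steps, num_epochs
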
Example #11
0
def train(args,
          train_dataset,
          model,
          tokenizer,
          mask_generator,
          training=False,
          meta_training=True):
    """ Train the model"""
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    if meta_training:
        args.save_history = True
        train_dataset.add_label(mask_generator, tokenizer, args, model)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model,
                                      args.task_devices,
                                      output_device=0)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    tr_acc, logging_acc = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Pre-Training",
                            disable=args.local_rank not in [-1, 0])
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            args.save_history = epoch == 0  # keep the history only on the first epoch
            if meta_training:
                inputs, labels = batch
            else:
                inputs, labels = mask_generator.mask(
                    batch,
                    tokenizer,
                    args,
                    model=model.module if hasattr(model, 'module') else model)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            input_mask = ~inputs.eq(args.pad_token)
            model.train()
            _inputs = {
                'input_ids': inputs,
                'masked_lm_labels': labels,
                'attention_mask': input_mask,
            }

            outputs = model(**_inputs)
            loss = outputs[0]  # model outputs are always tuples in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                optimizer.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        # print(loss)
        # print("Global step: {} / Training Loss: {}".format(global_step, tr_loss / global_step))
        # print("Global Accuracy: {} / Training Accuracy: {}".format(global_step, tr_acc / global_step))
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    del optimizer
    del scheduler
    return global_step, tr_loss / max(global_step, 1)  # guard against zero optimizer steps
def Train(inputIds, attention_masks, labels, batch_size=24, epochs=10):
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        inputIds, labels, random_state=2020, test_size=0.2)
    train_masks, validation_masks, _, _ = train_test_split(attention_masks,
                                                           inputIds,
                                                           random_state=2020,
                                                           test_size=0.2)
    # Turn data into torch tensors
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    # Create Iterators of the datasets
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)
    validation_data = TensorDataset(validation_inputs, validation_masks,
                                    validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data,
                                       sampler=validation_sampler,
                                       batch_size=batch_size)

    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',
                                                           num_labels=2)
    # Loads model into GPU memory
    model.cuda()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # AdamW expects the key 'weight_decay' (the old 'weight_decay_rate' key would be ignored)
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

    # train_loss_set = []

    # Find GPU or CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    trainLoss = []
    valAcc = []
    for _ in trange(epochs, desc='Epoch'):
        # Train
        model.train()

        trainLoss.append(0)
        nb_tr_examples, nb_tr_steps = 0, 0

        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            optimizer.zero_grad()
            # Forward pass and loss calculation
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]
            # Calculate gradients
            loss.backward()
            # Update weights using gradients
            optimizer.step()

            trainLoss[-1] += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print('\nTrain loss: {}'.format(trainLoss[-1] / nb_tr_steps))

        # Validation
        model.eval()

        nb_eval_steps = 0
        valAcc.append(0)
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # Don't calculate gradients since we are evaluating the model
            with torch.no_grad():
                output = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask)
                logits = output[0]
            # Move the logits from the GPU to the CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy = flat_accuracy(logits, label_ids)

            valAcc[-1] += tmp_eval_accuracy
            nb_eval_steps += 1

        print('\nValidation Accuracy: {}\n'.format(valAcc[-1] / nb_eval_steps))

    return model, trainLoss, valAcc
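
flat_accuracy is called in the validation loop above but not defined in this snippet; a common definition, given here as an assumption rather than the original helper, compares argmax predictions against the flattened labels:

import numpy as np


def flat_accuracy(logits, labels):
    """Fraction of examples whose argmax prediction matches the label."""
    preds = np.argmax(logits, axis=1).flatten()
    labels = labels.flatten()
    return np.sum(preds == labels) / len(labels)
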
Example #13
0
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path,
                                            num_labels=3)
        model = BertForSequenceClassification.from_pretrained(
            self.model_name_or_path, self.args, config=config)
        model.to(self.device)

        data_splitList = DATABDCI.load_data(os.path.join(
            self.data_dir, 'train.csv'),
                                            n_splits=5)
        for split_index, each_data in enumerate(data_splitList):
            logger.info(f'Fold {split_index + 1}')
            train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader(
                each_data)

            num_train_optimization_steps = self.train_steps

            # Prepare optimizer

            param_optimizer = list(model.named_parameters())

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                self.weight_decay
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=self.learning_rate,
                              eps=self.adam_epsilon)
            scheduler = WarmupLinearSchedule(optimizer,
                                             warmup_steps=self.warmup_steps,
                                             t_total=self.train_steps)

            global_step = 0

            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples))
            logger.info("  Batch size = %d", self.train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)

            best_acc = 0
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            train_dataloader = cycle(train_dataloader)

            for step in range(num_train_optimization_steps):
                batch = next(train_dataloader)
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids=input_ids,
                             token_type_ids=segment_ids,
                             attention_mask=input_mask,
                             labels=label_ids)
                tr_loss += loss.item()
                train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                loss.backward()

                if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                    optimizer.step()
                    scheduler.step()  # step the schedule after the optimizer update
                    optimizer.zero_grad()
                    global_step += 1

                if (step + 1) % (self.eval_steps *
                                 self.gradient_accumulation_steps) == 0:
                    tr_loss = 0
                    nb_tr_examples, nb_tr_steps = 0, 0
                    logger.info("***** Report result *****")
                    logger.info("  %s = %s", 'global_step', str(global_step))
                    logger.info("  %s = %s", 'train loss', str(train_loss))

                if self.do_eval and (
                        step + 1) % (self.eval_steps *
                                     self.gradient_accumulation_steps) == 0:
                    for file in ['dev.csv']:
                        inference_labels = []
                        gold_labels = []
                        inference_logits = []

                        logger.info("***** Running evaluation *****")
                        logger.info("  Num examples = %d", len(eval_examples))
                        logger.info("  Batch size = %d", self.eval_batch_size)

                        # Run prediction for full data

                        model.eval()
                        eval_loss, eval_accuracy = 0, 0
                        nb_eval_steps, nb_eval_examples = 0, 0
                        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                            input_ids = input_ids.to(self.device)
                            input_mask = input_mask.to(self.device)
                            segment_ids = segment_ids.to(self.device)
                            label_ids = label_ids.to(self.device)

                            with torch.no_grad():
                                tmp_eval_loss = model(
                                    input_ids=input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
                                logits = model(input_ids=input_ids,
                                               token_type_ids=segment_ids,
                                               attention_mask=input_mask)

                            logits = logits.detach().cpu().numpy()
                            label_ids = label_ids.to('cpu').numpy()
                            inference_labels.append(np.argmax(logits, axis=1))
                            gold_labels.append(label_ids)
                            inference_logits.append(logits)
                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_examples += input_ids.size(0)
                            nb_eval_steps += 1

                        gold_labels = np.concatenate(gold_labels, 0)
                        inference_logits = np.concatenate(inference_logits, 0)
                        model.train()
                        eval_loss = eval_loss / nb_eval_steps
                        eval_accuracy = accuracyBDCI(inference_logits,
                                                     gold_labels)

                        result = {
                            'eval_loss': eval_loss,
                            'eval_F1': eval_accuracy,
                            'global_step': global_step,
                            'loss': train_loss
                        }

                        output_eval_file = os.path.join(
                            self.output_dir, "eval_results.txt")
                        with open(output_eval_file, "a") as writer:
                            for key in sorted(result.keys()):
                                logger.info("  %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" %
                                             (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                        if eval_accuracy > best_acc and 'dev' in file:
                            print("=" * 80)
                            print("Best F1", eval_accuracy)
                            print("Saving Model......")
                            best_acc = eval_accuracy
                            # Save a trained model
                            model_to_save = model.module if hasattr(
                                model, 'module') else model
                            output_model_file = os.path.join(
                                self.output_dir,
                                "pytorch_model_{}.bin".format(split_index))
                            torch.save(model_to_save.state_dict(),
                                       output_model_file)
                            print("=" * 80)
                        else:
                            print("=" * 80)
        if self.do_test:
            del model
            gc.collect()
            self.do_train = False
            data = DATABDCI(debug=False,
                            data_dir='/home/lsy2018/文本匹配/DATA/DATA_BDCI/',
                            data_process_output=
                            '/home/lsy2018/文本匹配/DATA/DATA_BDCI/data_1014/')
            # NOTE: training above saves one checkpoint per fold ("pytorch_model_{fold}.bin");
            # point the path below at the fold you want to load, or average the folds
            # (see the sketch after this example).
            model = BertForSequenceClassification.from_pretrained(
                os.path.join(self.output_dir, "pytorch_model.bin"),
                self.args,
                config=config)
            model.to(self.device)

            for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
                inference_labels = []
                gold_labels = []
                eval_examples = data.read_examples(os.path.join(
                    self.data_dir, file),
                                                   is_training=False)
                print('exa', len(eval_examples))
                eval_features = data.convert_examples_to_features(
                    eval_examples, self.tokenizer, self.max_seq_length)
                all_input_ids = torch.tensor(data.select_field(
                    eval_features, 'input_ids'),
                                             dtype=torch.long)
                all_input_mask = torch.tensor(data.select_field(
                    eval_features, 'input_mask'),
                                              dtype=torch.long)
                all_segment_ids = torch.tensor(data.select_field(
                    eval_features, 'segment_ids'),
                                               dtype=torch.long)
                all_label = torch.tensor([f.label for f in eval_features],
                                         dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label)
                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=self.eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)

                    with torch.no_grad():
                        logits = model(
                            input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask).detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    inference_labels.append(logits)
                    gold_labels.append(label_ids)
                gold_labels = np.concatenate(gold_labels, 0)
                logits = np.concatenate(inference_labels, 0)
                if flag == 'dev':
                    print(flag, accuracyBDCI(logits, gold_labels))
                if flag == 'test':
                    df = pd.read_csv(os.path.join(self.data_dir, file),
                                     names=['id', 'content', 'title', 'label'])
                    predict = np.argmax(logits, axis=1).tolist()
                    print(df.shape[0])
                    print(len(predict))
                    df['labelpre'] = predict
                    df[['id', 'labelpre'
                        ]].to_csv(os.path.join(self.output_dir, "sub.csv"),
                                  index=False,
                                  header=False)
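
Training in Example #13 saves one checkpoint per fold (pytorch_model_{fold}.bin), while the test block loads a single pytorch_model.bin. If you want predictions from all folds instead, one option is to average their logits; a sketch under that assumption, with predict_fn standing in for the evaluation loop above:

import os

import numpy as np
import torch


def ensemble_fold_logits(model, output_dir, n_splits, predict_fn, device='cpu'):
    """Load each fold's checkpoint, run predict_fn, and average the resulting logits."""
    all_logits = []
    for fold in range(n_splits):
        state = torch.load(os.path.join(output_dir, 'pytorch_model_{}.bin'.format(fold)),
                           map_location=device)
        model.load_state_dict(state)
        all_logits.append(predict_fn(model))  # predict_fn returns an (N, num_labels) array
    return np.mean(all_logits, axis=0)
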
Example #14
0
def main():
    args = parse_arguments()

    # ====== Set random seed =========
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    # ======= Prepare ==========
    logging.basicConfig(level=logging.INFO)
    USE_CUDA = torch.cuda.is_available()
    FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
    ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

    model, tokenizer = load_model(args)
    # =============== Load & process data ==============
    split_size = {'train': 0.85, 'test': 0.1, 'val': 0.05}
    data_loader, val_loader = get_data(args,
                                       split_size=split_size,
                                       tokenizer=tokenizer)
    # ========== Prepare optimizer =============
    # The GPT-2 model from the library has an unnamed LM head whose weights are tied to the input embeddings.
    num_train_optimization_steps = len(
        data_loader) * args.num_train_epochs // args.train_batch_size

    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = construct_grouped_parameters(
        param_optimizer, args.learning_rate, use_discr=args.use_disc_lr)

    lm_funcs = get_unfreezing_funcs(optimizer_grouped_parameters,
                                    warmup_portion=args.warmup_proportion,
                                    total_steps=num_train_optimization_steps,
                                    use_unfreezing=args.use_unfreezing)

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                  lr_lambda=lm_funcs)

    # Training
    print("Start training.")
    model.train()
    exp_average_loss = None
    progress_bar = trange(int(args.num_train_epochs), desc="Epoch", leave=True)
    min_eval_loss = 100  # large enough number
    early_terminate_counter = 0
    for _ in progress_bar:
        # for _ in range(int(args.num_train_epochs)):
        for sample in tqdm(data_loader):
            # for sample in data_loader:
            if args.keyword:
                x, type_x, pos_x, lm_x, x_len, _, keyword_x = sample
            else:
                x, type_x, pos_x, lm_x, x_len, _ = sample
                keyword_x = None
            input_len = x_len[0]
            lm_x[:, x_len[0] + 1 + args.first_K_tokens:-1] = -1
            loss = model(x,
                         position_ids=pos_x,
                         token_type_ids=type_x,
                         labels=lm_x,
                         key_word=keyword_x,
                         use_keyword=args.keyword)[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            exp_average_loss = (loss.item() if exp_average_loss is None
                                else 0.7 * exp_average_loss + 0.3 * loss.item())
            progress_bar.set_description(
                "Training loss: {}".format(exp_average_loss))

        eval_loss = evaluate(model, val_loader, use_keyword=args.keyword)
        print("Eval loss: {}".format(eval_loss))
        # if eval_loss < min_eval_loss:  # save only when the eval loss improves
        if True:  # currently saves after every epoch
            early_terminate_counter = 0
            min_eval_loss = eval_loss
            # ==== Save the model ====
            # Save a trained model, configuration and tokenizer
            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model itself
            # If we save using the predefined names, we can load using `from_pretrained`
            output_dir = '../models/'
            output_model_file = os.path.join(output_dir + args.output_dir,
                                             WEIGHTS_NAME)
            output_config_file = os.path.join(output_dir + args.output_dir,
                                              CONFIG_NAME)

            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(output_dir + args.output_dir)
        else:
            print("eval loss increasing!")
            early_terminate_counter += 1
            if early_terminate_counter > 5:  # if the eval loss does not decrease for 5 epochs, terminate early.
                return
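
Both this script and the ROCStories example track exp_average_loss with a 0.7/0.3 exponential moving average; the same update as a tiny helper, so the loop body reads exp_average_loss = ema_update(exp_average_loss, loss.item()):

def ema_update(prev, value, decay=0.7):
    """Exponential moving average; returns the raw value on the first call (prev is None)."""
    return value if prev is None else decay * prev + (1.0 - decay) * value
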
Example #15
0
def train(model, tra_data, dev_data, tra_word_vocab, config):
    optimizer = AdamW(model.parameters(), lr=config.bert_lr, correct_bias=config.correct_bias, weight_decay=config.weight_decay)

    tra_word_data_iter = create_batch(tra_data, tra_word_vocab, config.batch_size, config, shuffle=False)
    dev_word_data_iter = create_batch(dev_data, tra_word_vocab, config.dev_batch_size, config, shuffle=False)

    random_word_iter = data_split(tra_word_data_iter, config.n_fold)
    tra_word_data_iter, dev_database = database(random_word_iter, config.k, config)

    # Get start!
    global_step = 0

    best_acc = 0
    best_tra_acc = 0

    for epoch in range(0, config.epoch):
        score = 0
        print('\nThe epoch is starting.')
        epoch_start_time = time.time()
        batch_iter = 0
        batch_num = int(len(tra_word_data_iter))
        print('The epoch is :', str(epoch))
        if config.use_lr_decay:
            optimizer = decay_learning_rate(config, optimizer, epoch)
            print("now word_ga lr is {}".format(optimizer.param_groups[0].get("lr")), '\n')
        for word_batch in tra_word_data_iter:
            start_time = time.time()
            model.train()
            batch_size = tra_word_data_iter[0][0].size(0) / 2
            src_premise_matrix, src_hypothesis_matrix, p_mask, h_mask, tag_matrix = word_batch[0], \
                                                                                    word_batch[1], \
                                                                                    word_batch[2], \
                                                                                    word_batch[3], \
                                                                                    word_batch[4]
            logit_a, logit_b = model(src_premise_matrix, src_hypothesis_matrix, p_mask, h_mask)
            loss, correct = tri_loss(logit_a, logit_b, config)
            loss = loss / config.update_every
            loss.backward()
            loss_value = loss.item()
            accuracy = 100.0 * int(correct) / batch_size
            during_time = float(time.time() - start_time)
            print('Step:{}, Epoch:{}, batch_iter:{}, accuracy:{:.4f}({}/{}),'
                  'time:{:.2f}, loss:{:.6f}'.format(global_step, epoch, batch_iter, accuracy, correct, batch_size,
                                                    during_time, loss_value))
            batch_iter += 1

            if batch_iter % config.update_every == 0 or batch_iter == batch_num:
                if config.clip_max_norm_use:
                    nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
            score += correct

            if batch_iter % config.test_interval == 0 or batch_iter == batch_num:
                dev_score = evaluate(model, dev_data, dev_word_data_iter, config)
                if best_acc < dev_score:
                    print('The best dev score is ' + str(dev_score))
                    best_acc = dev_score
                    if not os.path.exists(config.save_model_path):
                        os.makedirs(config.save_model_path)
                    torch.save(model.state_dict(), config.bert_model_pkl)
        epoch_time = float(time.time() - epoch_start_time)
        tra_score = 100.0 * score / len(tra_data)
        if tra_score > best_tra_acc:
            best_tra_acc = tra_score
            print('the best_train score is:{}({}/{})'.format(tra_score, score, len(tra_data)))
        print("epoch_time is:", epoch_time)
Example #16
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, help="Pretrained model name.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_probe",
                        action='store_true',
                        help="Whether to probe the representation we got.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        '--data_dir',
        type=str,
        default=
        '/home/xiongyi/dataxyz/repos/SemSynLSTM/word_language_model/data/wikitext-2/'
    )
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    timenow = datetime.datetime.now().strftime("%b%d%H%M")
    model_option = 'adv'
    outdir = model_option + timenow

    args = parser.parse_args(
        ['--output_dir', outdir, '--do_probe', '--num_train_epochs', '50'])
    #args = parser.parse_args(['--output_dir', './tmp', '--do_eval', '--model_name', 'gpt2'])
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval and not args.do_probe:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_probe` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Compute the max input length for the Transformer
    # Todo: Where is this used?
    input_length = 128
    data_dir = '../SemSynLSTM/word_language_model/data/wikitext-2/' if args.data_dir is None else args.data_dir
    train_set, val_set, test_set, dictionary, pos_dictionary = load_tokenize_and_batchify(
        data_dir, input_length)

    # Prepare inputs tensors and dataloaders

    train_data = TensorDataset(*train_set)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=32)

    eval_data = TensorDataset(*val_set)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=32)

    # TODO: Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    #special_tokens = ['_start_', '_delimiter_']
    #special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)

    # TODO: Add config
    config = GPT2Config(n_positions=input_length,
                        n_ctx=input_length,
                        n_layer=6,
                        n_head=8,
                        n_embd=384)
    config.vocab_size = dictionary.__len__()
    config.pos_vocab_size = pos_dictionary.__len__()
    if args.model_name:
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
    else:
        model = GPT2_adverse(config=config)
    model.to(device)

    # TODO: Load and encode the datasets

    logger.info("Encoding dataset...")

    # Prepare optimizer
    if args.do_train:
        all_param = list(model.named_parameters())
        param_optimizer = [(n, p) for n, p in all_param
                           if 'pos_head_adv' not in n]
        param_optimizer_adv = [(n, p) for n, p in all_param
                               if 'pos_head_adv' in n]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer_adv_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer_adv
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in param_optimizer_adv
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        num_train_optimization_steps = len(
            train_dataloader) * args.num_train_epochs
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            #max_grad_norm=args.max_grad_norm,
            weight_decay=args.weight_decay)
        #t_total=num_train_optimization_steps)
        optimizer_adv = AdamW(
            optimizer_adv_grouped_parameters,
            lr=args.learning_rate,
            #max_grad_norm=args.max_grad_norm,
            weight_decay=args.weight_decay)

    if args.do_train:
        train_results = {}
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            ###eval on eval set
            model.eval()
            nb_eval_steps, nb_eval_examples = 0, 0
            perp = 0
            average_loss = np.asanyarray([0, 0, 0, 0], dtype='float')
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch

                with torch.no_grad():
                    # a single forward pass returns (loss, loss_syn, loss_sem, loss_lm)
                    outputs = model(input_ids,
                                    labels=input_ids,
                                    pos_ids=input_pos_ids)
                    loss = outputs[0].detach().cpu().numpy()
                    loss_syn = outputs[1].detach().cpu().numpy()
                    loss_sem = outputs[2].detach().cpu().numpy()
                    loss_lm = outputs[3].detach().cpu().numpy()
                    perp_batch = np.exp(loss_lm)
                    perp += perp_batch
                    average_loss += np.asanyarray(
                        [loss, loss_syn, loss_sem, loss_lm])
                nb_eval_steps += 1
            perp /= nb_eval_steps
            average_loss /= nb_eval_steps
            print('loss,loss_syn,loss_sem,loss_lm', average_loss, 'perp ',
                  perp, 'epoch ', epoch)
            train_results[epoch] = (perp, average_loss)

            model.train()

            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch
                loss = model(input_ids,
                             labels=input_ids,
                             pos_ids=input_pos_ids)[0]
                loss_lm = model(input_ids,
                                labels=input_ids,
                                pos_ids=input_pos_ids)[3]
                loss_sem = model(input_ids,
                                 labels=input_ids,
                                 pos_ids=input_pos_ids)[2]
                #breakpoint()
                #loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                loss_sem.backward()
                optimizer_adv.step()
                optimizer_adv.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} sem: {:.2e} lm: {:.2e}".format(
                    exp_average_loss, loss_sem.item(), loss_lm.item())
        print(train_results)
    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        #tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2LMHeadModel.from_pretrained(args.output_dir)
        #tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        nb_eval_steps, nb_eval_examples = 0, 0
        log_probs_sum = 0
        perp = 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_pos_ids = batch

            with torch.no_grad():
                loss = model(input_ids,
                             labels=input_ids)[0].detach().cpu().numpy()
                perp_batch = np.exp(loss)
                perp += perp_batch
            nb_eval_steps += 1

        perp /= nb_eval_steps
        # perp_word = perp / 128
        print(perp)
        result = {'eval_perp': perp}
        logger.info("***** Eval results *****")
        logger.info("'eval_perp' = %s", str(result['eval_perp']))

    if args.do_probe:

        ## load the trained model from a saved checkpoint
        model_path = '/home/xiongyi/dataxyz/repos/pytorch-pretrained-BERT/examples/advJul232307/pytorch_model.bin'
        model.load_state_dict(torch.load(model_path))
        ##Add a mlp to the representation

        probe_model = ProbeModel(model, config)
        probe_model.to(device)
        ##train and eval
        all_param = list(probe_model.named_parameters())
        param_probe = [(n, p) for n, p in all_param if 'probe_cls' in n]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params':
            [p for n, p in param_probe if not any(nd in n for nd in no_decay)],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_probe if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            # max_grad_norm=args.max_grad_norm,
            weight_decay=args.weight_decay)
        # t_total=num_train_optimization_steps)
        train_results = {}
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            ###eval on eval set
            probe_model.eval()
            nb_eval_steps, nb_eval_examples = 0, 0
            average_loss = 0
            average_acc = 0
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch

                with torch.no_grad():
                    # a single forward pass returns (loss, pos_logits)
                    outputs = probe_model(input_ids,
                                          labels=input_ids,
                                          pos_ids=input_pos_ids)
                    loss = outputs[0].detach().cpu().numpy()
                    pos_logits = outputs[1].detach().cpu().numpy()
                    predicted_labels = np.argmax(pos_logits, -1)
                    correct_rate = np.mean(predicted_labels == input_pos_ids.
                                           detach().cpu().numpy()[:, 1:])
                    average_acc += correct_rate
                    average_loss += loss
                nb_eval_steps += 1
            average_loss /= nb_eval_steps
            ##TODO Hard CODED!
            average_acc /= nb_eval_steps
            print('loss', average_loss, ' acc_rate ', average_acc, ' epoch ',
                  epoch)
            train_results[epoch] = (average_loss, average_acc)

            probe_model.train()

            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch
                loss = probe_model(input_ids,
                                   labels=input_ids,
                                   pos_ids=input_pos_ids)[0]

                # breakpoint()
                # loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e}".format(
                    exp_average_loss)
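Example #16 builds its AdamW parameter groups by hand for both the main and the adversarial optimizer, excluding biases and LayerNorm weights from weight decay. A small self-contained helper capturing that grouping; it uses `torch.optim.AdamW`, which accepts the same per-group `weight_decay` format as the transformers `AdamW` used in these examples:

# Helper (not from the example) for the standard two-group AdamW setup:
# weight decay on most parameters, none on biases and LayerNorm parameters.
from torch.optim import AdamW

def build_adamw(model, lr, weight_decay=0.01):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    return AdamW(grouped_parameters, lr=lr)
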
Example #17
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='gpt2-medium',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        default=True,
                        help="Whether to run training.")
    parser.add_argument(
        "--output_dir",
        default='finetuned_gpt',
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--dataset', type=str, default='', required=True)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--opt_level', type=str, default='O1')
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=8)
    parser.add_argument('--num_prior', type=int, default=2)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset.
    # start_token, delimiter_token, clf_token

    special_tokens_dict = {
        'cls_token': '<|cls|>',
        'unk_token': '<|unk|>',
        'bos_token': '<|endoftext|>',
        'eos_token': '<|endoftext|>',
        'sep_token': '<|endoftext|>'
    }
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name)

    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print('We have added', num_added_toks, 'tokens')

    #start_token, delimiter_token, clf_token
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token)
        for token in ['<|endoftext|>', '<|endoftext|>', '<|cls|>'])
    model = GPT2DoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")

    train_dataset = load_dataset(tokenizer,
                                 args.dataset,
                                 num_prior=args.num_prior)
    eval_dataset = load_dataset(tokenizer,
                                args.dataset,
                                num_prior=args.num_prior)

    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                        for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps //\
            (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader)\
            // args.gradient_accumulation_steps * args.num_train_epochs

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=args.opt_level,
                                      verbosity=1)

    nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
    model.train()
    for i, _ in enumerate(range(int(args.num_train_epochs))):
        print('Starting Epoch: {} of {}'.format(
            str(i + 1), str(int(args.num_train_epochs))))
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(train_dataloader, desc="Training")
        for step, batch in enumerate(tqdm_bar):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
            loss = args.lm_coef * losses[0] + losses[1]
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            tr_loss += loss.item()
            exp_average_loss = loss.item(
            ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
            )
            nb_tr_steps += 1
            tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                exp_average_loss,
                scheduler.get_lr()[0])

        if torch.cuda.is_available():
            torch.cuda.empty_cache()


    # Save a trained model, configuration and tokenizer
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(args.output_dir)

    # Load a trained model and vocabulary that you have fine-tuned
    model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
    model.to(device)
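Several of these examples smooth the displayed loss with `exp_average_loss = 0.7 * exp_average_loss + 0.3 * loss.item()`, an exponential moving average with decay 0.7. A tiny helper (mine, not from the examples) that packages the same update:

# Exponential moving average of a scalar loss, matching the 0.7 / 0.3 update
# used in the tqdm progress descriptions above.
class EmaLoss:
    def __init__(self, decay=0.7):
        self.decay = decay
        self.value = None

    def update(self, loss_value):
        if self.value is None:
            self.value = loss_value          # first observation seeds the average
        else:
            self.value = self.decay * self.value + (1 - self.decay) * loss_value
        return self.value
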
Example #18
0
class Trainer:
    def __init__(self, args, config, model, criterion, train_dataloader,
                 valid_dataloader, logger, save_path, tb_writer):

        self.args = args
        self.config = config
        self.model = model
        self.criterion = criterion
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.logger = logger
        self.save_path = save_path
        self.tb_writer = tb_writer

        self.t_total = len(self.train_dataloader) * self.args.epoch
        self.device = self.config.device
        self.optimizer = AdamW(self.get_model_parameters(),
                               lr=self.config.learning_rate)
        self.scheduler = WarmupLinearSchedule(self.optimizer,
                                              0.1 * self.t_total, self.t_total)

        self.global_step = 0
        self.best_eval_acc = 0.2

    def get_model_parameters(self):
        # Optimizer & Loss
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        return optimizer_grouped_parameters

    def train(self, do_eval=True, do_save=True):

        for epoch in range(self.args.epoch):
            self.train_epoch(epoch)
            self.evaluation(epoch)
            self.write_to_tb()
            self.save_model(epoch)

        self.tb_writer.close()

    def transform_to_bert_input(self, batch):

        input_ids, valid_length, token_type_ids = batch[0], batch[1], batch[2]

        input_ids = torch.from_numpy(input_ids).to(self.device)
        valid_length = valid_length.clone().detach().to(self.device)
        token_type_ids = torch.tensor(token_type_ids).long().to(self.device)

        return input_ids, valid_length, token_type_ids

    def compute_acc(self, y_hat, y, mean=True):
        y_pred = y_hat.max(dim=-1)[1]  # [0]: max value, [1]: index of max value
        if mean:
            acc = (y_pred == y).float().mean()  # padding is excluded from acc
            return acc
        else:
            correct_count = (y_pred == y).long().sum()
            return correct_count

    def train_epoch(self, epoch):
        self.model.to(self.device)
        self.model.train()

        tr_correct_cnt, tr_total_cnt = 0, 0
        tr_loss = 0.0
        # train_loader = tqdm(self.train_dataloader)
        train_loader = self.train_dataloader

        for step, batch in enumerate(train_loader):

            self.model.zero_grad()

            sent1 = batch['sent1']
            input_1, valid_length_1, token_type_1 = self.transform_to_bert_input(
                sent1)
            embed1 = self.model(input_1, valid_length_1, token_type_1)

            sent2 = batch['sent2']
            input_2, valid_length_2, token_type_2 = self.transform_to_bert_input(
                sent2)
            embed2 = self.model(input_2, valid_length_2, token_type_2)

            label = batch['label']
            label = torch.tensor(label).long().to(self.device)

            pred = self.model.get_logit(embed1, embed2)
            loss = self.criterion(pred, label.view(-1))

            tr_loss += loss.item()
            loss.backward()

            if step > 0 and (
                    step) % self.config.gradient_accumulation_steps == 0:
                self.global_step += self.config.gradient_accumulation_steps

                self.optimizer.step()
                self.optimizer.zero_grad()
                self.scheduler.step()

                with torch.no_grad():
                    accuracy = self.compute_acc(pred, label)

                self.tr_acc = accuracy.item()
                self.tr_avg_loss = tr_loss / (step + 1)

                if self.global_step % 100 == 0:  #int(len(self.train_dataloader)/5) ==0:

                    self.logger.info(
                        'epoch : {} /{}, global_step : {} /{}, tr_avg_loss: {:.3f}, tr_acc: {:.2%}'
                        .format(epoch + 1, self.args.epoch, self.global_step,
                                self.t_total, self.tr_avg_loss, self.tr_acc))

    def evaluation(self, epoch):
        self.model.eval()
        eval_correct_cnt, eval_total_cnt = 0, 0
        eval_loss = 0.0

        eval_acc = 0.0
        eval_step = 0.0

        self.logger.info('*****************Evaluation*****************')
        valid_loader = tqdm(self.valid_dataloader)
        for step, batch in enumerate(valid_loader):
            with torch.no_grad():

                sent1 = batch['sent1']
                input_1, valid_length_1, token_type_1 = self.transform_to_bert_input(
                    sent1)
                embed1 = self.model(input_1, valid_length_1, token_type_1)

                sent2 = batch['sent2']
                input_2, valid_length_2, token_type_2 = self.transform_to_bert_input(
                    sent2)
                embed2 = self.model(input_2, valid_length_2, token_type_2)

                label = batch['label']
                label = torch.tensor(label).long().to(self.device)
                pred = self.model.get_logit(embed1, embed2)

            loss = self.criterion(pred, label.view(-1))
            eval_loss += loss.item()

            acc = self.compute_acc(pred, label)
            eval_acc += acc.item()
            eval_step += 1.0

        self.eval_avg_loss = eval_loss / eval_step
        self.eval_avg_acc = eval_acc / eval_step

        self.logger.info(
            'epoch : {} /{}, global_step : {} /{}, eval_loss: {:.3f}, eval_acc: {:.2%}'
            .format(epoch + 1, self.args.epoch, self.global_step, self.t_total,
                    self.eval_avg_loss, self.eval_avg_acc))

    def save_model(self, epoch):
        if self.eval_avg_acc > self.best_eval_acc:
            self.best_eval_acc = self.eval_avg_acc

            self.model.to(torch.device('cpu'))
            state = {
                'epoch': epoch + 1,
                'model_state_dict': self.model.state_dict(),
                'opt_state_dict': self.optimizer.state_dict()
            }

            save_model_path = '{}/epoch_{}_step_{}_tr_acc_{:.3f}_tr_loss_{:.3f}_eval_acc_{:.3f}_eval_loss_{:.3f}.pt'.format(
                self.save_path, epoch + 1, self.global_step, self.tr_acc,
                self.tr_avg_loss, self.eval_avg_acc, self.eval_avg_loss)

            # Delete the previous checkpoint
            if len(glob.glob(self.save_path + '/epoch*.pt')) > 0:
                os.remove(glob.glob(self.save_path + '/epoch*.pt')[0])
            torch.save(state, save_model_path)
            self.logger.info(' Model saved to {}'.format(save_model_path))

            os.mkdir(self.save_path +
                     '/epoch_{}_eval_loss_{:.3f}_eval_acc_{:.3f}'.format(
                         epoch + 1, self.eval_avg_loss, self.eval_avg_acc))

    def write_to_tb(self):
        self.tb_writer.add_scalars('loss', {
            'train': self.tr_avg_loss,
            'val': self.eval_avg_loss
        }, self.global_step)
        self.tb_writer.add_scalars('acc', {
            'train': self.tr_acc,
            'val': self.eval_avg_acc
        }, self.global_step)
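Example #18 warms the learning rate up over the first 10% of updates with `WarmupLinearSchedule` and then decays it linearly. A rough equivalent written as a plain `torch.optim.lr_scheduler.LambdaLR`, in case that older scheduler class is unavailable (this is my sketch, not the library implementation):

# Linear warmup for `warmup_steps` updates, then linear decay to zero at
# `t_total` updates, expressed as an LR multiplier for LambdaLR.
from torch.optim.lr_scheduler import LambdaLR

def warmup_linear_schedule(optimizer, warmup_steps, t_total):
    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        return max(0.0, (t_total - step) / max(1, t_total - warmup_steps))
    return LambdaLR(optimizer, lr_lambda)
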
Example #19
0
def train(**kwargs):
    # kwargs.update({'model': 'CNN'})
    opt.parse(kwargs)

    if (opt.use_gpu):
        torch.cuda.set_device(opt.gpu_id)

    if opt.encoder == 'BERT':
        encoder_model = BertForSequenceClassification.from_pretrained(
            "./downloaded_weights/downloaded_bert_base_uncased",
            num_labels=opt.rel_num)
        # print(encoder_model)
        opt.encoder_out_dimension = opt.rel_num
    else:
        encoder_model = getattr(encoder_models, opt.encoder)(opt)
        opt.encoder_out_dimension = encoder_model.out_dimension
    selector_model = getattr(selector_models, opt.selector)(opt)
    # encoder_model = torch.nn.DataParallel(encoder_model, device_ids=[3,6])

    if (opt.use_gpu):
        encoder_model = encoder_model.cuda()
        selector_model = selector_model.cuda()

    # Loading data
    DataModel = getattr(dataset, opt.data + 'Data')
    train_data = DataModel(opt.data_root,
                           train=True,
                           use_bert=opt.use_bert_tokenizer)
    train_data_loader = DataLoader(train_data,
                                   batch_size=opt.batch_size,
                                   shuffle=True,
                                   num_workers=opt.num_workers,
                                   collate_fn=collate_fn)
    print('train data: {}'.format(len(train_data)))

    test_data = DataModel(opt.data_root,
                          train=False,
                          use_bert=opt.use_bert_tokenizer)
    test_data_loader = DataLoader(test_data,
                                  batch_size=opt.batch_size,
                                  shuffle=False,
                                  num_workers=opt.num_workers,
                                  collate_fn=collate_fn)
    print('test data: {}'.format(len(test_data)))

    criterion = nn.CrossEntropyLoss()
    if opt.encoder == 'BERT':
        optimizer = AdamW(
            [{
                'params': encoder_model.parameters()
            }, {
                'params': selector_model.parameters()
            }],
            lr=opt.lr,
            correct_bias=True
        )  # To reproduce BertAdam specific behavior set correct_bias=False
    else:
        optimizer = optim.Adadelta([{
            'params': encoder_model.parameters()
        }, {
            'params': selector_model.parameters()
        }],
                                   lr=opt.lr,
                                   rho=1.0,
                                   eps=1e-6,
                                   weight_decay=opt.weight_decay)

    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=2,
                                     t_total=3)  # PyTorch scheduler
    ### and used like this:
    # for batch in train_data:
    #     loss = model(batch)
    #     loss.backward()
    #     torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)

    #     optimizer.zero_grad()

    # if opt.encoder == "BERT" and False:
    #     optimizer = optim.SGD([
    #         {'params': selector_model.parameters()}
    #         ], lr=opt.lr)
    # else:

    optimizer = optim.SGD([{
        'params': encoder_model.parameters()
    }, {
        'params': selector_model.parameters()
    }],
                          lr=opt.lr)

    max_pre = 0.0
    max_rec = 0.0
    for epoch in range(opt.num_epochs):
        # if opt.encoder == "BERT":
        encoder_model.train()
        selector_model.train()
        print("*" * 50)
        print("Epoch {}".format(epoch))
        total_loss = 0
        max_insNum = 0
        for batch_num, (data, label_set) in enumerate(train_data_loader):
            # if (batch_num>2000):
            #     break
            # label_set holds the labels of each bag (there may be up to 4 labels, but we only want the first)

            labels = []
            outs = torch.empty([0, 53])

            empty = True  # if every bag label in this batch is zero, the batch is empty; skip it to avoid an error
            for l in label_set:
                if (l[0] != 0):
                    labels.append(l[0])
                    empty = False
            if empty:
                continue
            # labels = [l[0] for l in label_set]
            # Each batch contains {batch_size} bags; we process one bag at a time,
            # and the model must produce a representation for each instance in the bag.

            if opt.use_gpu:
                labels = torch.LongTensor(labels).cuda()
                outs = outs.cuda()
            else:
                labels = torch.LongTensor(labels)

            optimizer.zero_grad()
            train_cor = 0
            for idx, bag in enumerate(data):
                insNum = bag[1]
                # if insNum > max_insNum:
                #     max_insNum = insNum
                #     print(max_insNum)
                label = label_set[idx][0]  # Label of the current bag
                if (label_set[idx][0] == 0):
                    continue

                ins_outs = torch.empty(0)
                instances = bag[2]
                pf_list = []
                mask_list = []
                if opt.encoder != 'BERT':
                    pf_list = bag[3]
                    mask_list = bag[5]

                # pf_list = bag[3]
                ins_out = torch.empty(0)
                encoder_model.batch_size = insNum
                if opt.use_gpu:
                    instances = torch.LongTensor(instances).cuda()

                if opt.encoder == 'BERT':
                    # with torch.no_grad():
                    # print(instances.size(0))
                    if insNum > opt.max_sentence_in_bag:
                        ins_outs = encoder_model(
                            instances[:opt.max_sentence_in_bag])[0]
                    else:
                        ins_outs = encoder_model(instances)[0]
                    # ins_outs = ins_outs[0]
                    # print(ins_outs[0].size())
                else:

                    for idx, instance in enumerate(instances):
                        if opt.use_gpu:
                            pfs = torch.LongTensor(pf_list[idx]).cuda()
                            masks = torch.LongTensor(mask_list[idx]).cuda()
                        else:
                            pfs = torch.LongTensor(pf_list[idx])
                            masks = torch.LongTensor(mask_list[idx])

                        if opt.encoder == 'PCNN':
                            ins_out = encoder_model(instance, pfs, masks)
                        else:
                            ins_out = encoder_model(instance, pfs)

                        if (opt.use_gpu):
                            ins_out = ins_out.cuda()
                            ins_outs = ins_outs.cuda()

                        ins_outs = torch.cat((ins_outs, ins_out), 0)
                        del instance, ins_out

                        if idx >= opt.max_sentence_in_bag:
                            break

                bag_feature = selector_model(ins_outs)
                if opt.use_gpu: bag_feature = bag_feature.cuda()
                if (torch.max(bag_feature.squeeze(), 0)[1] == label):
                    train_cor += 1

                outs = torch.cat((outs, bag_feature), 0)
                del ins_outs, bag_feature

            # outs = outs.squeeze()
            # print("outs.size(): ", outs.size(), '\n', "labels.size(): ", labels.size())
            # print(outs,labels)
            loss = criterion(outs, labels)
            total_loss += loss.item()
            avg_loss = total_loss / (batch_num + 1)
            sys.stdout.write(
                "\rbatch number: {:6d}\tloss: {:7.4f}\ttrain_acc: {:7.2f}\t".
                format(batch_num, avg_loss, train_cor / len(labels)))
            sys.stdout.flush()
            # sys.stdout.write('\033')

            loss.backward()
            optimizer.step()
            if opt.encoder == 'BERT':
                scheduler.step()
            del outs, labels

        if (opt.skip_predict != True):
            with torch.no_grad():
                predict(encoder_model, selector_model, test_data_loader)

    t = time.strftime('%m_%d_%H_%M.pth')
    torch.save(encoder_model.state_dict(),
               'checkpoints/{}_{}'.format(opt.encoder, t))
    torch.save(selector_model.state_dict(),
               'checkpoints/{}_{}'.format(opt.selector, t))
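Several examples above interleave `optimizer.step()`, `scheduler.step()` and gradient zeroing in slightly different orders. With per-step schedulers like these, the usual ordering is backward, optional clipping, optimizer step, scheduler step, then zero the gradients. A minimal sketch of one update (all names are illustrative):

import torch

# One canonical update: backward, clip, optimizer step, scheduler step, zero grads.
def update_step(model, optimizer, scheduler, loss, max_grad_norm=1.0):
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
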
Example #20
0
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)
    parser.add_argument('--self_training', action='store_true', default=False)
    parser.add_argument('--unlabeled_data_dir',
                        type=str,
                        default='data/unlabeled_data')
    parser.add_argument('--self_training_confidence', type=float, default=0.9)
    parser.add_argument('--K', type=float, default=50)
    parser.add_argument('--patience', type=float, default=10)

    args = parser.parse_args()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    data_processor = SequenceLabelingProcessor(task=args.task_name)
    label_list = data_processor.get_labels()
    num_labels = len(label_list) + 1  # add one for IGNORE label

    train_examples = None
    num_train_optimization_steps = 0

    if args.do_train:
        train_examples = data_processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs

        # preparing model configs
    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__

    device = 'cuda' if (torch.cuda.is_available()
                        and not args.no_cuda) else 'cpu'

    if args.use_crf:
        model_cls = XLMRForTokenClassificationWithCRF
    else:
        model_cls = XLMRForTokenClassification

    # creating model
    model = model_cls(pretrained_path=args.pretrained_path,
                      n_labels=num_labels,
                      hidden_size=hidden_size,
                      dropout_p=args.dropout,
                      device=device)

    model.to(device)

    if args.load_model is not None:
        logging.info("Loading saved model {}".format(args.load_model))
        state_dict = torch.load(args.load_model)
        model.load_state_dict(state_dict, strict=True)

    no_decay = ['bias', 'final_layer_norm.weight']

    params = list(model.named_parameters())

    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in params if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [p for n, p in params if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)

    # freeze model if necessary
    if args.freeze_model:
        logger.info("Freezing XLM-R model...")
        for n, p in model.named_parameters():
            if 'xlmr' in n and p.requires_grad:
                p.requires_grad = False

    # The optimizer and scheduler must exist before amp.initialize() below.
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = data_processor.convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, model.encode_word)

        if args.self_training:
            self_training_examples = data_processor.get_unlabeled_examples(
                args.unlabeled_data_dir)
            self_training_features = data_processor.convert_examples_to_features(
                self_training_examples, label_list, args.max_seq_length,
                model.encode_word)

            logging.info("Loaded {} Unlabeled examples".format(
                len(self_training_examples)))

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_data = create_ner_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        val_examples = data_processor.get_dev_examples(args.data_dir)
        val_features = data_processor.convert_examples_to_features(
            val_examples, label_list, args.max_seq_length, model.encode_word)

        val_data = create_ner_dataset(val_features)
        best_val_f1 = 0.0

        ############################# Self Training Loop ######################
        n_iter = 0
        # optimizer and scheduler were created above, before amp.initialize()
        patience = 0
        while 1:

            ############################ Inner Training Loop #####################

            #if n_iter >= 50:
            #    break

            # reset lr

            n_iter += 1

            print(len(train_dataloader))
            loss_fct = nn.BCELoss()
            for epoch_ in tqdm(range(args.num_train_epochs),
                               desc="Epoch",
                               disable=args.no_pbar):

                tr_loss = 0
                tbar = tqdm(train_dataloader,
                            desc="Iteration",
                            disable=args.no_pbar)

                model.train()
                for step, batch in enumerate(tbar):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, label_ids, l_mask, valid_ids, = batch
                    loss, _ = model(input_ids,
                                    label_ids,
                                    l_mask,
                                    valid_ids,
                                    get_sent_repr=True)

                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    tr_loss += loss.item()
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        scheduler.step()  # Update learning rate schedule
                        model.zero_grad()

                    tbar.set_description('Loss = %.4f' % (tr_loss /
                                                          (step + 1)))
                logger.info("Evaluating on validation set...\n")
                #torch.save(model.state_dict(), open(os.path.join(args.output_dir, 'model.pt'), 'wb'))
                f1, report = evaluate_model_seq_labeling(
                    model, val_data, label_list, args.eval_batch_size,
                    args.use_crf, device)
                if f1 > best_val_f1:
                    best_val_f1 = f1
                    logger.info(
                        "\nFound better f1=%.4f on validation set. Saving model\n"
                        % (f1))
                    logger.info("\n%s\n" % (report))

                    torch.save(
                        model.state_dict(),
                        open(os.path.join(args.output_dir, 'model.pt'), 'wb'))
                    patience = 0

                else:
                    logger.info("\nNo better F1 score: {}\n".format(f1))
                    patience += 1

            ######################################################################
            if not args.self_training:
                break
            if patience >= args.patience:
                logger.info("No more patience. Exiting.")
                break
            ## get confidence and update train_data, train_dataloader
            # convert unlabeled examples to features

            if len(self_training_features) <= 0:  # no more self-training data
                break

            confident_features, self_training_features = get_top_confidence_samples_seq_labeling(
                model,
                self_training_features,
                batch_size=args.eval_batch_size,
                K=args.K)

            for f in confident_features:
                l_ids = f.label_id
                l_s = [label_map[i] for i in l_ids]
            logging.info("Got %d confident samples" %
                         (len(confident_features)))
            # append new features
            #train_features = data_processor.convert_examples_to_features(
            #         train_examples, label_list, args.max_seq_length, model.encode_word)

            train_features.extend(confident_features)

            print("now we have %d total examples" % len(train_features))

            train_data = create_ner_dataset(train_features)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            for g in optimizer.param_groups:
                g['lr'] = args.learning_rate

            scheduler.step(0)

            #print("Loading best last model...")
            #model.load_state_dict(torch.load(open(os.path.join(args.output_dir, 'model.pt'), 'rb')))

    # load best/ saved model
    state_dict = torch.load(
        open(os.path.join(args.output_dir, 'model.pt'), 'rb'))
    model.load_state_dict(state_dict)
    logger.info("Loaded saved model")

    model.to(device)

    if args.do_eval:
        if args.eval_on == "dev":
            eval_examples = data_processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = data_processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = data_processor.convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, model.encode_word)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        eval_data = create_ner_dataset(eval_features)
        f1_score, report = evaluate_model_seq_labeling(model, eval_data,
                                                       label_list,
                                                       args.eval_batch_size,
                                                       args.use_crf, device)

        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        logger.info("dataset = {}".format(args.data_dir))
        logger.info("model = {}".format(args.output_dir))
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing results to file *****")
            writer.write(report)
            logger.info("Done.")
Example #21
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters (arguments with required=True must be given on the command line)
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "Dataset path. The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type (here: bert), selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help=
        "下载好的预训练模型. Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "模型预测和断点文件的存放路径. The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "预训练的配置名字或路径. Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "预训练分词器名字或路径. Pretrained tokenizer name or path if not the same as model_name"
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "从亚马逊s3下载的预训练模型存放路径. Where do you want to store the pre-trained models downloaded from s3"
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "最长序列长度. The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="是否训练. Whether to run training.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="是否测试. Whether to run testing.")
    parser.add_argument("--predict_eval",
                        action='store_true',
                        help="是否预测验证集. Whether to predict eval set.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="是否验证. Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="是否训练中跑验证. Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="是否用小写模型. Set this flag if you are using an uncased model.")

    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="训练时每个GPU/CPU上的batch size. Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="验证时每个GPU/CPU上的batch size. Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "反向传播前梯度累计的次数. Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="Adam的初始学习率. The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="权重衰减系数. Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Adam的Epsilon系数. Epsilon for Adam optimizer.")
    parser.add_argument(
        "--max_grad_norm",
        default=1.0,
        type=float,
        help=
        " 如果所有参数的gradient组成的向量的L2 norm大于max norm,那么需要根据L2 norm/max_norm进行缩放。从而使得L2 norm小于预设的clip_norm. Max gradient norm."
    )
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="训练epoch数. Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float, help="")

    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument(
        "--warmup_steps",
        default=0,
        type=int,
        help="线性warmup的steps. Linear warmup over warmup_steps.")
    parser.add_argument("--split_num",
                        default=3,
                        type=int,
                        help="测试集划分. text split")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="日志更新steps. Log every X updates steps.")
    parser.add_argument(
        '--save_steps',
        type=int,
        default=50,
        help="断点文件保存steps. Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "评估所有的断点. Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="不用cuda. Avoid using CUDA when available")
    parser.add_argument(
        '--overwrite_output_dir',
        action='store_true',
        help="重写输出路径. Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="重写训练和评估的缓存. Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="初始化用的随机种子. random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "是否用16位混合精度. Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "fp16的优化level. For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="为了分布式训练. For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="远程debug用的ip. For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="远程debug用的端口. For distant debugging.")
    parser.add_argument("--freeze",
                        default=0,
                        type=int,
                        required=False,
                        help="冻结BERT. freeze bert.")
    parser.add_argument("--not_do_eval_steps",
                        default=0.35,
                        type=float,
                        help="not_do_eval_steps.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        # Single-machine mode: use all available GPUs (or the CPU if CUDA is unavailable/disabled)
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        # Pin this process to one GPU (local_rank is the index of the GPU used by the current process)
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Create the output directory (ignore it if it already exists)
    os.makedirs(args.output_dir, exist_ok=True)

    # Load the pre-trained BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)

    # Load the BERT config
    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=2)

    # Prepare model: load the BERT-based sequence classification model
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path, args, config=config)

    # Enable FP16
    if args.fp16:
        model.half()
    model.to(device)
    # In distributed mode (one GPU per process), wrap the model with DistributedDataParallel
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    # With several GPUs, torch.nn.DataParallel automatically spreads each batch across them
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Total batch size = number of GPUs * per-GPU batch size
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    if args.do_train:
        # Prepare data loader: read the data and build model-ready inputs
        train_examples = read_examples(os.path.join(args.data_dir,
                                                    'train.csv'),
                                       is_training=True)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      args.split_num, True)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        # Random sampling on a single machine, distributed sampling otherwise
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        # Build the dataloader
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)
        # Number of training steps
        num_train_optimization_steps = args.train_steps

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # thus it would produce a None grad that breaks apex
        param_optimizer = [n for n in param_optimizer]

        # Parameters listed in no_decay are excluded from weight decay.
        # BatchNorm fixes a channel C and normalizes over [B, H, W] (to zero mean, unit variance); it suits CNNs.
        # LayerNorm fixes a sample N and normalizes over [C, H, W]; it suits RNNs: BatchNorm expects a fixed network
        # depth, while an RNN's depth varies with sequence length, and LayerNorm depends on neither the batch size
        # nor the sequence length, so it also works with batch size 1 and variable-length sequences.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        # Configure the optimizer and the warmup schedule
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps //
                                         args.gradient_accumulation_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)  # iterate over the dataloader indefinitely

        # Run an initial evaluation before training
        for file in ['dev.csv']:
            inference_labels = []
            gold_labels = []
            inference_logits = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=True)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)

            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            # Run prediction for full data: build the dev-set dataloader
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)
            # Switch to eval mode (dropout and batch-norm updates disabled)
            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                # Move the batch to the device
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                # No gradients needed for evaluation
                with torch.no_grad():
                    tmp_eval_loss, logits = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                    # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(np.argmax(logits, axis=1))
                gold_labels.append(label_ids)
                inference_logits.append(logits)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            gold_labels = np.concatenate(gold_labels, 0)
            inference_logits = np.concatenate(inference_logits, 0)
            model.train()
            eval_loss = eval_loss / nb_eval_steps  # average dev-set loss
            eval_accuracy = accuracy(inference_logits,
                                     gold_labels)  # dev-set accuracy

            result = {
                'eval_loss': eval_loss,
                'eval_F1': eval_accuracy,
                'global_step': global_step
            }
            # Write the dev-set metrics to eval_results.txt
            output_eval_file = os.path.join(args.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('*' * 80)
                writer.write('\n')
            # If this is the best model so far, save it
            if eval_accuracy > best_acc and 'dev' in file:
                print("=" * 80)
                print("Best F1", eval_accuracy)
                print("Saving Model......")
                best_acc = eval_accuracy
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
                print("=" * 80)
            else:
                print("=" * 80)

        model.train()

        # Step-based training loop over batches
        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss, _ = model(input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids)
            nb_tr_examples += input_ids.size(0)
            del input_ids, input_mask, segment_ids, label_ids
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.fp16 and args.loss_scale != 1.0:
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            bar.set_description("loss {}".format(train_loss))

            nb_tr_steps += 1

            # Backward pass (FP16 uses the optimizer's backward)
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            # Update once the gradients have been accumulated
            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(
                        global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()  # parameter update
                scheduler.step()  # learning-rate update
                optimizer.zero_grad()  # clear gradients so they do not keep accumulating
                global_step += 1

            # Every args.eval_steps * args.gradient_accumulation_steps steps, report the training loss
            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            # Every args.eval_steps * args.gradient_accumulation_steps steps, evaluate on the dev set
            if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and (
                    step + 1) % (args.eval_steps *
                                 args.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(
                        args.data_dir, file),
                                                  is_training=True)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length,
                        args.split_num, False)
                    all_input_ids = torch.tensor(select_field(
                        eval_features, 'input_ids'),
                                                 dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(
                        eval_features, 'input_mask'),
                                                  dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(
                        eval_features, 'segment_ids'),
                                                   dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features],
                                             dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
                            # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)

                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best F1", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model itself
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)

    # Predict on the test sets
    if args.do_test:
        del model
        gc.collect()  # free memory
        args.do_train = False  # stop training
        # Load the best trained model file
        model = BertForSequenceClassification.from_pretrained(os.path.join(
            args.output_dir, "pytorch_model.bin"),
                                                              args,
                                                              config=config)
        if args.fp16:
            # nn.Module.half() casts the model's float32 parameters to float16
            model.half()
        model.to(device)  # move the model to the device

        # Configure multi-GPU / distributed execution
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Predict on the dev set and both test sets
        for file, flag in [('dev.csv', 'dev'), ('CSC_test.csv', 'CSC_test'),
                           ('NS_test.csv', 'NS_test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))
            # Save the prediction files
            if flag == 'CSC_test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df[['qid', 'label_0',
                    'label_1']].to_csv(os.path.join(args.output_dir,
                                                    "sub_CSC.csv"),
                                       index=False)
            if flag == 'NS_test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df[['qid', 'label_0',
                    'label_1']].to_csv(os.path.join(args.output_dir,
                                                    "sub_NS.csv"),
                                       index=False)
            if flag == 'dev':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df[['label_0',
                    'label_1']].to_csv(os.path.join(args.output_dir,
                                                    "sub_dev.csv"),
                                       index=False)
    # Predict on the dev set only
    if args.predict_eval:
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification.from_pretrained(os.path.join(
            args.output_dir, "pytorch_model.bin"),
                                                              args,
                                                              config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('dev.csv', 'dev')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))
            if flag == 'dev':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df[['label_0',
                    'label_1']].to_csv(os.path.join(args.output_dir,
                                                    "sub_dev.csv"),
                                       index=False)
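
Example #21 (and Example #23 below) trains for a fixed number of steps and wraps the dataloader in cycle(). If that cycle is itertools.cycle, the batches of the first epoch are cached and replayed in the same order, so a shuffling sampler never reshuffles; a small generator that simply restarts the dataloader is a common alternative. This is a sketch of that pattern, not necessarily the cycle these examples import:

def cycle(dataloader):
    # Yield batches forever; restarting the dataloader re-invokes its sampler,
    # so a RandomSampler reshuffles on every pass.
    while True:
        for batch in dataloader:
            yield batch
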
Example #22
    def train(self, dataloader: DataLoader, train_config: TrainConfig):
        """
        Train the model with the given data and config

        :param dataloader:
            the data for the training
        :param train_config:
            the configuration for the training
        """
        if train_config.output_path is not None:
            os.makedirs(train_config.output_path, exist_ok=True)
            if os.listdir(train_config.output_path):
                raise ValueError("Output directory ({}) already exists and is not empty.".format(
                    train_config.output_path))

            self.save(train_config.output_path, save_config=True, save_model=False)

        self.best_score = -9999
        num_train_steps = int(len(dataloader) / train_config.gradient_accumulation_steps * train_config.epochs)

        # Prepare optimizer
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': train_config.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        t_total = num_train_steps
        if train_config.local_rank != -1:
            t_total = t_total // torch.distributed.get_world_size()

        optimizer = AdamW(optimizer_grouped_parameters, lr=train_config.learning_rate,
                          eps=train_config.adam_epsilon, correct_bias=train_config.correct_bias)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_config.warmup_steps, t_total=t_total)

        if train_config.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(self.model, optimizer, opt_level=train_config.fp16_opt_level)


        global_step = 0

        for epoch in trange(train_config.epochs, desc="Epoch"):
            training_steps = 0
            self.model.train()
            for step, batch in enumerate(tqdm(dataloader, desc="Iteration")):
                batch = batch_to_device(batch, self.device)
                input_ids, segment_ids, input_masks, label_ids = batch
                loss = self.model(input_ids, segment_ids, input_masks, label_ids)

                if train_config.gradient_accumulation_steps > 1:
                    loss = loss / train_config.gradient_accumulation_steps

                if train_config.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), train_config.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), train_config.max_grad_norm)

                training_steps += 1
                if (step + 1) % train_config.gradient_accumulation_steps == 0:
                    scheduler.step()
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if train_config.evaluation_steps > 0 and training_steps % train_config.evaluation_steps == 0:
                    self._eval_during_training(train_config, epoch, training_steps)
                    self.model.train()

            self._eval_during_training(train_config, epoch, -1)
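
Example #22 calls a batch_to_device helper that is not defined in the snippet. Assuming a batch is a tuple of tensors (as the unpacking inside the loop suggests), a minimal sketch could look like this:

import torch


def batch_to_device(batch, device):
    # Hypothetical helper: move every tensor in the batch to the target device,
    # recursing into lists/tuples/dicts and leaving other values untouched.
    if torch.is_tensor(batch):
        return batch.to(device)
    if isinstance(batch, (list, tuple)):
        return type(batch)(batch_to_device(b, device) for b in batch)
    if isinstance(batch, dict):
        return {k: batch_to_device(v, device) for k, v in batch.items()}
    return batch
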
Example #23
    def train(self):
        model = BertForSequenceClassification.from_pretrained(
            self.args.model_name_or_path, self.args, config=self.config)
        model.to(self.device)

        logger.info('Preparing data')
        data = DATABDCI(
            debug=False,
            data_dir='/home/lsy2018/文本匹配/DATA/DATA_BDCI/',
            data_process_output='/home/lsy2018/文本匹配/DATA/DATA_BDCI/data_1014/')

        train_examples = data.read_examples(
            os.path.join(self.data_process_output, 'train.csv'))
        train_features = data.convert_examples_to_features(
            train_examples, self.tokenizer, self.max_seq_length)
        all_input_ids = torch.tensor(data.select_field(train_features,
                                                       'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(data.select_field(
            train_features, 'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(data.select_field(
            train_features, 'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)

        # RandomSampler shuffles the training data on every pass
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=self.batch_size //
                                      self.gradient_accumulation_steps)

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # thus it produce None grad that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.learning_rate,
                          eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.warmup_steps,
                                         t_total=self.train_steps)

        best_acc = 0
        global_step = 0
        model.train()
        train_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        bar = tqdm(range(self.train_steps), total=self.train_steps)
        train_dataloader = cycle(train_dataloader)

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)

            train_loss += loss.item()
            train_loss = round(
                train_loss * self.gradient_accumulation_steps /
                (nb_tr_steps + 1), 4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            loss.backward()

            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if self.do_eval and (step + 1) % (
                    self.eval_steps * self.gradient_accumulation_steps) == 0:
                inference_labels = []
                scores = []
                gold_labels = []
                inference_logits = []
                eval_examples = data.read_examples(
                    os.path.join(self.data_process_output, 'dev.csv'))
                eval_features = data.convert_examples_to_features(
                    eval_examples, self.tokenizer, self.max_seq_length)
                ID1 = [x.sentence_ID1 for x in eval_examples]
                ID2 = [x.sentence_ID2 for x in eval_examples]

                all_input_ids = torch.tensor(data.select_field(
                    eval_features, 'input_ids'),
                                             dtype=torch.long)
                all_input_mask = torch.tensor(data.select_field(
                    eval_features, 'input_mask'),
                                              dtype=torch.long)
                all_segment_ids = torch.tensor(data.select_field(
                    eval_features, 'segment_ids'),
                                               dtype=torch.long)
                all_label = torch.tensor([f.label for f in eval_features],
                                         dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label)

                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", self.batch_size)

                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=self.batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                count = 0

                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    # ID1_list_eachbatch = ID1[count*args.eval_batch_size:(count+1)*args.eval_batch_size]
                    # ID2_list_eachbatch = ID2[count * args.eval_batch_size:(count + 1) * args.eval_batch_size]
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)

                    with torch.no_grad():
                        tmp_eval_loss = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask,
                                              labels=label_ids)
                        logits = model(input_ids=input_ids,
                                       token_type_ids=segment_ids,
                                       attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        # scores.append(logits)
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                # Aggregate over the whole dev set (this belongs outside the batch loop)
                gold_labels = np.concatenate(gold_labels, 0)
                inference_logits = np.concatenate(inference_logits, 0)
                # scores = np.concatenate(scores, 0)
                model.train()
                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = accuracy(inference_logits, gold_labels)
                # eval_mrr = compute_MRR(scores, gold_labels, ID1, ID2)

                result = {
                    'eval_loss': eval_loss,
                    'eval_F1': eval_accuracy,
                    'global_step': global_step,
                    # 'mrr': eval_mrr,
                    'loss': train_loss
                }

                output_eval_file = os.path.join(self.output_dir,
                                                "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write('*' * 80)
                    writer.write('\n')
                if eval_accuracy > best_acc:
                    print("=" * 80)
                    print("Best F1", eval_accuracy)
                    print("Saving Model......")
                    best_acc = eval_accuracy
                    # Save a trained model
                    model_to_save = model.module if hasattr(
                        model, 'module') else model  # Only save the model itself
                    output_model_file = os.path.join(self.output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    print("=" * 80)
                else:
                    print("=" * 80)
Example #24
    def multitask_train(self, dataloaders: List[DataLoader], losses: List[LossFunction], train_config: TrainConfig):
        """
        Train the model with the given data and config with the given loss for each dataset

        Each dataloader is sampled in turn for one batch.
        We sample only as many batches from each dataloader as there are in the smallest one
        to make sure of equal training with each dataset.

        :param dataloaders:
            the data for the training
        :param losses:
            the losses for the dataloaders
            the losses still uses the configuration as given in sbert_config, so you cannot for example
            have two different SBERTLossFunction.SOFTMAX with different number of labels
        :param train_config:
            the configuration for the training
        """
        if train_config.output_path is not None:
            os.makedirs(train_config.output_path, exist_ok=True)
            if os.listdir(train_config.output_path):
                raise ValueError("Output directory ({}) already exists and is not empty.".format(
                    train_config.output_path))

            self.save(train_config.output_path, save_config=True, save_model=False)

        self.best_score = -9999

        min_batches = min([len(dataloader) for dataloader in dataloaders])
        num_dataloaders = len(dataloaders)
        num_train_steps = int(num_dataloaders*min_batches / train_config.gradient_accumulation_steps * train_config.epochs)

        # Prepare optimizer
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        t_total = num_train_steps
        if train_config.local_rank != -1:
            t_total = t_total // torch.distributed.get_world_size()

        optimizer = AdamW(optimizer_grouped_parameters, lr=train_config.learning_rate,
                          eps=train_config.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_config.warmup_steps, t_total=t_total)

        if train_config.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(self.model, optimizer, opt_level=train_config.fp16_opt_level)

        global_step = 0

        for epoch in trange(train_config.epochs, desc="Epoch"):
            training_steps = 0
            self.model.train()
            iterators = [iter(dataloader) for dataloader in dataloaders]
            for step in trange(num_dataloaders*min_batches, desc="Iteration"):
                idx = step % num_dataloaders
                batch = batch_to_device(next(iterators[idx]), self.device)
                input_ids, segment_ids, input_masks, label_ids = batch
                loss = self.model(input_ids, segment_ids, input_masks, label_ids, losses[idx])

                if train_config.gradient_accumulation_steps > 1:
                    loss = loss / train_config.gradient_accumulation_steps

                if train_config.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), train_config.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), train_config.max_grad_norm)

                training_steps += 1
                if (step + 1) % train_config.gradient_accumulation_steps == 0:
                    scheduler.step()
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if train_config.evaluation_steps > 0 and training_steps % train_config.evaluation_steps == 0:
                    self._eval_during_training(train_config, epoch, training_steps)
                    self.model.train()

            self._eval_during_training(train_config, epoch, -1)
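
The docstring of multitask_train explains that each dataloader is sampled in turn and that only min_batches batches are drawn from each one. A toy, standalone illustration of that sampling schedule (not part of the original class):

import torch
from torch.utils.data import DataLoader, TensorDataset

# Two datasets of different sizes; the smaller one limits both.
loader_a = DataLoader(TensorDataset(torch.arange(10.0)), batch_size=2)
loader_b = DataLoader(TensorDataset(torch.arange(4.0)), batch_size=2)
dataloaders = [loader_a, loader_b]

min_batches = min(len(dl) for dl in dataloaders)       # 2 batches per dataset
iterators = [iter(dl) for dl in dataloaders]
for step in range(len(dataloaders) * min_batches):
    idx = step % len(dataloaders)                      # alternate between datasets
    (batch,) = next(iterators[idx])
    print(f"step {step}: dataset {idx}, batch {batch.tolist()}")
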
Example #25
                    if fold not in [test_fold, val_fold]]

    acc = defaultdict(lambda: None)
    for epoch in range(args.n_epochs):
        print("Epoch:", epoch + 1)

        model.train()
        train_loss = val_loss = test_loss = 0
        for row in train_data:
            outputs = model(long_tensor(row["text_tokens"]).unsqueeze(0),
                            labels=long_tensor(
                                row["source_label"]).unsqueeze(0))
            loss, logits = outputs[:2]
            train_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("train_loss:", train_loss)

        model.eval()
        with torch.no_grad():
            accs = defaultdict(list)
            for row in val_data:
                outputs = model(long_tensor(row["text_tokens"]).unsqueeze(0),
                                labels=long_tensor(
                                    row["source_label"]).unsqueeze(0))
                loss, logits = outputs[:2]
                val_loss += loss.item()
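
Example #25 uses a long_tensor helper that is not shown; presumably it wraps a list of token ids into a torch.long tensor on the training device. A minimal sketch under that assumption:

import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def long_tensor(values):
    # Hypothetical helper: turn a list of ints into a LongTensor on DEVICE.
    return torch.tensor(values, dtype=torch.long, device=DEVICE)
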
Example #26
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)


        # logger.info(f'Fold {split_index + 1}')
        train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader()

        num_train_optimization_steps = self.train_steps

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path)
        model = BertForTokenClassification.from_pretrained(self.model_name_or_path,self.args, config=config)
        model.to(self.device)
        model.train()
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': self.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        best_MRR = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        train_dataloader = cycle(train_dataloader)

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids,input_mask,segment_ids,\
            utterance_mask,domain_mask, \
            slot_mask,hist_mask,\
            label_value_start,label_value_end,\
            label_domainslot = batch

            loss_tokenstart, loss_tokenend, loss_domainslot = model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
                utterance_mask=utterance_mask,
                domain_mask=domain_mask,
                slot_mask=slot_mask,
                hist_mask=hist_mask,
                label_value_start=label_value_start,
                label_value_end=label_value_end,
                label_domainslot=label_domainslot
            )
            loss = loss_tokenstart + loss_tokenend + loss_domainslot
            # loss = loss_domainslot
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
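            # update the weights only every gradient_accumulation_steps batches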
            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

            if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                for file in ['de.csv']:
                    gold_value_start = []
                    gold_value_end = []
                    gold_domainslot = []
                    scores_value_start = []
                    scores_value_end = []
                    scores_domainslot = []
                    dialogueID = [x.guid for x in eval_examples]
                    utterance_text = [x.text_eachturn for x in eval_examples]
                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", self.eval_batch_size)

                    model.eval()
                    eval_loss_tokens_start, eval_loss_tokens_end, eval_loss_domainslot = 0, 0, 0
                    eval_F1_tokens_start, eval_F1_tokens_end = 0, 0
                    eval_F1_sentence_domainslot, eval_F1_tokens_domainslot = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for (input_ids, input_mask, segment_ids,
                         utterance_mask, domain_mask,
                         slot_mask, hist_mask,
                         label_value_start, label_value_end,
                         label_domainslot) in eval_dataloader:
                        input_ids = input_ids.to(self.device)
                        input_mask = input_mask.to(self.device)
                        segment_ids = segment_ids.to(self.device)
                        utterance_mask = utterance_mask.to(self.device)
                        domain_mask = domain_mask.to(self.device)
                        slot_mask = slot_mask.to(self.device)
                        hist_mask = hist_mask.to(self.device)
                        label_value_start = label_value_start.to(self.device)
                        label_value_end = label_value_end.to(self.device)
                        label_domainslot = label_domainslot.to(self.device)


                        with torch.no_grad():
                            batch_eval_loss_value_start, batch_eval_loss_value_end, batch_eval_loss_domainslot = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask=utterance_mask,
                                domain_mask=domain_mask,
                                slot_mask=slot_mask,
                                hist_mask=hist_mask,
                                label_value_start=label_value_start,
                                label_value_end=label_value_end,
                                label_domainslot=label_domainslot
                            )
                            logits_value_start, logits_value_end, logits_domainslot = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask=utterance_mask,
                                domain_mask=domain_mask,
                                slot_mask=slot_mask,
                                hist_mask=hist_mask,
                            )
                        logits_value_start = logits_value_start.cpu().numpy()
                        logits_value_end = logits_value_end.cpu().numpy()
                        logits_domainslot = logits_domainslot.cpu().numpy()

                        label_value_start = label_value_start.to('cpu').numpy()
                        label_value_end = label_value_end.to('cpu').numpy()
                        label_domainslot = label_domainslot.to('cpu').numpy()

                        scores_value_start.append(logits_value_start)
                        scores_value_end.append(logits_value_end)
                        scores_domainslot.append(logits_domainslot)

                        gold_value_start.append(label_value_start)
                        gold_value_end.append(label_value_end)
                        gold_domainslot.append(label_domainslot)

                        eval_loss_tokens_start += batch_eval_loss_value_start.mean().item()
                        eval_loss_tokens_end += batch_eval_loss_value_end.mean().item()
                        eval_loss_domainslot += batch_eval_loss_domainslot.mean().item()

                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_value_start = np.concatenate(gold_value_start, 0)
                    gold_value_end = np.concatenate(gold_value_end, 0)
                    gold_domainslot = np.concatenate(gold_domainslot, 0)

                    scores_value_start = np.concatenate(scores_value_start, 0)
                    scores_value_end = np.concatenate(scores_value_end, 0)
                    scores_domainslot = np.concatenate(scores_domainslot, 0)

                    model.train()
                    eval_loss_tokens_start = eval_loss_tokens_start / nb_eval_steps
                    eval_loss_tokens_end = eval_loss_tokens_end / nb_eval_steps
                    eval_loss_domainslot = eval_loss_domainslot / nb_eval_steps

                    eval_F1_valuestart, eval_F1_valueend, F1_domainslot = compute_jointGoal_domainslot(
                        dialogueID,
                        utterance_text,
                        scores_value_start,
                        scores_value_end,
                        scores_domainslot,
                        gold_value_start,
                        gold_value_end,
                        gold_domainslot
                    )


                    print(
                        'F1_domainslot', F1_domainslot,
                        'eval_F1_valuestart', eval_F1_valuestart,
                        'eval_F1_valueend', eval_F1_valueend,
                        'global_step', global_step,
                        'loss', train_loss
                    )
                    result = {

                        'eval_loss_tokens_start':eval_loss_tokens_start,
                        'eval_loss_tokens_end': eval_loss_tokens_end,
                        'eval_loss_domainslot':eval_loss_domainslot,

                        'F1_domainslot': F1_domainslot,
                        'eval_F1_valuestart': eval_F1_valuestart,
                        'eval_F1_valueend': eval_F1_valueend,
                        'global_step': global_step,
                        'loss': train_loss}

                    output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_F1_valuestart > best_acc:
                        print("=" * 80)
                        print("Best jointGoal", eval_F1_valuestart)
                        print("Saving Model......")
                        # best_acc = eval_accuracy
                        best_acc = eval_F1_valuestart
                        # Save a trained model
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(self.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
Example #27
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Optional path to meta data.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float, help="")

    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int, help="text split")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)

    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path, args, config=config)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
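    # scale the effective batch sizes by the number of GPUs in use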
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    if args.do_train:

        # Prepare data loader

        train_examples = read_examples(os.path.join(args.data_dir,
                                                    'train.csv'),
                                       is_training=True)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      args.split_num, True)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # as it produces None grads that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.fp16 and args.loss_scale != 1.0:
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

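            # NOTE: this fp16 branch expects apex-style helpers (optimizer.backward, warmup_linear)
            # and args.loss_scale / args.warmup_proportion defined elsewhere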
            if args.fp16:
                optimizer.backward(loss)
            else:

                loss.backward()

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(
                        global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if args.do_eval and (step + 1) % (
                    args.eval_steps * args.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(
                        args.data_dir, file),
                                                  is_training=True)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length,
                        args.split_num, False)
                    all_input_ids = torch.tensor(select_field(
                        eval_features, 'input_ids'),
                                                 dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(
                        eval_features, 'input_mask'),
                                                  dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(
                        eval_features, 'segment_ids'),
                                                   dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features],
                                             dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

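                        # two forward passes: one with labels for the eval loss, one without for the logits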
                        with torch.no_grad():
                            tmp_eval_loss = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                            logits = model(input_ids=input_ids,
                                           token_type_ids=segment_ids,
                                           attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)

                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best F1", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model itself
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
    if args.do_test:
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification.from_pretrained(
            os.path.join(args.output_dir, "pytorch_model.bin"),
            args,
            config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))
            if flag == 'test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub.csv"),
                                       index=False)
Example #28
0
def main():
    output_directory = "dofus-v2"
    num_train_epochs = 3
    train_batch_size = 4
    eval_batch_size = 2
    max_context_length = 512
    learning_rate = 6.25e-5
    weight_decay = 0.01

    nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
    global_step = 0

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Load tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model.to(device)

    train_data_loader, eval_data_loader = get_data_loaders(
        train_batch_size,
        eval_batch_size,
        max_context_length=max_context_length,
        device=device)

    # Preparing the optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.5)

    # Training the model
    model.train()
    previous_loss = float("inf")

    for _ in trange(num_train_epochs, desc="Epoch"):
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(train_data_loader, desc="Training")

        for step, batch_element in enumerate(tqdm_bar):
            try:
                losses = model(batch_element, labels=batch_element)
                loss = losses[0]

                loss.backward()
                optimizer.step()

                tr_loss += loss.item()
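                # exponential moving average of the loss (for display and logging)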
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                global_step += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss, optimizer.param_groups[0]["lr"])

                if step % 1000 == 0:
                    save(model, tokenizer, output_directory)
                    log_tensorboard(model, writer, global_step,
                                    exp_average_loss, tokenizer, device)

                optimizer.zero_grad()
            except RuntimeError:
                print("There was a runtime error with batch:", batch_element)

        # decay the learning rate once per epoch
        scheduler.step()
        previous_loss = evaluate(model, tokenizer, eval_data_loader, tr_loss,
                                 previous_loss, nb_tr_steps, global_step,
                                 output_directory)
        model.train()

    save(model, tokenizer, output_directory)

    # Evaluating
    evaluate(model, tokenizer, eval_data_loader, tr_loss, previous_loss,
             nb_tr_steps, global_step, output_directory)
Example #29
0
def fit(model, training_iter, eval_iter, num_train_steps, device, n_gpu, verbose=1):
    # ------------------ result visualization (TensorBoard) ------------------
    if args.local_rank in [-1, 0]:
        TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
        tb_writer = SummaryWriter('log/%s' % TIMESTAMP)
    # --------------------- optimizer -------------------------
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    t_total = num_train_steps

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)  # or: int(t_total * args.warmup_proportion)
    # --------------------- fp16 half-precision on GPU -----------------------------
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # --------------------- model initialization ----------------------
    model.to(device)
    tr_loss, logging_loss = 0.0, 0.0
    # ------------------------ training ------------------------------
    best_f1 = 0
    #start = time.time()
    global_step = 0
    set_seed(args, n_gpu)  # Added here for reproducibility (even between python 2 and 3)
    bar = tqdm(range(t_total), total=t_total)
    nb_tr_examples, nb_tr_steps = 0, 0

    for step in bar:
        model.train()
        batch = next(training_iter)
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                  # XLM doesn't use segment_ids
                  'labels': batch[3]}
        encode = model(**inputs)
        encode = encode[0]  # extract the prediction logits
        loss = model.loss_fn(encode, labels=inputs['labels'])

        if n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            #torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        else:
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        tr_loss += loss.item()
        train_loss = round(tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        bar.set_description("loss {}".format(train_loss))
        nb_tr_examples += inputs['input_ids'].size(0)
        nb_tr_steps += 1

        if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            optimizer.zero_grad()
            global_step += 1

        if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0:
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))


        if args.local_rank in [-1, 0] and \
                args.do_eval and (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0:

            # ----------------------- validation ----------------------------
            model.eval()
            y_predicts, y_labels = [], []
            eval_loss, eval_acc, eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0

            for _, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                          # XLM doesn't use segment_ids
                          'labels': batch[3]}
                with torch.no_grad():
                    encode = model(**inputs)
                    encode = encode[0]  # extract the prediction logits
                    eval_los = model.loss_fn(encode, labels=inputs['labels'])

                    predicts = model.predict(encode)#.detach().cpu().numpy()

                nb_eval_examples += inputs['input_ids'].size(0)
                nb_eval_steps += 1
                eval_loss += eval_los.mean().item()
                y_predicts.append(torch.from_numpy(predicts))

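                # flatten the labels and drop padded positions (label == -1)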
                labels = inputs['labels'].view(1, -1)
                labels = labels[labels != -1]
                y_labels.append(labels)

            eval_loss = eval_loss / nb_eval_steps
            eval_predicted = torch.cat(y_predicts, dim=0).cpu().numpy()
            eval_labeled = torch.cat(y_labels, dim=0).cpu().numpy()

            eval_f1 = model.acc_rec_f1(eval_predicted, eval_labeled)#eval_acc, eval_rec,

            logger.info(
                '\n\nglobal_step %d - train_loss: %4f - eval_loss: %4f - eval_f1:%4f\n'
                % (global_step,
                   train_loss,
                   eval_loss,
                   eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(model, args.output_dir)

            if args.local_rank in [-1, 0]:
                tb_writer.add_scalar('train_loss', train_loss, step)
                tb_writer.add_scalar('eval_loss', eval_loss, step)
                tb_writer.add_scalar('eval_f1', eval_f1, step)
                tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)

    if args.local_rank in [-1, 0]:
        tb_writer.close()
Example #30
0
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # logger.info(f'Fold {split_index + 1}')
        train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader(
        )

        num_train_optimization_steps = self.train_steps

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path)
        model = BertForTokenClassification.from_pretrained(
            self.model_name_or_path, self.args, config=config)
        model.to(self.device)
        model.train()
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.learning_rate,
                          eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.warmup_steps,
                                         t_total=self.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        best_MRR = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        train_dataloader = cycle(train_dataloader)

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_domain, label_dependcy = batch

            loss_domain, loss_dependcy = model(input_ids=input_ids,
                                               token_type_ids=segment_ids,
                                               attention_mask=input_mask,
                                               label_domain=label_domain,
                                               label_dependcy=label_dependcy)
            loss = loss_domain + loss_dependcy
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

            if (step + 1) % (self.eval_steps *
                             self.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if self.do_eval and (step + 1) % (
                    self.eval_steps * self.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels_domain = []
                    gold_labels_dependcy = []
                    inference_logits = []
                    scores_domain = []
                    scores_dependcy = []
                    ID = [x.guid for x in eval_examples]

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", self.eval_batch_size)

                    model.eval()
                    eval_loss_domain, eval_loss_dependcy, eval_accuracy_domain, eval_accuracy_dependcy = 0, 0, 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_domain, label_dependcy in eval_dataloader:
                        input_ids = input_ids.to(self.device)
                        input_mask = input_mask.to(self.device)
                        segment_ids = segment_ids.to(self.device)
                        label_domain = label_domain.to(self.device)
                        label_dependcy = label_dependcy.to(self.device)

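                        # forward once with labels to get the losses, once without to get the logits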
                        with torch.no_grad():
                            batch_eval_loss_domain, batch_eval_loss_dependcy = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                label_domain=label_domain,
                                label_dependcy=label_dependcy)
                            logits_domain, logits_dependcy = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask)

                        logits_domain = logits_domain.view(
                            -1, self.num_labels_domain).detach().cpu().numpy()
                        logits_dependcy = logits_dependcy.view(
                            -1,
                            self.num_labels_dependcy).detach().cpu().numpy()

                        label_domain = label_domain.view(-1).to('cpu').numpy()
                        label_dependcy = label_dependcy.view(-1).to(
                            'cpu').numpy()

                        scores_domain.append(logits_domain)
                        scores_dependcy.append(logits_dependcy)

                        gold_labels_domain.append(label_domain)
                        gold_labels_dependcy.append(label_dependcy)

                        eval_loss_domain += batch_eval_loss_domain.mean().item(
                        )
                        eval_loss_dependcy += batch_eval_loss_dependcy.mean(
                        ).item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels_domain = np.concatenate(gold_labels_domain, 0)
                    gold_labels_dependcy = np.concatenate(
                        gold_labels_dependcy, 0)
                    scores_domain = np.concatenate(scores_domain, 0)
                    scores_dependcy = np.concatenate(scores_dependcy, 0)
                    model.train()
                    eval_loss_domain = eval_loss_domain / nb_eval_steps
                    eval_loss_dependcy = eval_loss_dependcy / nb_eval_steps

                    eval_accuracy_domain = accuracyF1(scores_domain,
                                                      gold_labels_domain,
                                                      mode='domain')
                    eval_accuracy_dependcy = accuracyF1(scores_dependcy,
                                                        gold_labels_dependcy,
                                                        mode='dependcy')
                    print('eval_F1_domain', eval_accuracy_domain,
                          'eval_F1_dependcy', eval_accuracy_dependcy,
                          'global_step', global_step, 'loss', train_loss)
                    result = {
                        'eval_loss_domain': eval_loss_domain,
                        'eval_loss_dependcy': eval_loss_dependcy,
                        'eval_F1_domain': eval_accuracy_domain,
                        'eval_F1_dependcy': eval_accuracy_dependcy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(self.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_accuracy_domain > best_acc:
                        print("=" * 80)
                        print("Best F1", eval_accuracy_domain)
                        print("Saving Model......")
                        # best_acc = eval_accuracy
                        best_acc = eval_accuracy_domain
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            self.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)