Example #1
 def setup_default_optimizer(self,
                             weight_decay: float = 0.0,
                             learning_rate: float = 5e-5,
                             adam_epsilon: float = 1e-8,
                             warmup_steps: int = 0,
                             total_steps: int = 0):
     # Prepare optimizer and schedule (linear warmup and decay)
     no_decay = ['bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [{
         'params': [
             p for n, p in self.model.named_parameters()
             if not any(nd in n for nd in no_decay)
         ],
         'weight_decay':
         weight_decay
     }, {
         'params': [
             p for n, p in self.model.named_parameters()
             if any(nd in n for nd in no_decay)
         ],
         'weight_decay':
         0.0
     }]
     self.optimizer = AdamW(optimizer_grouped_parameters,
                            lr=learning_rate,
                            eps=adam_epsilon)
     self.scheduler = WarmupLinearSchedule(self.optimizer,
                                           warmup_steps=warmup_steps,
                                           t_total=total_steps)
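The parameter grouping above (repeated in most of the examples below) sends every parameter whose name contains 'bias' or 'LayerNorm.weight' into a group with zero weight decay. A minimal illustrative sketch of that split, using a toy module instead of the original self.model:

import torch

class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(8, 8)
        self.LayerNorm = torch.nn.LayerNorm(8)

model = Toy()
no_decay = ['bias', 'LayerNorm.weight']
decay_names = [n for n, _ in model.named_parameters()
               if not any(nd in n for nd in no_decay)]
no_decay_names = [n for n, _ in model.named_parameters()
                  if any(nd in n for nd in no_decay)]
print(decay_names)     # ['dense.weight']
print(no_decay_names)  # ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias']

Note that the substring check also catches 'LayerNorm.bias', so LayerNorm biases are excluded from decay as well.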
Example #2
    def __init__(self, args):
        self.args = args
        self.train_batch_num = args['train_batch']
        self.Dataloader = Dataloader(args)
        print("preparing the train_data")
        self.train_data = self.Dataloader.load_train_batches()
        print("preparing the val_data")

        print("train data len:", len(self.train_data) * self.train_batch_num)
        self.cuda_gpu = (torch.cuda.is_available() and args['use_gpu'])

        print("build modeling:")
        self.global_model = Global_Model(args)

        if (self.cuda_gpu):
            # torch.nn.DataParallel (self.global_model, device_ids=gpus).cuda ()
            self.global_model = self.global_model.cuda()

        self.global_optimer = AdamW(self.global_model.parameters(),
                                    lr=args['global_lr'])

        num_total_steps = len(self.train_data) * args['global_epoch']
        num_warmup_steps = int(args['global_warmup_rate'] * num_total_steps)

        self.global_scheduler = WarmupLinearSchedule(
            self.global_optimer,
            warmup_steps=num_warmup_steps,
            t_total=num_total_steps)
Example #3
    def test_warmup_linear_scheduler(self):
        scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
        lrs = unwrap_schedule(scheduler, self.num_steps)
        expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
        self.assertEqual(len(lrs[0]), 1)
        self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

        scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
        lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
        self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
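The expected values in this test follow from the schedule itself: WarmupLinearSchedule in pytorch_transformers is a LambdaLR whose multiplier rises linearly from 0 to 1 over warmup_steps and then decays linearly to 0 at t_total. A rough equivalent built directly on torch.optim.lr_scheduler.LambdaLR (the helper name below is ours, for illustration only):

import torch
from torch.optim.lr_scheduler import LambdaLR

def warmup_linear(optimizer, warmup_steps, t_total):
    # multiplier: 0 -> 1 over warmup_steps, then 1 -> 0 at t_total
    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        return max(0.0, float(t_total - step) / float(max(1, t_total - warmup_steps)))
    return LambdaLR(optimizer, lr_lambda)

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=10.0)
scheduler = warmup_linear(optimizer, warmup_steps=2, t_total=10)

With lr=10.0, warmup_steps=2 and t_total=10, the multipliers at steps 1-10 give exactly the expected_learning_rates sequence above.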
Example #4
def train_task(args, model, memory, train_dataset, valid_dataset):

    # train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.n_workers,
    #                               shuffle=not args.reproduce, collate_fn=dynamic_collate_fn)
    train_dataloader = DataLoader(train_dataset, num_workers=args.n_workers, collate_fn=dynamic_collate_fn,
                                  batch_sampler=DynamicBatchSampler(train_dataset, args.batch_size))
    # if valid_dataset:
    #     valid_dataloader = DataLoader(valid_dataset, batch_size=args.batch_size * 6,
    #                                   num_workers=args.n_workers, collate_fn=dynamic_collate_fn)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=len(train_dataset)//10)

    model.zero_grad()
    tot_epoch_loss, tot_n_inputs = 0, 0

    def update_parameters(loss):
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
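        # note: this helper steps the scheduler before the optimizer; with
        # PyTorch >= 1.1, optimizer.step() should generally be called first so
        # the schedule is not advanced before the first parameter update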
        scheduler.step()
        optimizer.step()
        model.zero_grad()

    for step, batch in enumerate(train_dataloader):
        model.train()
        n_inputs, input_ids, masks, labels = prepare_inputs(batch)
        memory.add(input_ids, masks, labels)
        loss = model(input_ids=input_ids, attention_mask=masks, labels=labels)[0]
        update_parameters(loss)
        tot_n_inputs += n_inputs
        tot_epoch_loss += loss.item() * n_inputs

        if (step+1) % args.logging_steps == 0:
            logger.info("progress: {:.2f} , step: {} , lr: {:.2E} , avg batch size: {:.1f} , avg loss: {:.3f}".format(
                tot_n_inputs/args.n_train, step+1, scheduler.get_lr()[0], tot_n_inputs//(step+1), tot_epoch_loss/tot_n_inputs))

        if args.replay_interval >= 1 and (step+1) % args.replay_interval == 0:
            torch.cuda.empty_cache()
            del loss, input_ids, masks, labels
            input_ids, masks, labels = memory.sample(tot_n_inputs // (step + 1))
            loss = model(input_ids=input_ids, attention_mask=masks, labels=labels)[0]
            update_parameters(loss)


    logger.info("Finsih training, avg loss: {:.3f}".format(tot_epoch_loss/tot_n_inputs))
    del optimizer, optimizer_grouped_parameters
    assert tot_n_inputs == len(train_dataset) == args.n_train
Example #5
def build_optimizer(model, num_train_steps, learning_rate, adam_eps,
                    warmup_steps, weight_decay):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=learning_rate,
                      eps=adam_eps)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_steps)

    return optimizer, scheduler
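A hypothetical usage sketch for build_optimizer (not part of the original example); the stand-in model and the step counts are assumptions:

import torch

model = torch.nn.Linear(16, 2)  # stand-in for a transformer model
optimizer, scheduler = build_optimizer(model,
                                       num_train_steps=1000,
                                       learning_rate=5e-5,
                                       adam_eps=1e-8,
                                       warmup_steps=100,
                                       weight_decay=0.01)
for _ in range(3):
    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    optimizer.step()
    scheduler.step()  # advance the warmup/decay schedule once per update
    optimizer.zero_grad()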
Example #6
def initialize_training(args, device):
    """Initalize the tokenizer, the data loaders, the model and the tools of the optimization process."""
    # Create tokenizer, datasets and loaders
    tokenizer = EpisodeSummaryTokenizer.from_pretrained(
        args.gpt2_version,
        max_num_words=args.max_num_words,
        size_variance_handling=args.size_var_handling)
    train_dataset, val_dataset = create_datasets_from_jsons(
        args.json_paths, tokenizer, args.val_split)

    dataloaders = {
        'train':
        DataLoader(train_dataset,
                   shuffle=True,
                   batch_size=args.batch_size,
                   collate_fn=tokenizer.pad_batch_to_same_size),
        'val':
        DataLoader(val_dataset,
                   shuffle=False,
                   batch_size=args.batch_size,
                   collate_fn=tokenizer.pad_batch_to_same_size)
    }

    # Load pre-trained network weights
    model = GPT2LMHeadModel.from_pretrained(args.gpt2_version)
    model = model.to(device)

    # Prepare optimizer and scheduler
    no_decay = ['bias',
                'LayerNorm.weight']  # no decay for biases and layer norm
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=0,
                                     t_total=args.max_steps)
    model.zero_grad()

    train_state = make_train_state(
        save_path=args.model_save_path,
        early_stopping_patience=args.early_stopping_patience)

    return tokenizer, dataloaders, model, optimizer, scheduler, train_state
Example #7
def train_eval_bert(corpus, res_dir, model_name, cfg_model):
    idx2tag, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)
    model = BertForTokenClassificationCustom.from_pretrained(model_name, 
                                                             cache_dir=cfg_model.cache_dir, 
                                                             num_labels=len(tag2idx))
    
    tokenizer = BertTokenizer.from_pretrained(cfg_model.tokenizer,
                                              cache_dir=cfg_model.cache_dir,
                                              do_lower_case=('uncased' in cfg_model.tokenizer))

    w_decay = 0.01
    model = model.cuda()

    seq_tagger = SequenceTaggerBert(bert_model=model, bpe_tokenizer=tokenizer, 
                                    idx2tag=idx2tag, tag2idx=tag2idx, max_len=cfg_model.max_len)
    
    train_dataset = prepare_flair_corpus(corpus.train)
    val_dataset = prepare_flair_corpus(corpus.dev)
    test_dataset = prepare_flair_corpus(corpus.test)
 
    optimizer = AdamW(get_model_parameters(model), 
                      lr=cfg_model.lr, betas=(0.9, 0.999), 
                      eps=1e-6, weight_decay=w_decay, correct_bias=True) 
    
    if cfg_model.sched == 'warmup':
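        # note: WarmupLinearSchedule treats warmup_steps as an absolute number
        # of steps, so the 0.1 below effectively disables warmup rather than
        # acting as a 10% proportion of t_total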
        lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0.1, 
                                            t_total=(len(corpus.train) / cfg_model.bs)*cfg_model.n_epochs)
        trainer = ModelTrainerBert(model=seq_tagger, 
                                   optimizer=optimizer, 
                                   lr_scheduler=lr_scheduler,
                                   train_dataset=train_dataset, 
                                   val_dataset=val_dataset,
                                   update_scheduler='es',
                                   validation_metrics=[f1_entity_level],
                                   batch_size=cfg_model.bs)
    elif cfg_model.sched == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=cfg_model.patience, mode='min')
        trainer = ModelTrainerBert(model=seq_tagger, 
                                   optimizer=optimizer, 
                                   lr_scheduler=lr_scheduler,
                                   train_dataset=train_dataset, 
                                   val_dataset=val_dataset,
                                   keep_best_model=True,
                                   restore_bm_on_lr_change=True,
                                   update_scheduler='ee',
                                   validation_metrics=[f1_entity_level],
                                   decision_metric=lambda metrics: -metrics[1],
                                   batch_size=cfg_model.bs)
        
    trainer.train(epochs=cfg_model.n_epochs)
    
    _, __, metrics = seq_tagger.predict(test_dataset, evaluate=True, 
                                        metrics=[f1_entity_level, f1_token_level])
    print('Test performance:', metrics)
    
    return seq_tagger, metrics
Example #8
    def __init__(self, args, config, model, criterion, train_dataloader,
                 valid_dataloader, logger, save_path, tb_writer):

        self.args = args
        self.config = config
        self.model = model
        self.criterion = criterion
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.logger = logger
        self.save_path = save_path
        self.tb_writer = tb_writer

        self.t_total = len(self.train_dataloader) * self.args.epoch
        self.device = self.config.device
        self.optimizer = AdamW(self.get_model_parameters(),
                               lr=self.config.learning_rate)
        self.scheduler = WarmupLinearSchedule(self.optimizer,
                                              0.1 * self.t_total,  # warm up over 10% of the total steps
                                              self.t_total)

        self.global_step = 0
        self.best_eval_acc = 0.2
Example #9
def train(train_iter, val_iter, model):
    opt = AdamW(model.parameters(), lr=1e-4, eps=1e-8)
    scheduler = WarmupLinearSchedule(opt, warmup_steps=20, t_total=2500)
    model.train()
    losses = []
    for i, ex in enumerate(train_iter):
        opt.zero_grad()
        words, mapper, _ = ex.word
        label, lengths = ex.head
        batch, _ = label.shape

        # Model
        final = model(words.cuda(), mapper)
        for b in range(batch):
            final[b, lengths[b]-1:, :] = 0
            final[b, :, lengths[b]-1:] = 0

        if not lengths.max() <= final.shape[1] + 1:
            print("fail")
            continue
        dist = DependencyCRF(final, lengths=lengths)

        labels = dist.struct.to_parts(label, lengths=lengths).type_as(final)
        log_prob = dist.log_prob(labels)

        loss = log_prob.sum()
        (-loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        opt.step()
        scheduler.step()
        losses.append(loss.detach())
        if i % 50 == 1:
            print(-torch.tensor(losses).mean(), words.shape)
            losses = []
        if i % 600 == 500:
            validate(val_iter)
Example #10
    def setup_optim(self, t_total, warmup_step):
        args, model = self.args, self.model

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = []
        optimizer_grouped_parameters.append({
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        })
        optimizer_grouped_parameters.append({
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        })
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_step,
                                         t_total=t_total)
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            self.model, optimizer = amp.initialize(
                model, optimizer, opt_level=args.fp16_opt_level)
        if args.schedule_half:
            scheduler = ReduceLROnPlateau(optimizer,
                                          factor=0.5,
                                          patience=0,
                                          min_lr=args.min_lr,
                                          mode='max')
        self.scheduler, self.optimizer = scheduler, optimizer
Example #11
    def generate_bert_optimizer(t_total, bert_lr, model):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            gamma  # weight-decay coefficient expected from the enclosing scope
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=bert_lr, eps=1e-8)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=0,
                                         t_total=t_total)

        return scheduler, optimizer
Example #12
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)

    # data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = get_tokenizer() # ./tokenizer_78b3253a26.model
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    _, vocab_of_gluonnlp = get_pytorch_kobert_model()
    token_to_idx = vocab_of_gluonnlp.token_to_idx

    model_config.vocab_size = len(token_to_idx)
    vocab = Vocabulary(token_to_idx=token_to_idx)

    print("len(token_to_idx): ", len(token_to_idx))
    with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f:
        json.dump(token_to_idx, f, ensure_ascii=False, indent=4)

    # save vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)

    # load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer, maxlen=model_config.maxlen, model_dir=model_dir)

    # Train & Val Datasets
    cwd = Path.cwd()
    data_in = cwd / "data_in"
    train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명"
    tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir)
    tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn, transform_target_fn=ner_formatter.transform_target_fn)
    tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=False)

    # Model
    model = KobertCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index))
    model.train()

    # optim
    train_examples_len = len(tr_clf_ds)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs
    t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate, eps=model_config.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    # save
    tb_writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(tr_clf_ds))
    logger.info("  Num Epochs = %d", model_config.epochs)
    logger.info("  Instantaneous batch size per GPU = %d", model_config.batch_size)
    # logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
    #                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
    best_steps = 0
    model.zero_grad()
    set_seed()  # Added here for reproducibility (even between python 2 and 3)

    # Train
    train_iterator = trange(int(model_config.epochs), desc="Epoch")
    for _epoch, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(tr_clf_dl, desc="Iteration") # , disable=args.local_rank not in [-1, 0]
        epoch = _epoch
        for step, batch in enumerate(epoch_iterator):
            model.train()
            x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch)
            log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real)

            # loss: negative log-likelihood
            loss = -1 * log_likelihood

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if model_config.gradient_accumulation_steps > 1:
                loss = loss / model_config.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm)
            tr_loss += loss.item()

            if (step + 1) % model_config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                with torch.no_grad():
                    sequence_of_tags = torch.tensor(sequence_of_tags)
                    print("sequence_of_tags: ", sequence_of_tags)
                    print("y_real: ", y_real)
                    print("loss: ", loss)
                    print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real))

                    mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean()

                tr_acc = mb_acc.item()
                tr_loss_avg = tr_loss / global_step
                tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                # if step % 50 == 0:
                print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(epoch + 1, global_step,
                                                                                             tr_summary['loss'],
                                                                                             tr_summary['acc']))

                if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0:
                    # Log metrics
                    if model_config.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        pass
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps, global_step)
                    logger.info("Average loss: %s at global step: %s",
                                str((tr_loss - logging_loss) / model_config.logging_steps), str(global_step))
                    logging_loss = tr_loss

                if model_config.save_steps > 0 and global_step % model_config.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

                    state = {'global_step': global_step + 1,
                             'model_state_dict': model.state_dict(),
                             'opt_state_dict': optimizer.state_dict()}
                    summary = {'train': tr_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')

                    is_best = tr_acc >= best_train_acc  # based on train accuracy (ideally this should use val_acc rather than train_acc)
                    # Save
                    if is_best:
                        best_train_acc = tr_acc
                        checkpoint_manager.save_checkpoint(state,
                                                           'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1,
                                                                                                         global_step,
                                                                                                         tr_acc))
                    else:
                        torch.save(state, os.path.join(output_dir,
                                                       'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1,
                                                                                                      global_step,
                                                                                                      tr_acc)))

    tb_writer.close()
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

    return global_step, tr_loss / global_step, best_steps
Example #13
class TransformerBase(TrainableModel):
    MODEL_CONFIGURATIONS = {
        'bert': (BertConfig, BertTokenizer),
        'bert+core': (BertConfig, BertTokenizer),
        'xlnet': (XLNetConfig, XLNetTokenizer),
        'xlm': (XLMConfig, XLMTokenizer),
    }

    def __init__(self,
                 model_type,
                 model_name_or_path,
                 labels: List[str] = None,
                 config_name=None,
                 tokenizer_name=None,
                 do_lower_case=False,
                 output_path=None,
                 device='cpu',
                 n_gpus=0):
        assert model_type in self.MODEL_CONFIGURATIONS.keys(
        ), "unsupported model_type"
        self.model_type = model_type
        self.model_name_or_path = model_name_or_path
        self.labels = labels
        self.num_labels = len(self.labels) + 1  # +1 because label id 0 is reserved for padding
        self.labels_id_map = {k: v for k, v in enumerate(self.labels, 1)}
        self.do_lower_case = do_lower_case
        if output_path is not None and not os.path.exists(output_path):
            raise FileNotFoundError('output_path is not found')
        self.output_path = output_path

        self.model_class = None
        config_class, tokenizer_class = self.MODEL_CONFIGURATIONS[model_type]
        self.config_class = config_class
        self.tokenizer_class = tokenizer_class

        self.tokenizer_name = tokenizer_name
        self.tokenizer = self._load_tokenizer(self.tokenizer_name)
        self.config_name = config_name
        self.config = self._load_config(config_name)

        self.model = None
        self.device = device
        self.n_gpus = n_gpus

        self._optimizer = None
        self._scheduler = None

    def to(self, device='cpu', n_gpus=0):
        if self.model is not None:
            self.model.to(device)
            if n_gpus > 1:
                self.model = torch.nn.DataParallel(self.model)
        self.device = device
        self.n_gpus = n_gpus

    @property
    def optimizer(self):
        return self._optimizer

    @optimizer.setter
    def optimizer(self, opt):
        self._optimizer = opt

    @property
    def scheduler(self):
        return self._scheduler

    @scheduler.setter
    def scheduler(self, sch):
        self._scheduler = sch

    def setup_default_optimizer(self,
                                weight_decay: float = 0.0,
                                learning_rate: float = 5e-5,
                                adam_epsilon: float = 1e-8,
                                warmup_steps: int = 0,
                                total_steps: int = 0):
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay
        }, {
            'params': [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=learning_rate,
                               eps=adam_epsilon)
        self.scheduler = WarmupLinearSchedule(self.optimizer,
                                              warmup_steps=warmup_steps,
                                              t_total=total_steps)

    def _load_config(self, config_name=None):
        config = self.config_class.from_pretrained(
            config_name if config_name else self.model_name_or_path,
            num_labels=self.num_labels)
        return config

    def _load_tokenizer(self, tokenizer_name=None):
        tokenizer = self.tokenizer_class.from_pretrained(
            tokenizer_name if tokenizer_name else self.model_name_or_path,
            do_lower_case=self.do_lower_case)
        return tokenizer

    def save_model(self, output_dir, save_checkpoint=False, args=None):
        """Save model/tokenizer/arguments to given output directory
        """
        # Create output directory if needed
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        logger.info("Saving model checkpoint to %s", output_dir)
        model_to_save = self.model.module if hasattr(self.model,
                                                     'module') else self.model
        model_to_save.save_pretrained(output_dir)
        if not save_checkpoint:
            if self.tokenizer is not None:
                self.tokenizer.save_pretrained(output_dir)
            with io.open(output_dir + os.sep + 'labels.txt',
                         'w',
                         encoding='utf-8') as fw:
                for l in self.labels:
                    fw.write('{}\n'.format(l))
            if args is not None:
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))

    @classmethod
    def load_model(cls, model_path, model_type):
        # Load a trained model and vocabulary from given path
        if not os.path.exists(model_path):
            raise FileNotFoundError
        with io.open(model_path + os.sep + 'labels.txt') as fp:
            labels = [l.strip() for l in fp.readlines()]
        return cls(model_type=model_type,
                   model_name_or_path=model_path,
                   labels=labels)

    def get_train_steps_epochs(self, max_steps, num_train_epochs,
                               gradient_accumulation_steps, num_samples):
        if max_steps > 0:
            t_total = max_steps
            num_train_epochs = max_steps // (num_samples //
                                             gradient_accumulation_steps) + 1
        else:
            t_total = num_samples // gradient_accumulation_steps * num_train_epochs
        return t_total, num_train_epochs

    def _train(self,
               data_set: DataLoader,
               dev_data_set: Union[DataLoader, List[DataLoader]] = None,
               test_data_set: Union[DataLoader, List[DataLoader]] = None,
               gradient_accumulation_steps: int = 1,
               per_gpu_train_batch_size: int = 8,
               max_steps: int = -1,
               num_train_epochs: int = 3,
               max_grad_norm: float = 1.0,
               logging_steps: int = 50,
               save_steps: int = 100):
        """Run model training
            batch_mapper: a function that maps a batch into parameters that the model
                          expects in the forward method (for use with custom heads and models).
                          If None it will default to the basic models input structure.
            logging_callback_fn: a function that is called in each evaluation step
                          with the model as a parameter.

        """
        t_total, num_train_epochs = self.get_train_steps_epochs(
            max_steps, num_train_epochs, gradient_accumulation_steps,
            len(data_set))
        if self.optimizer is None and self.scheduler is None:
            logger.info("Loading default optimizer and scheduler")
            self.setup_default_optimizer(total_steps=t_total)

        train_batch_size = per_gpu_train_batch_size * max(1, self.n_gpus)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(data_set.dataset))
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info("  Instantaneous batch size per GPU = %d",
                    per_gpu_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            train_batch_size * gradient_accumulation_steps)
        logger.info("  Gradient Accumulation steps = %d",
                    gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        self.model.zero_grad()
        epochs_eval = []
        train_iterator = trange(num_train_epochs, desc="Epoch")
        for _ in train_iterator:
            epoch_iterator = tqdm(data_set, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)
                inputs = self._batch_mapper(batch)
                outputs = self.model(**inputs)
                loss = outputs[0]  # get loss

                if self.n_gpus > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               max_grad_norm)

                tr_loss += loss.item()
                if (step + 1) % gradient_accumulation_steps == 0:
                    self.optimizer.step()
                    self.scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if logging_steps > 0 and global_step % logging_steps == 0:
                        # Log metrics and run evaluation on dev/test
                        self._evaluate_during_train(dev_data_set,
                                                    test_data_set)
                        logger.info('lr = {}'.format(
                            self.scheduler.get_lr()[0]))
                        logger.info('loss = {}'.format(
                            (tr_loss - logging_loss) / logging_steps))
                        logging_loss = tr_loss

                    if save_steps > 0 and global_step % save_steps == 0:
                        # Save model checkpoint
                        self.save_model_checkpoint(
                            output_path=self.output_path,
                            name='checkpoint-{}'.format(global_step))

                if 0 < max_steps < global_step:
                    epoch_iterator.close()
                    break
            if 0 < max_steps < global_step:
                train_iterator.close()
                break
            epochs_eval.append(
                self._evaluate_during_train(dev_data_set, test_data_set))

        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
        return epochs_eval

    def _evaluate_during_train(self, dev_data_set, test_data_set) -> dict:
        ds_evals = {'dev': None, 'test': None}
        for ds_name, ds in zip(['dev', 'test'], [dev_data_set, test_data_set]):
            if ds is None:  # got no data loader
                continue
            if isinstance(ds, DataLoader):
                ds = [ds]
            for d in ds:
                logits, label_ids = self._evaluate(d)
                ds_evals[ds_name] = self.evaluate_predictions(
                    logits, label_ids)
        return ds_evals

    def _evaluate(self, data_set: DataLoader):
        logger.info("***** Running inference *****")
        logger.info(" Batch size: {}".format(data_set.batch_size))
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(data_set, desc="Inference iteration"):
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = self._batch_mapper(batch)
                outputs = self.model(**inputs)
                if 'labels' in inputs:
                    tmp_eval_loss, logits = outputs[:2]
                    eval_loss += tmp_eval_loss.mean().item()
                else:
                    logits = outputs[0]
            nb_eval_steps += 1
            model_output = logits.detach().cpu()
            model_out_label_ids = inputs['labels'].detach().cpu(
            ) if 'labels' in inputs else None
            if preds is None:
                preds = model_output
                out_label_ids = model_out_label_ids
            else:
                preds = torch.cat((preds, model_output), dim=0)
                out_label_ids = torch.cat(
                    (out_label_ids, model_out_label_ids),
                    dim=0) if out_label_ids is not None else None
        if out_label_ids is None:
            return preds
        return preds, out_label_ids

    def _batch_mapper(self, batch):
        mapping = {
            'input_ids':
            batch[0],
            'attention_mask':
            batch[1],
            # XLM doesn't use segment_ids
            'token_type_ids':
            batch[2] if self.model_type in ['bert', 'xlnet'] else None
        }
        if len(batch) == 4:
            mapping.update({'labels': batch[3]})
        return mapping

    def evaluate_predictions(self, logits, label_ids):
        raise Exception(
            'evaluate_predictions method must be implemented in order to be used for '
            'dev/test set evaluation')

    def save_model_checkpoint(self, output_path, name):
        output_dir_path = os.path.join(output_path, name)
        self.save_model(output_dir_path, save_checkpoint=True)
Example #14
def fit(model, training_iter, eval_iter, num_train_steps, device, n_gpu, verbose=1):
    # ------------------ result visualization (TensorBoard) ------------------
    if args.local_rank in [-1, 0]:
        TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
        tb_writer = SummaryWriter('log/%s'%TIMESTAMP)
    # --------------------- optimizer ---------------------
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    t_total = num_train_steps

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)#int(t_total*args.warmup_proportion)
    # --------------------- GPU half-precision (fp16) ---------------------
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    # --------------------- model initialization ---------------------
    model.to(device)
    tr_loss, logging_loss = 0.0, 0.0
    # ------------------------ training ------------------------
    best_f1 = 0
    #start = time.time()
    global_step = 0
    set_seed(args, n_gpu)  # Added here for reproducibility (even between python 2 and 3)
    bar = tqdm(range(t_total), total = t_total)
    nb_tr_examples, nb_tr_steps = 0, 0

    for step in bar:
        model.train()
        batch = next(training_iter)
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                  # XLM doesn't use segment_ids
                  'labels': batch[3]}
        encode = model(**inputs)
        encode = encode[0]  # extract the predictions
        loss = model.loss_fn(encode, labels=inputs['labels'])

        if n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            #torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        else:
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        tr_loss += loss.item()
        train_loss = round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1),4)
        bar.set_description("loss {}".format(train_loss))
        nb_tr_examples += inputs['input_ids'].size(0)
        nb_tr_steps += 1

        if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            optimizer.zero_grad()
            global_step += 1

        if (step + 1) %(args.eval_steps*args.gradient_accumulation_steps)==0:
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))


        if args.local_rank in [-1, 0] and \
                args.do_eval and (step+1)%(args.eval_steps*args.gradient_accumulation_steps)==0:

            # ----------------------- validation -----------------------
            model.eval()
            y_predicts, y_labels = [], []
            eval_loss, eval_acc, eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0

            for _, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                          # XLM doesn't use segment_ids
                          'labels': batch[3]}
                with torch.no_grad():
                    encode = model(**inputs)
                    encode = encode[0]  # extract the predictions
                    eval_los = model.loss_fn(encode, labels=inputs['labels'])

                    predicts = model.predict(encode)#.detach().cpu().numpy()

                nb_eval_examples += inputs['input_ids'].size(0)
                nb_eval_steps += 1
                eval_loss += eval_los.mean().item()
                y_predicts.append(torch.from_numpy(predicts))

                labels = inputs['labels'].view(1, -1)
                labels = labels[labels != -1]
                y_labels.append(labels)

            eval_loss = eval_loss / nb_eval_steps
            eval_predicted = torch.cat(y_predicts, dim=0).cpu().numpy()
            eval_labeled = torch.cat(y_labels, dim=0).cpu().numpy()

            eval_f1 = model.acc_rec_f1(eval_predicted, eval_labeled)#eval_acc, eval_rec,

            logger.info(
                '\n\nglobal_step %d - train_loss: %4f - eval_loss: %4f - eval_f1:%4f\n'
                % (global_step,
                   train_loss,
                   eval_loss,
                   eval_f1))

            # save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(model, args.output_dir)

            if args.local_rank in [-1, 0]:
                tb_writer.add_scalar('train_loss', train_loss, step)#.item()
                tb_writer.add_scalar('eval_loss', eval_loss, step)#.item() / count
                tb_writer.add_scalar('eval_f1', eval_f1, step)#eval_acc

            tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)

    if args.local_rank in [-1, 0]:
        tb_writer.close()
Example #15
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # logger.info(f'Fold {split_index + 1}')
        train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader(
        )

        num_train_optimization_steps = self.train_steps

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path,
                                            num_labels=self.num_labels)
        model = BertForSequenceClassification.from_pretrained(
            self.model_name_or_path, self.args, config=config)
        model.to(self.device)
        model.train()
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.learning_rate,
                          eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.warmup_steps,
                                         t_total=self.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        best_MRR = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        train_dataloader = cycle(train_dataloader)

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, utterance_mask, response_mask, history_mask, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         utterance_mask=utterance_mask,
                         response_mask=response_mask,
                         history_mask=history_mask,
                         labels=label_ids)
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:

                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

            if (step + 1) % (self.eval_steps *
                             self.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if self.do_eval and (step + 1) % (
                    self.eval_steps * self.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    scores = []
                    ID = [x.guid for x in eval_examples]

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", self.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, utterance_mask, response_mask, history_mask, label_ids in eval_dataloader:
                        input_ids = input_ids.to(self.device)
                        input_mask = input_mask.to(self.device)
                        segment_ids = segment_ids.to(self.device)
                        utterance_mask = utterance_mask.to(self.device)
                        response_mask = response_mask.to(self.device)
                        history_mask = history_mask.to(self.device)
                        label_ids = label_ids.to(self.device)

                        with torch.no_grad():
                            tmp_eval_loss = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask=utterance_mask,
                                response_mask=response_mask,
                                history_mask=history_mask,
                                labels=label_ids)
                            logits = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                utterance_mask=utterance_mask,
                                response_mask=response_mask,
                                history_mask=history_mask,
                            )

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        scores.append(logits)
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    scores = np.concatenate(scores, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracyCQA(inference_logits, gold_labels)
                    eval_DOUBAN_MRR, eval_DOUBAN_mrr, eval_DOUBAN_MAP, eval_Precision1 = compute_DOUBAN(
                        ID, scores, gold_labels)
                    r_at_1 = r_at_k(ID, scores, gold_labels, 1)
                    r_at_2 = r_at_k(ID, scores, gold_labels, 2)
                    r_at_5 = r_at_k(ID, scores, gold_labels, 5)
                    # print('eval_mrr',eval_mrr)
                    print('eval_F1', eval_accuracy, 'eval_MRR',
                          eval_DOUBAN_MRR, 'eval_MAP', eval_DOUBAN_MAP,
                          'eval_Precision1', eval_Precision1, 'r10@1', r_at_1,
                          'r10@2', r_at_2, 'r10@5', r_at_5, 'global_step',
                          global_step, 'loss', train_loss)
                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'eval_MRR': eval_DOUBAN_MRR,
                        'eval_MAP': eval_DOUBAN_MAP,
                        'eval_Precision1': eval_Precision1,
                        'r10@1': r_at_1,
                        'r10@2': r_at_2,
                        'r10@5': r_at_5,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(self.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    # if eval_accuracy > best_acc :
                    if eval_DOUBAN_MRR > best_MRR:
                        print("=" * 80)
                        print("Best MRR", eval_DOUBAN_MRR)
                        print("Saving Model......")
                        # best_acc = eval_accuracy
                        best_MRR = eval_DOUBAN_MRR
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            self.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
Example #16
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float, help="")

    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int, help="text split")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    try:
        os.makedirs(args.output_dir)
    except:
        pass

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)

    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path, args, config=config)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    if args.do_train:

        # Prepare data loader

        train_examples = read_examples(os.path.join(args.data_dir,
                                                    'train.csv'),
                                       is_training=True)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      args.split_num, True)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # thus it produce None grad that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
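        # Parameters whose names contain any of these substrings (biases and
        # LayerNorm parameters) go into the zero-weight-decay group below.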
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps)
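        # WarmupLinearSchedule increases the learning rate linearly from 0 over
        # warmup_steps updates, then decays it linearly to 0 at t_total.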

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)
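        # cycle() makes the dataloader effectively infinite, so the loop below
        # can run for a fixed number of steps rather than whole epochs.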

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.fp16 and getattr(args, 'loss_scale', 1.0) != 1.0:
                # note: this parser defines no --loss_scale flag, so the scale defaults to 1.0
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            # AdamW has no optimizer.backward(); the old fp16 branch assumed an
            # apex FP16_Optimizer wrapper, so a plain backward pass is used here.
            loss.backward()

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                # WarmupLinearSchedule already applies the BERT-style warmup and
                # linear decay, so no manual learning-rate adjustment is needed.
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if args.do_eval and (step + 1) % (
                    args.eval_steps * args.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(
                        args.data_dir, file),
                                                  is_training=True)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length,
                        args.split_num, False)
                    all_input_ids = torch.tensor(select_field(
                        eval_features, 'input_ids'),
                                                 dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(
                        eval_features, 'input_mask'),
                                                  dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(
                        eval_features, 'segment_ids'),
                                                   dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features],
                                             dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                            logits = model(input_ids=input_ids,
                                           token_type_ids=segment_ids,
                                           attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)

                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best F1", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model itself
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
    if args.do_test:
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification.from_pretrained(os.path.join(
            args.output_dir, "pytorch_model.bin"),
                                                              args,
                                                              config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))
            if flag == 'test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub.csv"),
                                       index=False)
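# A sketch of an optional post-processing step, not in the original example:
# the submission above writes raw logits into sub.csv; a numerically stable
# softmax over the class axis would convert them to per-class probabilities.
import numpy as np

def softmax_rows(logits, axis=-1):
    # Subtract the per-row max before exponentiating to avoid overflow.
    shifted = logits - logits.max(axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=axis, keepdims=True)

# e.g. probs = softmax_rows(logits)  # shape (num_examples, 3)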
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default='/hdd/lujunyu/dataset/multi_turn_corpus/douban/',
        type=str,
        required=False,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default='ubuntu',
                        type=str,
                        required=False,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default='/hdd/lujunyu/model/chatbert/douban_base_si_aug/',
        type=str,
        required=False,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--dialog_augmentation_path",
        default=
        '/hdd/lujunyu/dataset/multi_turn_corpus/douban/train_augment_3.txt',  ## train_augment_3.txt
        type=str,
        help="Path to an augmented training file; if set, it is used instead of train.txt.")

    ## Other parameters
    parser.add_argument(
        "--init_model_name",
        default='bert-base-chinese',
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--train_batch_size",
                        default=800,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=100,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-6,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_steps",
        default=0.0,
        type=float,
        help=
        "Number of steps of linear learning rate warmup to perform "
        "(passed directly to WarmupLinearSchedule as warmup_steps).")
    parser.add_argument("--weight_decay",
                        default=1e-3,
                        type=float,
                        help="weight_decay")
    parser.add_argument("--save_checkpoints_steps",
                        default=25000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=12,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=20,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_test:
        raise ValueError(
            "At least one of `do_train` or `do_test` must be True.")

    bert_config = BertConfig.from_pretrained(args.init_model_name,
                                             num_labels=2)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.init_model_name,
                                              do_lower_case=args.do_lower_case)
    if args.dialog_augmentation_path:
        train_dataset = DoubanDataset(file_path=args.dialog_augmentation_path,
                                      max_seq_length=args.max_seq_length,
                                      tokenizer=tokenizer)
    else:
        train_dataset = DoubanDataset(file_path=os.path.join(
            args.data_dir, "train.txt"),
                                      max_seq_length=args.max_seq_length,
                                      tokenizer=tokenizer)
    eval_dataset = DoubanDataset(file_path=os.path.join(
        args.data_dir, "dev.txt"),
                                 max_seq_length=args.max_seq_length,
                                 tokenizer=tokenizer)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        sampler=RandomSampler(train_dataset),
        num_workers=4)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=args.eval_batch_size,
        sampler=SequentialSampler(eval_dataset),
        num_workers=4)

    model = BertForSequenceClassification.from_pretrained(
        '/hdd/lujunyu/model/chatbert/douban_base_si_aug_beifen/model.pt',
        config=bert_config)
    model.to(device)

    num_train_steps = None
    if args.do_train:
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # remove the pooler, which is not used and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=num_train_steps)
    else:
        optimizer = None
        scheduler = None

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    best_acc = 0.0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                if (step + 1) % args.save_checkpoints_steps == 0:
                    ### Evaluate every save_checkpoints_steps batches
                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    logits_all = []
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(
                                input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        for i in range(len(logits)):
                            logits_all += [logits[i]]

                        tmp_eval_accuracy = accuracy(logits,
                                                     label_ids.reshape(-1))

                        eval_loss += tmp_eval_loss.mean().item()
                        eval_accuracy += tmp_eval_accuracy

                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = eval_accuracy / nb_eval_examples

                    result = {
                        'eval_loss': eval_loss,
                        'eval_accuracy': eval_accuracy
                    }

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results_dev.txt")
                    with open(output_eval_file, "a") as writer:
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                    output_eval_file = os.path.join(args.output_dir,
                                                    "logits_dev.txt")
                    with open(output_eval_file, "w") as f:
                        for i in range(len(logits_all)):
                            for j in range(len(logits_all[i])):
                                f.write(str(logits_all[i][j]))
                                if j == len(logits_all[i]) - 1:
                                    f.write("\n")
                                else:
                                    f.write(" ")

                    ### Save the best checkpoint
                    if best_acc < eval_accuracy:
                        try:  ### Remove 'module' prefix when using DataParallel
                            state_dict = model.module.state_dict()
                        except AttributeError:
                            state_dict = model.state_dict()
                        torch.save(state_dict,
                                   os.path.join(args.output_dir, "model.pt"))
                        best_acc = eval_accuracy
                        logger.info('Saving the best model in {}'.format(
                            os.path.join(args.output_dir, "model.pt")))

                    model.train()
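# A small sketch of the step arithmetic used above; the helper name
# `estimate_total_steps` is illustrative and not part of the original example.
# args.train_batch_size has already been divided by gradient_accumulation_steps,
# so one optimizer update consumes train_batch_size * gradient_accumulation_steps
# examples (e.g. 800 // 20 = 40 examples per forward pass).
def estimate_total_steps(num_examples, per_forward_batch, grad_accum_steps, num_epochs):
    # Mirrors: int(len(train_dataset) / train_batch_size / grad_accum_steps * num_epochs)
    return int(num_examples / per_forward_batch / grad_accum_steps * num_epochs)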
Example #18
0
    def train(self):
        model = BertForSequenceClassification.from_pretrained(
            self.args.model_name_or_path, self.args, config=self.config)
        model.to(self.device)

        logger.info('Preparing the data')
        data = DATABDCI(
            debug=False,
            data_dir='/home/lsy2018/文本匹配/DATA/DATA_BDCI/',
            data_process_output='/home/lsy2018/文本匹配/DATA/DATA_BDCI/data_1014/')

        train_examples = data.read_examples(
            os.path.join(self.data_process_output, 'train.csv'))
        train_features = data.convert_examples_to_features(
            train_examples, self.tokenizer, self.max_seq_length)
        all_input_ids = torch.tensor(data.select_field(train_features,
                                                       'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(data.select_field(
            train_features, 'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(data.select_field(
            train_features, 'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)

        # What is this step for? RandomSampler shuffles the training examples each epoch.
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=self.batch_size //
                                      self.gradient_accumulation_steps)

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.learning_rate,
                          eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.warmup_steps,
                                         t_total=self.train_steps)

        best_acc = 0
        global_step = 0
        model.train()
        train_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        bar = tqdm(range(self.train_steps), total=self.train_steps)
        train_dataloader = cycle(train_dataloader)

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)

            train_loss += loss.item()
            train_loss = round(
                train_loss * self.gradient_accumulation_steps /
                (nb_tr_steps + 1), 4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            loss.backward()

            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            if self.do_eval and (step + 1) % (
                    self.eval_steps * self.gradient_accumulation_steps) == 0:
                inference_labels = []
                scores = []
                gold_labels = []
                inference_logits = []
                eval_examples = data.read_examples(
                    os.path.join(self.data_process_output, 'dev.csv'))
                eval_features = data.convert_examples_to_features(
                    eval_examples, self.tokenizer, self.max_seq_length)
                ID1 = [x.sentence_ID1 for x in eval_examples]
                ID2 = [x.sentence_ID2 for x in eval_examples]

                all_input_ids = torch.tensor(data.select_field(
                    eval_features, 'input_ids'),
                                             dtype=torch.long)
                all_input_mask = torch.tensor(data.select_field(
                    eval_features, 'input_mask'),
                                              dtype=torch.long)
                all_segment_ids = torch.tensor(data.select_field(
                    eval_features, 'segment_ids'),
                                               dtype=torch.long)
                all_label = torch.tensor([f.label for f in eval_features],
                                         dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label)

                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", self.batch_size)

                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=self.batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                count = 0

                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    # ID1_list_eachbatch = ID1[count*args.eval_batch_size:(count+1)*args.eval_batch_size]
                    # ID2_list_eachbatch = ID2[count * args.eval_batch_size:(count + 1) * args.eval_batch_size]
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)

                    with torch.no_grad():
                        tmp_eval_loss = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask,
                                              labels=label_ids)
                        logits = model(input_ids=input_ids,
                                       token_type_ids=segment_ids,
                                       attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        # scores.append(logits)
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                # These aggregates are computed once per evaluation pass, after
                # the batch loop over eval_dataloader has finished.
                gold_labels = np.concatenate(gold_labels, 0)
                inference_logits = np.concatenate(inference_logits, 0)
                # scores = np.concatenate(scores, 0)
                model.train()
                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = accuracy(inference_logits, gold_labels)
                # eval_mrr = compute_MRR(scores, gold_labels, ID1, ID2)

                result = {
                    'eval_loss': eval_loss,
                    'eval_F1': eval_accuracy,
                    'global_step': global_step,
                    # 'mrr':eval_mrr,
                    'loss': train_loss
                }

                output_eval_file = os.path.join(self.output_dir,
                                                "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write('*' * 80)
                    writer.write('\n')
                if eval_accuracy > best_acc:
                    print("=" * 80)
                    print("Best F1", eval_accuracy)
                    print("Saving Model......")
                    best_acc = eval_accuracy
                    # Save a trained model
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model itself
                    output_model_file = os.path.join(
                        self.output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(),
                               output_model_file)
                    print("=" * 80)
                else:
                    print("=" * 80)
def train(config, train_dataset, model, tokenizer):
    """ Train the model """
    config.train_batch_size = config.per_gpu_train_batch_size * \
        max(1, config.n_gpu)
    if config.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        train_sampler = DistributedSampler(train_dataset)

    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=config.train_batch_size)

    if config.max_steps > 0:
        t_total = config.max_steps
        config.num_train_epochs = config.max_steps // (
            len(train_dataloader) // config.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader) // config.gradient_accumulation_steps * config.num_train_epochs
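    # t_total counts optimizer updates: batches per epoch divided by the
    # gradient-accumulation factor, multiplied by the number of epochs.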

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=config.learning_rate, eps=config.adam_epsilon)
    scheduler = WarmupLinearSchedule(
        optimizer, warmup_steps=config.warmup_steps, t_total=t_total)
    if config.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if config.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.local_rank],
                                                          output_device=config.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", config.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                config.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                config.train_batch_size * config.gradient_accumulation_steps
                * (torch.distributed.get_world_size() if config.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                config.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(config.num_train_epochs),
                            desc="Epoch", disable=config.local_rank not in [-1, 0])
    # Added here for reproducibility (even between python 2 and 3)
    set_seed(config.seed)
    loss_current = 100000
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=config.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(config.device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      # XLM and RoBERTa don't use segment_ids
                      'token_type_ids': batch[2],
                      'labels':      batch[3],
                      'e1_mask': batch[4],
                      'e2_mask': batch[5],
                      }

            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]

            if config.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if config.gradient_accumulation_steps > 1:
                loss = loss / config.gradient_accumulation_steps
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), config.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % config.gradient_accumulation_steps == 0:

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                
                if config.local_rank in [-1, 0] and config.save_steps > 0 and global_step % config.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        config.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(config, os.path.join(
                        output_dir, 'training_config.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            
            if config.max_steps > 0 and global_step > config.max_steps:
                epoch_iterator.close()
                break
        if config.local_rank in [-1, 0] and config.logging_steps > 0:  # and global_step % config.logging_steps == 0:
            # Log metrics
            # Only evaluate when single GPU otherwise metrics may not average well
            if config.local_rank == -1 and config.evaluate_during_training:
                results = evaluate(config, model, tokenizer)
            logging_loss = tr_loss
        if config.max_steps > 0 and global_step > config.max_steps:
            train_iterator.close()
            break
    return global_step, tr_loss / global_step
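The loop above (and most of the examples below) follows the same gradient-accumulation pattern: the loss is scaled down by the accumulation factor, every micro-batch calls backward(), and the optimizer, scheduler and gradient reset only advance once per accumulated batch. A minimal standalone sketch of that pattern; the names (model, optimizer, scheduler, dataloader, accum_steps, max_grad_norm) are illustrative placeholders, not taken from any specific example.

import torch

def accumulation_loop_sketch(model, optimizer, scheduler, dataloader,
                             accum_steps=4, max_grad_norm=1.0):
    # Minimal sketch, assuming `model(**batch)` returns a scalar loss.
    model.train()
    model.zero_grad()
    for step, batch in enumerate(dataloader):
        loss = model(**batch) / accum_steps  # keep the effective loss scale constant
        loss.backward()
        if (step + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # WarmupLinearSchedule advances once per optimizer update
            model.zero_grad()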
Example #20
0
    def train(self, output_dir, train_batch_size, gradient_accumulation_steps, seed,
              epochs, data_path, pretrained_path, valid_path, no_cuda=False, dropout=0.3,
              weight_decay=0.01, warmup_proportion=0.1, learning_rate=5e-5, adam_epsilon=1e-8,
              max_seq_length=128, squeeze=True, max_grad_norm=1.0, eval_batch_size=32, epoch_save_model=False,
              model_name='BERT', embedding_path=None, split_train_data=False, motherfile=False):
        if os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError("Output directory (%s) already exists and is not empty." % output_dir)
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO,
                        filename=os.path.join(output_dir, "log.txt"))
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
        logger = logging.getLogger(__name__)

        if gradient_accumulation_steps < 1:
            raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                gradient_accumulation_steps))

        train_batch_size = train_batch_size // gradient_accumulation_steps
    
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # add one for IGNORE label
        if motherfile:
            print(data_path)
            train_examples, train_label_list = get_examples_from_motherfile(data_path, 'train')
            val_examples, val_label_list = get_examples_from_motherfile(data_path, 'test')
            train_label_list.extend(val_label_list)
            label_list = list(set(train_label_list))
        elif split_train_data:
            examples, label_list = get_examples(data_path, 'train')
            random.shuffle(examples)
            train_examples = examples[0:int(len(examples)*0.6)]
            val_examples = examples[int(len(examples)*0.6):int(len(examples)*0.8)]
            eval_examples = examples[int(len(examples)*0.8):]
        else:
            train_examples, label_list = get_examples(data_path, 'train')
        num_labels = len(label_list) + 1
        num_train_optimization_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps) * epochs
        
        hidden_size = 300 if pretrained_path is None else (768 if 'base' in pretrained_path else 1024)
        device = 'cuda:0' if (torch.cuda.is_available() and not no_cuda) else 'cpu'
        logger.info(device)
        print(pretrained_path)
        if model_name == 'HERBERT':
            model = AutoTokenizerForTokenClassification(
                pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout,
                device=device)
        elif model_name == 'BERT_MULTILINGUAL':
            model = BertBaseMultilingualCased(
                pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout,
                device=device)
        elif model_name == 'Reformer':
            model = Reformer(n_labels=num_labels, hidden_size=512,
                             dropout=dropout, device=device, max_seq_length=max_seq_length,
                             batch_size=train_batch_size)
        else:
            model = XLMRForTokenClassification(pretrained_path=pretrained_path,
                                n_labels=num_labels, hidden_size=hidden_size,
                                dropout=dropout, device=device)

        model.to(device)
        no_decay = ['bias', 'final_layer_norm.weight']

        params = list(model.named_parameters())

        optimizer_grouped_parameters = [
            {'params': [p for n, p in params if not any(
                nd in n for nd in no_decay)], 'weight_decay': weight_decay},
            {'params': [p for n, p in params if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        warmup_steps = int(warmup_proportion * num_train_optimization_steps)
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)

        train_features = convert_examples_to_features(
            train_examples, label_list, max_seq_length, model.encode_word)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_data = create_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(
            train_data, sampler=train_sampler, batch_size=train_batch_size)
        if not split_train_data:
            val_examples, _ = get_examples(valid_path, 'valid')
        val_features = convert_examples_to_features(
            val_examples, label_list, max_seq_length, model.encode_word)

        val_data = create_dataset(val_features)
        
        best_val_f1 = 0.0

        for epoch_no in range(1, epochs+1):
            logger.info("Epoch %d" % epoch_no)
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            
            model.train()
            steps = len(train_dataloader)
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, label_ids, l_mask, valid_ids = batch
                loss = model(input_ids, label_ids, l_mask, valid_ids)
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if step % 5 == 0:
                    logger.info('Step = %d/%d; Loss = %.4f' % (step+1, steps, tr_loss / (step+1)))
                if (step + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

            logger.info("\nTesting on validation set...")
            f1, report = evaluate_model(model, val_data, label_list, eval_batch_size, device)
            print(report)
            if f1 > best_val_f1:
                best_val_f1 = f1
                logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1)
                logger.info("%s\n" % report)
                torch.save(model.state_dict(), open(os.path.join(output_dir, 'model.pt'), 'wb'))
                save_params(output_dir, dropout, num_labels, label_list)

            if epoch_save_model:
                epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no)
                os.makedirs(epoch_output_dir)
                torch.save(model.state_dict(), open(os.path.join(epoch_output_dir, 'model.pt'), 'wb'))
                save_params(epoch_output_dir, dropout, num_labels, label_list)
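Both this example and run_train below derive warmup_steps as a fraction of the total number of optimization steps rather than passing it directly. A small helper sketch of that calculation, with illustrative numbers only:

def warmup_steps_from_proportion(warmup_proportion, num_train_optimization_steps):
    # e.g. warmup_proportion=0.1 with 5000 optimization steps -> 500 warmup steps
    return int(warmup_proportion * num_train_optimization_steps)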
def run_train(args):
    # --------- data
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.train.pkl")
    train_examples = processor.create_examples(lines=train_data,
                                               example_type='train',
                                               cached_examples_file=config['data_dir'] / f"cached_train_examples_{args.arch}")
    train_features = processor.create_features(examples=train_examples,
                                               max_seq_len=args.train_max_seq_len,
                                               cached_features_file=config[
                                                                        'data_dir'] / "cached_train_features_{}_{}".format(
                                                   args.train_max_seq_len, args.arch
                                               ))
    train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted)
    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.valid.pkl")
    valid_examples = processor.create_examples(lines=valid_data,
                                               example_type='valid',
                                               cached_examples_file=config['data_dir'] / f"cached_valid_examples_{args.arch}")

    valid_features = processor.create_features(examples=valid_examples,
                                               max_seq_len=args.eval_max_seq_len,
                                               cached_features_file=config['data_dir'] / "cached_valid_features_{}_{}".format(
                                                   args.eval_max_seq_len, args.arch))
    valid_dataset = processor.create_dataset(valid_features)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.eval_batch_size)

    # ------- model
    logger.info("initializing model")
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = BertForMultiClass.from_pretrained(args.resume_path, num_labels=len(label_list))
    else:
        model = BertForMultiClass.from_pretrained(config['bert_model_dir'], num_labels=len(label_list))
    t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # ---- callbacks
    logger.info("initializing callbacks")
    train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch)
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'], mode=args.mode,
                                       monitor=args.monitor, arch=args.arch,
                                       save_best_only=args.save_best)

    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps * (
                    torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    trainer = Trainer(n_gpu=args.n_gpu,
                      model=model,
                      epochs=args.epochs,
                      logger=logger,
                      criterion=CrossEntropy(),
                      optimizer=optimizer,
                      lr_scheduler=lr_scheduler,
                      early_stopping=None,
                      training_monitor=train_monitor,
                      fp16=args.fp16,
                      resume_path=args.resume_path,
                      grad_clip=args.grad_clip,
                      model_checkpoint=model_checkpoint,
                      gradient_accumulation_steps=args.gradient_accumulation_steps,
                      evaluate=F1Score(),
                      class_report=ClassReport(target_names=[id2label[x] for x in range(len(label2id))]))
    trainer.train(train_data=train_dataloader, valid_data=valid_dataloader, seed=args.seed)
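WarmupLinearSchedule and the bundled AdamW used throughout these snippets come from pytorch-transformers; in later `transformers` releases the schedule class was replaced by the `get_linear_schedule_with_warmup` helper. A hedged migration sketch reusing the names computed in run_train above (optimizer_grouped_parameters, warmup_steps, t_total, args); the exact imports available depend on the installed version.

import torch
from transformers import get_linear_schedule_with_warmup

optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate, eps=args.adam_epsilon)
lr_scheduler = get_linear_schedule_with_warmup(optimizer,
                                               num_warmup_steps=warmup_steps,
                                               num_training_steps=t_total)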
Example #22
0
def main():
    torch.manual_seed(42)

    # Random
    #params = {'batch_size': 32, 'dropout': 0, 'hidden_dim': 128, 'learning_rate': 0.01, 'num_epochs': 5, 'num_layers': 2, 'oversample': False, 'soft_labels': False}
    # Glove
    params = {
        'batch_size': 32,
        'dropout': 0,
        'hidden_dim': 128,
        'learning_rate': 0.001,
        'num_epochs': 5,
        'num_layers': 2,
        'oversample': False,
        'soft_labels': False
    }
    # Random
    #params = {'batch_size': 32, 'dropout': 0, 'hidden_dim': 256, 'learning_rate': 0.0001, 'num_epochs': 5, 'num_layers': 3, 'oversample': False, 'soft_labels': False}

    #some params
    experiment_number = 1
    test_percentage = 0.1
    val_percentage = 0.2
    batch_size = params["batch_size"]
    num_epochs = 5  #params["num_epochs"]
    dropout = params["dropout"]
    embedding_dim = 300
    model_name = "CNN"  #'Bert' #"CNN" #"LSTM"
    unsupervised = True
    embedding = "Glove"  #"Random" ##"Glove" # "Both" #
    soft_labels = False
    combine = embedding == "Both"

    # LSTM parameters
    if model_name == "LSTM":
        hidden_dim = params["hidden_dim"]
        num_layers = params["num_layers"]

    # Bert parameter
    num_warmup_steps = 100
    num_total_steps = 1000
    if model_name == "Bert":
        embedding = "None"
    if embedding == "Both":
        combine = True
        embedding = "Random"
    else:
        combine = False
    learning_rate = params["learning_rate"]  #5e-5, 3e-5, 2e-5
    oversample_bool = False
    weighted_loss = True
    # load data
    dataset = Dataset("../data/cleaned_tweets_orig.csv",
                      use_embedding=embedding,
                      embedd_dim=embedding_dim,
                      combine=combine,
                      for_bert=(model_name == "Bert"))

    #dataset.oversample()
    train_data, val_test_data = split_dataset(dataset,
                                              test_percentage + val_percentage)
    val_data, test_data = split_dataset(
        val_test_data, test_percentage / (test_percentage + val_percentage))

    # print(len(train_data))
    #save_data(train_data, 'train')
    #save_data(test_data, 'test')

    #define loaders
    if oversample_bool:
        weights, targets = get_loss_weights(train_data, return_targets=True)
        class_sample_count = [
            1024 / 20, 13426, 2898 / 2
        ]  # dataset has 10 class-1 samples, 1 class-2 samples, etc.
        oversample_weights = 1 / torch.Tensor(class_sample_count)
        oversample_weights = oversample_weights[targets]
        # oversample_weights = torch.tensor([0.9414, 0.2242, 0.8344]) #torch.ones((3))-
        sampler = torch.utils.data.sampler.WeightedRandomSampler(
            oversample_weights, len(oversample_weights))
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=batch_size,
                                                   collate_fn=my_collate,
                                                   sampler=sampler)
    else:
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=batch_size,
                                                   collate_fn=my_collate)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=batch_size,
                                             collate_fn=my_collate)

    #define model
    if model_name == "CNN":
        vocab_size = len(dataset.vocab)
        model = CNN(vocab_size, embedding_dim, combine=combine)
    elif model_name == "LSTM":
        vocab_size = len(dataset.vocab)
        model = LSTM(vocab_size,
                     embedding_dim,
                     batch_size=batch_size,
                     hidden_dim=hidden_dim,
                     lstm_num_layers=num_layers,
                     combine=combine,
                     dropout=dropout)

    elif model_name == "Bert":
        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=3)
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=batch_size,
                                                   collate_fn=bert_collate)
        val_loader = torch.utils.data.DataLoader(val_data,
                                                 batch_size=batch_size,
                                                 collate_fn=bert_collate)

    #device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # LOSS: weighted cross-entropy loss, weighted by the class counts of the other classes
    if weighted_loss:
        weights = torch.tensor([0.9414, 0.2242, 0.8344], device=device)
    else:
        weights = torch.ones(3, device=device)
    #weights = torch.tensor([1.0, 1.0, 1.0], device = device) #get_loss_weights(train_data).to(device) # not to run again
    criterion = nn.CrossEntropyLoss(weight=weights)
    if soft_labels:
        criterion = weighted_soft_cross_entropy
    #latent model
    if unsupervised:
        vocab_size = len(dataset.vocab)
        criterion = nn.CrossEntropyLoss(weight=weights, reduction='none')
        model = Rationalisation_model(vocab_size,
                                      embedding_dim=embedding_dim,
                                      model=model_name,
                                      batch_size=batch_size,
                                      combine=combine,
                                      criterion=criterion)

    if not model_name == "Bert":
        model.embedding.weight.data.copy_(dataset.vocab.vectors)
        if combine:
            model.embedding_glove.weight.data.copy_(dataset.glove.vectors)

    #model to device
    model.to(device)

    #optimiser
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    if model_name == "Bert":
        optimizer = AdamW(model.parameters(),
                          lr=learning_rate,
                          correct_bias=False)
        # Linear scheduler for adaptive lr
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=num_warmup_steps,
                                         t_total=num_total_steps)
    else:
        scheduler = None

    plot_log = defaultdict(list)
    for epoch in range(num_epochs):
        #train and validate
        epoch_loss, epoch_acc = train_epoch(model,
                                            train_loader,
                                            optimizer,
                                            criterion,
                                            device,
                                            soft_labels=soft_labels,
                                            weights=weights,
                                            scheduler=scheduler,
                                            unsupervised=unsupervised)
        val_loss, val_acc = evaluate_epoch(model,
                                           val_loader,
                                           criterion,
                                           device,
                                           soft_labels=soft_labels,
                                           weights=weights,
                                           unsupervised=unsupervised)
        #save for plotting
        for name, point in zip(
            ["train_loss", "train_accuracy", "val_loss", "val_accuracy"],
            [epoch_loss, epoch_acc, val_loss, val_acc]):
            plot_log[name].append(point)  # append per-epoch values so each curve can be plotted
        #realtime feel
        print(f'Epoch: {epoch+1}')
        print(
            f'\tTrain Loss: {epoch_loss:.5f} | Train Acc: {epoch_acc*100:.2f}%'
        )
        print(f'\t Val. Loss: {val_loss:.5f} |  Val. Acc: {val_acc*100:.2f}%')
    sample_sentences_and_z(model, train_loader, device, dataset.vocab)
    #save plot
    results_directory = f'plots/{experiment_number}'
    os.makedirs(results_directory, exist_ok=True)
    for name, data in plot_log.items():
        save_plot(data, name, results_directory)
    #save model
    torch.save(model, os.path.join(results_directory, 'model_cnn.pth'))
    #confusion matrix and all that fun
    loss, acc, predictions, ground_truth = evaluate_epoch(
        model,
        val_loader,
        criterion,
        device,
        is_final=True,
        soft_labels=soft_labels,
        weights=weights,
        unsupervised=unsupervised)
    conf_matrix = confusion_matrix(ground_truth, predictions)
    class_report = classification_report(ground_truth, predictions)
    print('\nFinal Loss and Accuracy\n----------------\n')
    print(f'\t Val. Loss: {loss:.5f} |  Val. Acc: {acc*100:.2f}%')
    print('\nCONFUSION MATRIX\n----------------\n')
    print(conf_matrix)
    print('\nCLASSIFICATION REPORT\n---------------------\n')
    print(class_report)

    plot_confusion_matrix(ground_truth,
                          predictions,
                          classes=["Hate speech", "Offensive", "Neither"],
                          normalize=False,
                          title='Confusion matrix')
    plt.show()
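One detail worth noting in the Bert branch above: correct_bias=False tells the library's AdamW to skip Adam's bias-correction terms, which matches the optimizer used in the original TensorFlow BERT implementation. A hedged sketch of that pairing in isolation; model, learning_rate, num_warmup_steps and num_total_steps are the variables defined in main(), and AdamW / WarmupLinearSchedule are the same pytorch-transformers imports the examples already use.

optimizer = AdamW(model.parameters(), lr=learning_rate,
                  correct_bias=False)  # no Adam bias correction, as in the original BERT optimizer
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=num_warmup_steps,
                                 t_total=num_total_steps)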
    def train(self):
        # Model
        model = KobertBiLSTMCRF(config=self.model_config,
                                num_classes=len(self.tr_ds.ner_to_index))
        model.train()

        # optim
        train_examples_len = len(self.tr_ds)
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs
        t_total = len(
            self.tr_dl
        ) // self.model_config.gradient_accumulation_steps * self.model_config.epochs
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.model_config.learning_rate,
                          eps=self.model_config.adam_epsilon)
        scheduler = WarmupLinearSchedule(
            optimizer,
            warmup_steps=self.model_config.warmup_steps,
            t_total=t_total)

        device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        n_gpu = torch.cuda.device_count()
        # if n_gpu > 1:
        #     model = torch.nn.DataParallel(model)
        model.to(device)

        # save
        tb_writer = SummaryWriter('{}/runs'.format(self.model_dir))
        checkpoint_manager = CheckpointManager(self.model_dir)
        summary_manager = SummaryManager(self.model_dir)
        best_val_loss = 1e+10
        best_train_acc = 0

        # Train!
        self.logger.info("***** Running training *****")
        self.logger.info("  Num examples = %d", len(self.tr_ds))
        self.logger.info("  Num Epochs = %d", self.model_config.epochs)
        self.logger.info("  Instantaneous batch size per GPU = %d",
                         self.model_config.batch_size)
        # logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
        #                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
        self.logger.info("  Gradient Accumulation steps = %d",
                         self.model_config.gradient_accumulation_steps)
        self.logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        best_dev_acc, best_dev_loss = 0.0, 99999999999.0
        best_steps = 0
        model.zero_grad()
        self.set_seed()  # Added here for reproducibility (even between python 2 and 3)

        # Train
        train_iterator = trange(int(self.model_config.epochs), desc="Epoch")
        for _epoch, _ in enumerate(train_iterator):
            epoch_iterator = tqdm(
                self.tr_dl,
                desc="Iteration")  # , disable=args.local_rank not in [-1, 0]
            epoch = _epoch
            for step, batch in enumerate(epoch_iterator):
                model.train()
                x_input, token_type_ids, y_real = map(
                    lambda elm: elm.to(device), batch)
                log_likelihood, sequence_of_tags = model(
                    x_input, token_type_ids, y_real)

                # loss: negative log-likelihood
                loss = -1 * log_likelihood

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if self.model_config.gradient_accumulation_steps > 1:
                    loss = loss / self.model_config.gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               self.model_config.max_grad_norm)
                tr_loss += loss.item()

                if (step + 1) % self.model_config.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    with torch.no_grad():
                        sequence_of_tags = torch.tensor(sequence_of_tags)
                        print("sequence_of_tags: ", sequence_of_tags)
                        print("y_real: ", y_real)
                        print("loss: ", loss)
                        print("(sequence_of_tags == y_real): ",
                              (sequence_of_tags == y_real))
                        _tags = torch.squeeze(sequence_of_tags, dim=0)
                        mb_acc = (_tags == y_real).float()[
                            y_real != self.vocab.PAD_ID].mean()
                        #mb_acc = (sequence_of_tags == y_real).float()[y_real != self.vocab.PAD_ID].mean()

                    tr_acc = mb_acc.item()
                    tr_loss_avg = tr_loss / global_step
                    tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                    # if step % 50 == 0:
                    print(
                        'epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'
                        .format(epoch + 1, global_step, tr_summary['loss'],
                                tr_summary['acc']))

                    if self.model_config.logging_steps > 0 and global_step % self.model_config.logging_steps == 0:
                        # Log metrics
                        if self.model_config.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                            pass
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                             self.model_config.logging_steps,
                                             global_step)
                        self.logger.info(
                            "Average loss: %s at global step: %s",
                            str((tr_loss - logging_loss) /
                                self.model_config.logging_steps),
                            str(global_step))
                        logging_loss = tr_loss

                    if self.model_config.save_steps > 0 and global_step % self.model_config.save_steps == 0:

                        eval_summary, list_of_y_real, list_of_pred_tags = self.evaluate(
                            model, self.val_dl)

                        # Save model checkpoint
                        output_dir = os.path.join(self.model_config.output_dir,
                                                  'epoch-{}'.format(epoch + 1))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        self.logger.info("Saving model checkpoint to %s",
                                         output_dir)

                        state = {
                            'global_step': global_step + 1,
                            'model_state_dict': model.state_dict(),
                            'opt_state_dict': optimizer.state_dict()
                        }
                        summary = {'train': tr_summary}
                        summary_manager.update(summary)
                        summary_manager.save('summary.json')

                        is_best = tr_acc >= best_train_acc  # selected by accuracy (this should really use val_acc, not train_acc)
                        # Save
                        if is_best:
                            best_train_acc = tr_acc
                            checkpoint_manager.save_checkpoint(
                                state,
                                'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(
                                    epoch + 1, global_step, tr_acc))

                            print(
                                "Saving model checkpoint as best-epoch-{}-step-{}-acc-{:.3f}.bin"
                                .format(epoch + 1, global_step, tr_acc))

                            # print classification report and save confusion matrix
                            # (use tr_acc so the filenames match the checkpoint saved above;
                            #  best_dev_acc is never updated in this loop)
                            cr_save_path = self.model_dir + '/best-epoch-{}-step-{}-acc-{:.3f}-cr.csv'.format(
                                epoch + 1, global_step, tr_acc)
                            cm_save_path = self.model_dir + '/best-epoch-{}-step-{}-acc-{:.3f}-cm.png'.format(
                                epoch + 1, global_step, tr_acc)

                            self.save_cr_and_cm(list_of_y_real,
                                                list_of_pred_tags,
                                                cr_save_path=cr_save_path,
                                                cm_save_path=cm_save_path)
                        else:
                            torch.save(
                                state,
                                os.path.join(
                                    output_dir,
                                    'model-epoch-{}-step-{}-acc-{:.3f}.bin'.
                                    format(epoch + 1, global_step, tr_acc)))

        tb_writer.close()
        self.logger.info(" global_step = %s, average loss = %s", global_step,
                         tr_loss / global_step)
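For reference, the learning-rate multiplier that WarmupLinearSchedule applies on top of the optimizer's base lr ramps linearly from 0 to 1 over warmup_steps and then decays linearly back to 0 at t_total. A standalone sketch of that multiplier (just the formula, not library code):

def linear_warmup_decay(step, warmup_steps, t_total):
    # Multiplier applied to the optimizer's base learning rate at `step`.
    if step < warmup_steps:
        return step / max(1, warmup_steps)  # linear warmup: 0 -> 1
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))  # linear decay: 1 -> 0

# e.g. warmup_steps=2, t_total=10 gives multipliers
# 0.0, 0.5, 1.0, 0.875, 0.75, ..., 0.0 over steps 0..10.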
Example #24
0
def train(args, train_iter, dev, test, src_field, tgt_field, tag_field,
          checkpoint):
    # srcpadid = src_field.vocab.stoi['<pad>']
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    model = Classify_Extractor(args, tgt_field)

    if torch.cuda.is_available():
        model.cuda()

    print_params(model)

    decay = args.decay

    if args.optimizer == 'bert':
        weight_decay = 0.0
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        opt = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8)
        totalnum = 0
        for i in train_iter:
            totalnum += 1
        #print(args.lr)
        #print(args.maximum_steps)
        #exit()
        t_total = totalnum // decay * args.maximum_steps
        scheduler = WarmupLinearSchedule(opt, warmup_steps=0, t_total=t_total)
    else:
        opt = torch.optim.Adadelta(model.parameters(), lr=args.lr)

    best_e = 0.0
    best_c = 0.0
    best_epoch_for_c = 0
    best_epoch_for_e = 0
    offset = 0.0
    pre_epoch = 0
    patience_c = 0
    patience_e = 0

    if checkpoint is not None:
        print('model.load_state_dict(checkpoint[model])')
        model.load_state_dict(checkpoint['model'])
        if args.resume:
            opt.load_state_dict(checkpoint['optim'])

            best_f = checkpoint['f']
            offset = checkpoint['iters']
            pre_epoch = checkpoint['epoch']

            print('*************************************')
            print('resume from {} epoch {} iters and best_f {}'.format(
                pre_epoch, offset, best_f))
            print('*************************************')

    print("**************start training****************")
    start = time.time()

    for epoch in range(args.maxepoch):
        train_iter.init_epoch()
        epoch += pre_epoch

        for iters, train_batch in enumerate(train_iter):
            iters += offset
            model.train()
            # model.zero_grad()
            # model.constrain_transition()
            t1 = time.time()
            batch_src = train_batch.src
            #print(batch_src)
            #exit()
            src = [tokenizer.convert_tokens_to_ids(s) for s in batch_src]
            maxlen = max([len(s) for s in batch_src])

            src_mask = []
            padded_sents = []
            for s in src:
                new_s = s + [0] * (maxlen - len(s))
                padded_sents.append(new_s)
                mask = [1] * len(s) + [0] * (maxlen - len(s))
                src_mask.append(mask)
            # B T
            src = torch.tensor(padded_sents).long().cuda()
            # B T
            src_mask = torch.tensor(src_mask).byte().cuda()
            # src, src_mask = prepare_src(train_batch.src, srcpadid)
            tgt = prepare_tgt(train_batch.tgt)
            tag = train_batch.tag

            loss = model(src, src_mask, tgt, tag)

            # "update parameters"

            if decay > 1:
                loss = loss / decay

            loss.backward()

            # if args.grad_clip:
            #     torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)

            if (iters + 1) % decay == 0:
                opt.step()
                scheduler.step()  # Update learning rate schedule
                opt.zero_grad()

            # opt.step()

            t2 = time.time()

            loss = loss.item()

            print("epoch:{} iters:{} src:({},{}) tgt:({},{}) "
                  "loss:{:.2f} t:{:.2f}".format(epoch + 1, iters + 1,
                                                *src.size(), *tgt.size(), loss,
                                                t2 - t1))

        # if torch.cuda.is_available():
        #     torch.cuda.empty_cache()

        if (epoch + 1) % 1 == 0:
            print("=============validate model==============")
            with torch.no_grad():
                dev.init_epoch()
                model.eval()
                # model.constrain_transition()
                sents = []
                cy_true = []
                cy_pred = []
                for j, dev_batch in enumerate(dev):
                    t1 = time.time()
                    # src, src_mask = prepare_src(dev_batch.src, srcpadid)
                    batch_src = dev_batch.src
                    src = [
                        tokenizer.convert_tokens_to_ids(s) for s in batch_src
                    ]
                    maxlen = max([len(s) for s in batch_src])

                    src_mask = []
                    padded_sents = []
                    for s in src:
                        new_s = s + [0] * (maxlen - len(s))
                        padded_sents.append(new_s)
                        mask = [1] * len(s) + [0] * (maxlen - len(s))
                        src_mask.append(mask)
                    # B T
                    src = torch.tensor(padded_sents).long().cuda()
                    # B T
                    src_mask = torch.tensor(src_mask).byte().cuda()

                    tgt = prepare_tgt(dev_batch.tgt)
                    tag = dev_batch.tag.squeeze(-1)
                    _, pre_tag = model.component_extraction(src, src_mask)
                    pre_ctag = model.simile_classify(src, src_mask)
                    cy_true.extend(tag.tolist())
                    cy_pred.extend(pre_ctag.tolist())

                    for sen, tags, p_tags, c_tags in zip(
                            src, tgt, pre_tag, tag):
                        sen = sen[:len(p_tags)].tolist()
                        tags = tags[:len(p_tags)].tolist()
                        if c_tags == 1:
                            sents.append([
                                sen, [tgt_field.vocab.itos[t] for t in tags],
                                [tgt_field.vocab.itos[t] for t in p_tags]
                            ])
                    print('dev iters: {}, t:{}'.format(j, time.time() - t1))

                _, eprecision, erecall, ef1 = evaluate(sents)

                cprecision = precision_score(cy_true, cy_pred)
                crecall = recall_score(cy_true, cy_pred)
                cf1 = f1_score(cy_true, cy_pred)

                print(
                    'epoch: {} classify--> precision: {} recall: {} f1: {} best:{}'
                    .format(epoch + 1, cprecision, crecall, cf1, best_c))
                print('extractor--> precision: {} recall: {} f1: {} best: {}'.
                      format(eprecision, erecall, ef1, best_e))

                if cf1 > best_c:
                    best_c = cf1
                    best_epoch_for_c = epoch + 1

                    print('save best classifier model at epoch={}'.format(epoch + 1))
                    checkpoint = {
                        'model': model.state_dict(),
                        'optim': opt.state_dict(),
                        'args': args
                    }
                    torch.save(
                        checkpoint, '{}/{}.classify.best.pt'.format(
                            args.model_path, args.model))
                    patience_c = 0
                else:
                    patience_c += 1

                if ef1 > best_e:
                    best_e = ef1
                    best_epoch_for_e = epoch + 1

                    print('save best extractor model at epoch={}'.format(epoch + 1))
                    checkpoint = {
                        'model': model.state_dict(),
                        'optim': opt.state_dict(),
                        'args': args
                    }
                    torch.save(
                        checkpoint, '{}/{}.extractor.best.pt'.format(
                            args.model_path, args.model))
                    patience_e = 0
                else:
                    patience_e += 1

        if patience_c > args.patience and patience_e > args.patience:
            print("early stop at {}".format(epoch))
            break

        if args.decay:
            opt.param_groups[0]['lr'] = opt.param_groups[0]['lr'] * args.decay

    print('*******Done********{}'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    minutes = (time.time() - start) // 60
    if minutes < 60:
        print(
            'best_c:{}, best_e:{} best_epoch_c:{}, best_epoch_e:{}, time:{} mins'
            .format(best_c, best_e, best_epoch_for_c, best_epoch_for_e,
                    minutes))
    else:
        hours = minutes / 60
        print(
            'best_c:{}, best_e:{} best_epoch_c:{}, best_epoch_e:{}, time:{:.1f} hours'
            .format(best_c, best_e, best_epoch_for_c, best_epoch_for_e, hours))

    print('*******Testing************')
    model1 = Classify_Extractor(args, tgt_field)
    model1.cuda()
    load_from = '{}/{}.classify.best.pt'.format(args.model_path, args.model)
    print('load the best model {}'.format(load_from))
    checkpoint = torch.load(load_from, map_location='cpu')
    print('load parameters')
    model1.load_state_dict(checkpoint['model'])

    model2 = Classify_Extractor(args, tgt_field)
    model2.cuda()
    load_from = '{}/{}.extractor.best.pt'.format(args.model_path, args.model)
    print('load the best model {}'.format(load_from))
    checkpoint = torch.load(load_from, map_location='cpu')
    print('load parameters')
    model2.load_state_dict(checkpoint['model'])
    with torch.no_grad():
        test.init_epoch()
        model1.eval()
        model2.eval()
        sents = []
        cy_true = []
        cy_pred = []
        for j, test_batch in enumerate(test):
            t1 = time.time()
            # src, src_mask = prepare_src(test_batch.src, srcpadid)
            batch_src = test_batch.src
            src = [tokenizer.convert_tokens_to_ids(s) for s in batch_src]
            maxlen = max([len(s) for s in batch_src])

            src_mask = []
            padded_sents = []
            for s in src:
                new_s = s + [0] * (maxlen - len(s))
                padded_sents.append(new_s)
                mask = [1] * len(s) + [0] * (maxlen - len(s))
                src_mask.append(mask)
            # B T
            src = torch.tensor(padded_sents).long().cuda()
            # B T
            src_mask = torch.tensor(src_mask).byte().cuda()

            tgt = prepare_tgt(test_batch.tgt)
            tag = test_batch.tag.squeeze(-1)
            _, pre_tag = model2.component_extraction(src, src_mask)
            pre_ctag = model1.simile_classify(src, src_mask)
            cy_true.extend(tag.tolist())
            cy_pred.extend(pre_ctag.tolist())

            # for sen, tags, p_tags in zip(src, tgt, pre_tag):
            #     sen = sen[:len(p_tags)].tolist()
            #     tags = tags[:len(p_tags)].tolist()
            #     sents.append([sen, [tgt_field.vocab.itos[t] for t in tags],
            #                  [tgt_field.vocab.itos[t] for t in p_tags]])
            for sen, tags, p_tags, c_tags in zip(src, tgt, pre_tag, pre_ctag):
                sen = sen[:len(p_tags)].tolist()
                tags = tags[:len(p_tags)].tolist()
                if c_tags == 1:
                    sents.append([
                        sen, [tgt_field.vocab.itos[t] for t in tags],
                        [tgt_field.vocab.itos[t] for t in p_tags]
                    ])
                elif c_tags == 0:
                    sents.append([
                        sen, [tgt_field.vocab.itos[t] for t in tags],
                        ['O' for t in p_tags]
                    ])

            print('test iters: {}, t:{}'.format(j, time.time() - t1))

        _, eprecision, erecall, ef1 = evaluate(sents)

        cprecision = precision_score(cy_true, cy_pred)
        crecall = recall_score(cy_true, cy_pred)
        cf1 = f1_score(cy_true, cy_pred)

        print('Testing classify--> precision: {} recall: {} f1: {}'.format(
            cprecision, crecall, cf1))
        print('extractor--> precision: {} recall: {} f1: {}'.format(
            eprecision, erecall, ef1))
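Several of the examples above size t_total the same way: one optimizer/scheduler update happens every N micro-batches, so the total number of updates is the per-epoch batch count divided by the accumulation factor, times the number of epochs. A small helper sketch with illustrative names:

def total_optimization_steps(batches_per_epoch, gradient_accumulation_steps, epochs):
    # This is the t_total to pass to WarmupLinearSchedule when gradients
    # are accumulated over `gradient_accumulation_steps` micro-batches.
    return (batches_per_epoch // gradient_accumulation_steps) * epochs

# e.g. 1000 batches/epoch, accumulation of 4, 3 epochs -> t_total = 750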
Example #25
0
    def train(self):
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        data_splitList = DATACQA.load_data(os.path.join(self.data_dir, 'train.csv'), n_splits=5)
        for split_index, each_data in enumerate(data_splitList):
            # Prepare model
            config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels)
            model = BertForSequenceClassification.from_pretrained(self.model_name_or_path, self.args, config=config)
            model.to(self.device)

            logger.info(f'Fold {split_index + 1}')
            train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader(each_data)

            num_train_optimization_steps = self.train_steps

            # Prepare optimizer

            param_optimizer = list(model.named_parameters())

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': self.weight_decay},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

            optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
            scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

            global_step = 0

            logger.info("***** Running training *****")
            logger.info("  Num examples = %d", len(train_examples))
            logger.info("  Batch size = %d", self.train_batch_size)
            logger.info("  Num steps = %d", num_train_optimization_steps)

            best_acc = 0
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            train_dataloader = cycle(train_dataloader)

            for step in range(num_train_optimization_steps):
                batch = next(train_dataloader)
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                tr_loss += loss.item()
                train_loss = round(tr_loss / (nb_tr_steps + 1), 4)

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                loss.backward()

                if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # step the schedule after the optimizer update
                    optimizer.zero_grad()
                    global_step += 1

                if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                    tr_loss = 0
                    nb_tr_examples, nb_tr_steps = 0, 0
                    logger.info("***** Report result *****")
                    logger.info("  %s = %s", 'global_step', str(global_step))
                    logger.info("  %s = %s", 'train loss', str(train_loss))

                if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                    for file in ['dev.csv']:
                        inference_labels = []
                        gold_labels = []
                        inference_logits = []
                        scores = []
                        questions = [x.text_a for x in eval_examples]

                        logger.info("***** Running evaluation *****")
                        logger.info("  Num examples = %d", len(eval_examples))
                        logger.info("  Batch size = %d", self.eval_batch_size)

                        # Run prediction for full data

                        model.eval()
                        eval_loss, eval_accuracy = 0, 0
                        nb_eval_steps, nb_eval_examples = 0, 0
                        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                            input_ids = input_ids.to(self.device)
                            input_mask = input_mask.to(self.device)
                            segment_ids = segment_ids.to(self.device)
                            label_ids = label_ids.to(self.device)

                            with torch.no_grad():
                                tmp_eval_loss = model(
                                    input_ids=input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
                                logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                            logits = logits.detach().cpu().numpy()
                            label_ids = label_ids.to('cpu').numpy()
                            inference_labels.append(np.argmax(logits, axis=1))
                            scores.append(logits)
                            gold_labels.append(label_ids)
                            inference_logits.append(logits)
                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_examples += input_ids.size(0)
                            nb_eval_steps += 1

                        gold_labels = np.concatenate(gold_labels, 0)
                        inference_logits = np.concatenate(inference_logits, 0)
                        scores = np.concatenate(scores, 0)
                        model.train()
                        eval_loss = eval_loss / nb_eval_steps
                        eval_accuracy = accuracyCQA(inference_logits, gold_labels)
                        eval_mrr = compute_MRR_CQA(scores, gold_labels, questions)
                        eval_5R20 = compute_5R20(scores, gold_labels, questions)

                        result = {'eval_loss': eval_loss,
                                  'eval_F1': eval_accuracy,
                                  'eval_MRR':eval_mrr,
                                  'eval_5R20':eval_5R20,
                                  'global_step': global_step,
                                  'loss': train_loss}

                        output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                        with open(output_eval_file, "a") as writer:
                            for key in sorted(result.keys()):
                                logger.info("  %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" % (key, str(result[key])))
                            writer.write('*' * 80)
                            writer.write('\n')
                        if eval_accuracy > best_acc :
                            print("=" * 80)
                            print("Best F1", eval_accuracy)
                            print("Saving Model......")
                            best_acc = eval_accuracy
                            # Save a trained model
                            model_to_save = model.module if hasattr(model,'module') else model
                            output_model_file = os.path.join(self.output_dir, "pytorch_model_{}.bin".format(split_index))
                            torch.save(model_to_save.state_dict(), output_model_file)
                            print("=" * 80)
                        else:
                            print("=" * 80)

            del model
            gc.collect()
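A minimal standalone sketch of the gradient-accumulation update used in the loop above: the loss is scaled by the number of accumulation steps so the summed gradient matches a single large batch, and the optimizer step runs before the scheduler step (the order PyTorch expects). The run_accumulated_steps name and the cross-entropy loss are illustrative assumptions, not part of the example.

import torch.nn.functional as F

def run_accumulated_steps(model, dataloader, optimizer, scheduler, accumulation_steps=2):
    """Accumulate gradients over `accumulation_steps` mini-batches before each update."""
    model.train()
    for step, (inputs, labels) in enumerate(dataloader):
        loss = F.cross_entropy(model(inputs), labels)
        (loss / accumulation_steps).backward()  # scale so the accumulated gradient averages out
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()       # apply the accumulated update
            scheduler.step()       # then advance the learning-rate schedule
            optimizer.zero_grad()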
Example #26
0
def train(model, criterion, dataset, logger, train_csv_logger, val_csv_logger,
          test_csv_logger, args, epoch_offset):
    model = model.cuda()

    # Process the per-group generalization adjustments
    adjustments = [float(c) for c in args.generalization_adjustment.split(',')]
    assert len(adjustments) in (1, dataset['train_data'].n_groups)
    if len(adjustments) == 1:
        adjustments = np.array(adjustments * dataset['train_data'].n_groups)
    else:
        adjustments = np.array(adjustments)

    train_loss_computer = LossComputer(
        criterion,
        is_robust=args.robust,
        dataset=dataset['train_data'],
        alpha=args.alpha,
        gamma=args.gamma,
        adj=adjustments,
        step_size=args.robust_step_size,
        normalize_loss=args.use_normalized_loss,
        btl=args.btl,
        min_var_weight=args.minimum_variational_weight,
        sp=args.sp,
        mode=args.mode,
        ratio=args.ratio)

    # BERT uses its own scheduler and optimizer
    if args.model == 'bert':
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.lr,
                          eps=args.adam_epsilon)
        t_total = len(dataset['train_loader']) * args.n_epochs
        print(f'\nt_total is {t_total}\n')
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)
    else:
        if args.adam:
            optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                model.parameters()),
                                         lr=args.lr,
                                         weight_decay=args.weight_decay)
        else:
            optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                               model.parameters()),
                                        lr=args.lr,
                                        momentum=0.9,
                                        weight_decay=args.weight_decay)
        if args.scheduler:
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                'min',
                factor=0.1,
                patience=5,
                threshold=0.0001,
                min_lr=0,
                eps=1e-08)
        else:
            scheduler = None

    best_val_acc = 0
    for epoch in range(epoch_offset, epoch_offset + args.n_epochs):
        logger.write('\nEpoch [%d]:\n' % epoch)
        logger.write(f'Training:\n')
        run_epoch(epoch,
                  model,
                  optimizer,
                  dataset['train_loader'],
                  train_loss_computer,
                  logger,
                  train_csv_logger,
                  args,
                  is_training=True,
                  show_progress=args.show_progress,
                  log_every=args.log_every,
                  scheduler=scheduler)

        logger.write(f'\nValidation:\n')
        val_loss_computer = LossComputer(criterion,
                                         is_robust=args.robust,
                                         dataset=dataset['val_data'],
                                         step_size=args.robust_step_size,
                                         alpha=args.alpha)
        run_epoch(epoch,
                  model,
                  optimizer,
                  dataset['val_loader'],
                  val_loss_computer,
                  logger,
                  val_csv_logger,
                  args,
                  is_training=False)

        # Test set; don't print to avoid peeking
        # if dataset['test_data'] is not None:
        #     test_loss_computer = LossComputer(
        #         criterion,
        #         is_robust=args.robust,
        #         dataset=dataset['test_data'],
        #         step_size=args.robust_step_size,
        #         alpha=args.alpha)
        #     run_epoch(
        #         epoch, model, optimizer,
        #         dataset['test_loader'],
        #         test_loss_computer,
        #         None, test_csv_logger, args,
        #         is_training=False)

        # Inspect learning rates
        if (epoch + 1) % 1 == 0:
            for param_group in optimizer.param_groups:
                curr_lr = param_group['lr']
                logger.write('Current lr: %f\n' % curr_lr)

        if args.scheduler and args.model != 'bert':
            if args.robust:
                val_loss, _ = val_loss_computer.compute_robust_loss_greedy(
                    val_loss_computer.avg_group_loss,
                    val_loss_computer.avg_group_loss)
            else:
                val_loss = val_loss_computer.avg_actual_loss
            scheduler.step(val_loss)  # scheduler step to update lr at the end of epoch

        if epoch % args.save_step == 0:
            torch.save(model, os.path.join(args.log_dir,
                                           '%d_model.pth' % epoch))

        if args.save_last:
            torch.save(model, os.path.join(args.log_dir, 'last_model.pth'))

        if args.save_best:
            if args.robust or args.reweight_groups:
                curr_val_acc = min(val_loss_computer.avg_group_acc)
            else:
                curr_val_acc = val_loss_computer.avg_acc
            logger.write(f'Current validation accuracy: {curr_val_acc}\n')
            if curr_val_acc > best_val_acc:
                best_val_acc = curr_val_acc
                torch.save(model, os.path.join(args.log_dir, 'best_model.pth'))
                logger.write(f'Best model saved at epoch {epoch}\n')

        if args.automatic_adjustment:
            gen_gap = val_loss_computer.avg_group_loss - train_loss_computer.exp_avg_loss
            adjustments = gen_gap * torch.sqrt(
                train_loss_computer.group_counts)
            train_loss_computer.adj = adjustments
            logger.write('Adjustments updated\n')
            for group_idx in range(train_loss_computer.n_groups):
                logger.write(
                    f'  {train_loss_computer.get_group_name(group_idx)}:\t'
                    f'adj = {train_loss_computer.adj[group_idx]:.3f}\n')
        logger.write('\n')
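For reference, a minimal sketch of the learning-rate multiplier that a linear warmup-then-decay schedule such as WarmupLinearSchedule applies on top of the base learning rate; the standalone function form and its name are illustrative, not the library's API.

def warmup_linear_factor(step, warmup_steps, t_total):
    """Ramp linearly from 0 to 1 over `warmup_steps`, then decay linearly to 0 at `t_total`."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))

# Example: with warmup_steps=2 and t_total=10 the factor peaks at step 2 and reaches 0 at step 10.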
Example #27
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--test_file", default='', type=str, help="Test file")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--validate_per_epoch",
                        default=3,
                        type=int,
                        help="validations number per epoch")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_validation",
                        action='store_true',
                        help="Whether to run validation.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--drop",
                        default=0.1,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.do_train:
        logger.addHandler(
            logging.FileHandler(os.path.join(args.output_dir, "train.log"),
                                'w'))
    else:
        logger.addHandler(
            logging.FileHandler(os.path.join(args.output_dir, "eval.log"),
                                'w'))

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_tag_labels(args.data_dir)
    global EVAL_TAGS
    EVAL_TAGS = [
        label for label in label_list if label not in ['O', '[CLS]', '[SEP]']
    ]
    # EVAL_TAGS = [f'{x}-{y}' for x in ['B', 'I'] for y in EVAL_TAGS]
    logger.info(EVAL_TAGS)
    num_labels = len(label_list) + 1
    allowed_tags = set(EVAL_TAGS + ['O'])

    do_lower_case = 'uncased' in args.bert_model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model,
                                        num_labels=num_labels,
                                        finetuning_task=args.task_name,
                                        hidden_dropout_prob=args.drop)
    print(config)
    model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    best_dev = 0.0
    if args.do_validation:
        dev_examples = processor.get_dev_examples(args.data_dir, label_list)
        dev_features = convert_examples_to_features(dev_examples, label_list,
                                                    args.max_seq_length,
                                                    tokenizer)
        logger.info("***** Dev set *****")
        logger.info("  Num examples = %d", len(dev_examples))
        all_input_ids = torch.tensor([f.input_ids for f in dev_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in dev_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in dev_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in dev_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in dev_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in dev_features],
                                     dtype=torch.long)
        dev_data = TensorDataset(all_input_ids, all_input_mask,
                                 all_segment_ids, all_label_ids, all_valid_ids,
                                 all_lmask_ids)
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data,
                                    sampler=dev_sampler,
                                    batch_size=args.eval_batch_size)
        validation_steps = int(
            len(train_examples) /
            args.train_batch_size) // args.validate_per_epoch
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_valid_ids, all_lmask_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        start_time = time.time()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids,
                             segment_ids,
                             input_mask,
                             label_ids,
                             valid_ids,
                             l_mask,
                             device=device)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                if args.do_validation and (step + 1) % validation_steps == 0:
                    logger.info(
                        'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'
                        .format(epoch, step + 1, len(train_dataloader),
                                time.time() - start_time,
                                tr_loss / nb_tr_steps))
                    model.eval()
                    y_true = []
                    y_pred = []
                    label_map = {
                        i: label
                        for i, label in enumerate(label_list, 1)
                    }
                    label_map[0] = '[PAD]'
                    for batch in tqdm(dev_dataloader, desc='Validation'):
                        batch = tuple(t.to(device) for t in batch)
                        input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                        with torch.no_grad():
                            logits = model(input_ids,
                                           segment_ids,
                                           input_mask,
                                           None,
                                           valid_ids,
                                           l_mask,
                                           device=device)
                        logits = torch.argmax(F.log_softmax(logits, dim=2),
                                              dim=2)
                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        # input_mask = input_mask.to('cpu').numpy()

                        for i, label in enumerate(label_ids):
                            temp_1 = []
                            temp_2 = []
                            for j, m in enumerate(label):
                                if j == 0:
                                    continue
                                elif label_ids[i][j] == len(label_map) - 1:
                                    y_true.append(temp_1)
                                    y_pred.append(temp_2)
                                    break
                                else:
                                    temp_1.append(label_map[label_ids[i][j]])
                                    temp_2.append(label_map[logits[i][j]])

                    y_true_copy = [[
                        x if x in allowed_tags else 'O' for x in y
                    ] for y in y_true]
                    y_pred_copy = [[
                        x if x in allowed_tags else 'O' for x in y
                    ] for y in y_pred]
                    report = classification_report(y_true_copy,
                                                   y_pred_copy,
                                                   digits=6)
                    # report_dict = classification_report(y_true_copy,
                    #                                     y_pred_copy,
                    #                                     output_dict=True)
                    # report_dict = report
                    logger.info("***** Validation results *****")
                    logger.info("\n%s", report)

                    fscore = float([
                        line.strip().split()[4] for line in report.split('\n')
                        if line.strip().startswith('micro')
                    ][0])

                    if fscore > best_dev:
                        logger.info(f'!!!Best dev: {fscore}')
                        logger.info(f'at epoch: {epoch}')
                        best_dev = fscore
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        model_to_save.save_pretrained(args.output_dir)
                        tokenizer.save_pretrained(args.output_dir)
                        label_map = {
                            i: label
                            for i, label in enumerate(label_list, 1)
                        }
                        label_map[0] = '[PAD]'
                        model_config = {
                            "bert_model": args.bert_model,
                            "do_lower": args.do_lower_case,
                            "max_seq_length": args.max_seq_length,
                            "num_labels": len(label_list) + 1,
                            "label_map": label_map
                        }
                        json.dump(
                            model_config,
                            open(
                                os.path.join(args.output_dir,
                                             "model_config.json"), "w"))
                    model.train()

    model = Ner.from_pretrained(args.output_dir)
    do_lower_case = 'uncased' in args.bert_model
    tokenizer = BertTokenizer.from_pretrained(args.output_dir,
                                              do_lower_case=do_lower_case)
    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        args.test_file = os.path.join(
            args.data_dir,
            'test.json') if args.test_file == '' else args.test_file
        eval_examples = processor.get_test_examples(args.test_file)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids,
                                  all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        tag_scores = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        label_map[0] = '[PAD]'
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids,
                               segment_ids,
                               input_mask,
                               valid_ids=valid_ids,
                               attention_mask_label=l_mask,
                               device=device)

            scores = np.max(F.softmax(logits, dim=-1).cpu().numpy(), axis=-1)
            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            # scores = scores.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                temp_3 = []
                for j, m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map) - 1:
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        tag_scores.append(temp_3)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])
                        temp_3.append(scores[i][j])

        y_true_copy = [[x if x in allowed_tags else 'O' for x in y]
                       for y in y_true]
        y_pred_copy = [[x if x in allowed_tags else 'O' for x in y]
                       for y in y_pred]

        report = classification_report(y_true_copy,
                                       y_pred_copy, digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(
            args.output_dir,
            f"{args.test_file.split('/')[-1]}_eval_results.txt")
        output_preds_file = os.path.join(
            args.output_dir,
            f"{args.test_file.split('/')[-1]}_predictions.tsv")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)

        prediction_results = {
            'id': [ex.guid for ex in eval_examples],
            'token': [ex.text_a for ex in eval_examples],
            'tag_label': [' '.join(ex.label) for ex in eval_examples],
            'tag_pred': [' '.join(pred) for pred in y_pred],
            'scores':
            [' '.join([str(x) for x in score]) for score in tag_scores]
        }
        pd.DataFrame(prediction_results).to_csv(output_preds_file,
                                                sep='\t',
                                                index=False)
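The decay/no-decay parameter grouping appears in nearly every example above; a minimal standalone sketch of the pattern (the helper name and default weight_decay value are illustrative assumptions):

def build_optimizer_groups(model, weight_decay=0.01,
                           no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight')):
    """Exclude biases and LayerNorm parameters from weight decay."""
    params = list(model.named_parameters())
    return [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

# Usage sketch: optimizer = AdamW(build_optimizer_groups(model), lr=5e-5, eps=1e-8)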
Example #28
0
def main(args):

    assert args.use_one_optim is True

    if args.use_cls_only:
        args.no_dial = True

    print("### use_cls_only: {:}".format(args.use_cls_only))
    print("### no_dial: {:}".format(args.no_dial))

    if args.recover_e > 0:
        raise NotImplementedError("This option is from my oldest code version. "
                                  "I have not checked it for this code version.")

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(0, 100000)), 1)[0]

    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)

    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)

        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)

    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_path = os.path.join(args.data_root, "train.pt")
    dev_path = os.path.join(args.data_root, "dev.pt")
    test_path = os.path.join(args.data_root, "test.pt")

    if not os.path.exists(test_path):
        test_data_raw = prepare_dataset(data_path=args.test_data_path,
                                        tokenizer=tokenizer,
                                        slot_meta=slot_meta,
                                        n_history=args.n_history,
                                        max_seq_length=args.max_seq_length,
                                        op_code=args.op_code)
        torch.save(test_data_raw, test_path)
    else:
        test_data_raw = torch.load(test_path)

    print("# test examples %d" % len(test_data_raw))

    if not os.path.exists(train_path):
        train_data_raw = prepare_dataset(data_path=args.train_data_path,
                                         tokenizer=tokenizer,
                                         slot_meta=slot_meta,
                                         n_history=args.n_history,
                                         max_seq_length=args.max_seq_length,
                                         op_code=args.op_code)

        torch.save(train_data_raw, train_path)
    else:
        train_data_raw = torch.load(train_path)

    train_data = MultiWozDataset(train_data_raw,
                                 tokenizer,
                                 slot_meta,
                                 args.max_seq_length,
                                 rng,
                                 ontology,
                                 args.word_dropout,
                                 args.shuffle_state,
                                 args.shuffle_p, pad_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                                 slot_id=tokenizer.convert_tokens_to_ids(['[SLOT]'])[0],
                                 decoder_teacher_forcing=args.decoder_teacher_forcing,
                                 use_full_slot=args.use_full_slot,
                                 use_dt_only=args.use_dt_only, no_dial=args.no_dial,
                                 use_cls_only=args.use_cls_only)

    print("# train examples %d" % len(train_data_raw))

    if not os.path.exists(dev_path):
        dev_data_raw = prepare_dataset(data_path=args.dev_data_path,
                                       tokenizer=tokenizer,
                                       slot_meta=slot_meta,
                                       n_history=args.n_history,
                                       max_seq_length=args.max_seq_length,
                                       op_code=args.op_code)
        torch.save(dev_data_raw,  dev_path)
    else:
        dev_data_raw = torch.load(dev_path)

    print("# dev examples %d" % len(dev_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob

    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                           op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets')

    state_dict = torch.load(args.bert_ckpt_path, map_location='cpu')
    _k = 'embeddings.token_type_embeddings.weight'
    print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format(
            type_vocab_size, state_dict[_k].shape[0]))
    state_dict[_k].resize_(
        type_vocab_size, state_dict[_k].shape[1])
    state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :])
    state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :])
    model.bert.load_state_dict(state_dict)
    print("\n### Done Load BERT")
    sys.stdout.flush()

    # re-initialize added special tokens ([SLOT], [NULL], [EOS])
    model.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)

    # re-initialize seg-2, seg-3
    model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
    model.to(device)

    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)

    if args.use_one_optim:
        print("### Use One Optim")
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.enc_lr)
        scheduler = WarmupLinearSchedule(optimizer, int(num_train_steps * args.enc_warmup),
                                             t_total=num_train_steps)
    else:
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        enc_param_optimizer = list(model.bert.named_parameters())  # TODO: For BERT only
        print('### Optim BERT: {:}'.format(len(enc_param_optimizer)))
        enc_optimizer_grouped_parameters = [
            {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

        enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
        enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup),
                                             t_total=num_train_steps)

        dec_param_optimizer = list(model.named_parameters())  # TODO:  For other parameters
        print('### Optim All: {:}'.format(len(dec_param_optimizer)))
        dec_param_optimizer = [p for (n, p) in dec_param_optimizer if 'bert' not in n]
        print('### Optim OTH: {:}'.format(len(dec_param_optimizer)))
        dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
        dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup),
                                             t_total=num_train_steps)

    if args.recover_e > 0:
        model_recover, enc_recover, dec_recover = load(args, str(args.recover_e))
        print("### Recover Model E{:}".format(args.recover_e))
        sys.stdout.flush()
        model.load_state_dict(model_recover)
        print("### Recover Optim E{:}".format(args.recover_e))
        sys.stdout.flush()
        enc_optimizer.load_state_dict(enc_recover)
        dec_optimizer.load_state_dict(dec_recover)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}

    start_time = time.time()

    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):

            batch = [
                b.to(device)
                if not isinstance(b, (int, dict, list, np.ndarray)) else b
                for b in batch
            ]

            input_ids_p, segment_ids_p, input_mask_p, \
            state_position_ids, op_ids, domain_ids, input_ids_g, segment_ids_g, position_ids_g, input_mask_g, \
            masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, n_total_pred = batch

            domain_scores, state_scores, loss_g = model(input_ids_p, segment_ids_p, input_mask_p, state_position_ids,
                input_ids_g, segment_ids_g, position_ids_g, input_mask_g,
                masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, only_pred_op=args.only_pred_op, n_gpu=n_gpu)

            if n_total_pred > 0:
                loss_g = loss_g.sum() / n_total_pred
            else:
                loss_g = 0

            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))

            if args.only_pred_op:
                loss = loss_s
            else:
                loss = loss_s + loss_g

            if args.exclude_domain is not True:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1))
                loss = loss + loss_d

            batch_loss.append(loss.item())

            loss.backward()

            if args.use_one_optim:
                optimizer.step()
                scheduler.step()
            else:
                enc_optimizer.step()
                enc_scheduler.step()
                dec_optimizer.step()
                dec_scheduler.step()

            model.zero_grad()

            if step % 100 == 0:
                try:
                    loss_g = loss_g.item()
                except AttributeError:
                    loss_g = loss_g

                if args.exclude_domain is not True:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g, loss_d.item()))
                else:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g))

                sys.stdout.flush()
                batch_loss = []

        if args.use_one_optim:
            save(args, epoch + 1, model, optimizer)
        else:
            save(args, epoch + 1, model, enc_optimizer, dec_optimizer)

        if ((epoch+1) % args.eval_epoch == 0) and (epoch+1 >= 8):
            eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code,
                                        use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
            print("### Epoch {:} Score : ".format(epoch+1), eval_res)

            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                print("### Best Joint Acc: {:} ###".format(best_score['joint_acc']))
                print('\n')

                if epoch+1 >= 8:  # To speed up
                    eval_res_test = model_evaluation(model, test_data_raw, tokenizer, slot_meta, epoch + 1, args.op_code,
                                                     use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
                    print("### Epoch {:} Test Score : ".format(epoch + 1), eval_res_test)
Example #29
0
def train(args, train_dataset, model, dev_dataset):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir=args.tensorboard_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in train_iterator:
        set_seed(
            args, epoch + 10
        )  # Added here for reproducibility (even between python 2 and 3)
        # logger.info("  seed = %d", torch.initial_seed())
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'entity_a':
                batch[1],
                'entity_b':
                batch[2],
                'attention_mask':
                batch[3],
                'token_type_ids':
                batch[4] if args.model_type in ['bert', 'xlnet'] else None,
                # XLM doesn't use segment_ids
                'labels':
                batch[5]
            }
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always a tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

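            # With fp16, backpropagate through amp's scaled loss and clip the
            # fp32 master parameters; otherwise clip the model parameters directly.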
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule after the optimizer step
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, eval_loss = evaluate(args, model, dev_dataset)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    logger.info(" global step = %d", global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
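
The training function above and the one in the next example build the same two AdamW parameter groups so that biases and LayerNorm weights are exempt from weight decay. A minimal, self-contained sketch of that grouping logic follows; TinyModel is a hypothetical stand-in for the real model, and torch.optim.AdamW stands in for the pytorch-transformers AdamW used in the examples.

import torch

class TinyModel(torch.nn.Module):
    # Hypothetical model: the attribute name `LayerNorm` mirrors the parameter
    # naming convention that the `no_decay` substring match relies on.
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(16, 16)
        self.LayerNorm = torch.nn.LayerNorm(16)

model = TinyModel()
no_decay = ['bias', 'LayerNorm.weight']
grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},   # only dense.weight ends up here
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},    # dense.bias, LayerNorm.weight, LayerNorm.bias
]
optimizer = torch.optim.AdamW(grouped_parameters, lr=5e-5, eps=1e-8)
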
Example #30
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    epochnum = 0
    best_val_acc = None
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        logging_steps = 0
        preds = None
        out_label_ids = None
        epochnum += 1
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else
                None,  # XLM and RoBERTa don't use segment_ids
                'labels':
                batch[3]
            }
            outputs = model(**inputs)
            loss, logits = outputs[:2]

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            tr_loss += loss.item()
            optimizer.step()
            scheduler.step()  # Update learning rate schedule after the optimizer step
            model.zero_grad()
            global_step += 1
            logging_steps += 1
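            # Accumulate logits and gold labels across the epoch so that
            # training accuracy can be computed after the epoch finishes.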
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)

        preds = np.argmax(preds, axis=1)
        tr_acc = (preds == out_label_ids).mean()
        # evaluate model
        eval_dataset = load_and_cache_examples(args,
                                               args.task_name,
                                               tokenizer,
                                               type='dev')
        eval_loss, eval_acc = evaluate(args, model, eval_dataset)
        if not best_val_acc or eval_acc > best_val_acc:
            if not os.path.exists(args.output_dir):
                os.makedirs(args.output_dir)
            model_to_save = model.module if hasattr(
                model, 'module'
            ) else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)
            torch.save(args, os.path.join(args.output_dir,
                                          'training_args.bin'))
            best_val_acc = eval_acc
        else:
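            # Dev accuracy did not improve: quarter the learning rate for all
            # parameter groups (a simple manual reduce-on-plateau heuristic).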
            args.learning_rate /= 4.0
            for g in optimizer.param_groups:
                g['lr'] = args.learning_rate

        print(
            'epoch {} | train_loss {:.6f} | train_acc {:.6f} | dev_loss {:.6f} | dev_acc {:.6f}'
            .format(epochnum, (tr_loss - logging_loss) / logging_steps, tr_acc,
                    eval_loss, eval_acc))
        logging_loss = tr_loss

    return global_step, tr_loss / global_step
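
Both train() functions above drive the learning rate with WarmupLinearSchedule(optimizer, warmup_steps=..., t_total=...). As a rough sketch (assuming the usual behavior of that schedule: linear warmup to the base learning rate over warmup_steps optimizer updates, then linear decay to zero at t_total), the multiplicative factor applied to the base learning rate at a given update step can be written as:

def warmup_linear_factor(step, warmup_steps, t_total):
    # Linear warmup from 0 to 1 over warmup_steps updates, then linear decay
    # back to 0 at t_total updates.
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return max(0.0, float(t_total - step) / float(max(1.0, t_total - warmup_steps)))

# e.g. with warmup_steps=100 and t_total=1000 the factor reaches 1.0 at
# step 100 and falls back to 0.0 at step 1000.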