Example #1
 def on_valid_end(self, eval_result, metric_key, optimizer, better_result):
     if better_result:
         eval_result = deepcopy(eval_result)
         eval_result['step'] = self.step
         eval_result['epoch'] = self.epoch
         fitlog.add_best_metric(eval_result)
     fitlog.add_metric(eval_result, step=self.step, epoch=self.epoch)
     if len(self.testers) > 0:
         for key, tester in self.testers.items():
             try:
                 eval_result = tester.test()
                 if self.verbose != 0:
                     self.pbar.write(
                         "FitlogCallback evaluation on {}:".format(key))
                     self.pbar.write(
                         tester._format_eval_results(eval_result))
                 fitlog.add_metric(eval_result,
                                   name=key,
                                   step=self.step,
                                   epoch=self.epoch)
                 if better_result:
                     fitlog.add_best_metric(eval_result, name=key)
             except Exception as e:
                 self.pbar.write(
                     "Exception happens when evaluate on DataSet named `{}`."
                     .format(key))
                 raise e
Example #2
 def on_valid_end(self, eval_result, metric_key, optimizer, better_result):
     if better_result:
         eval_result = deepcopy(eval_result)
         eval_result['step'] = self.step
         eval_result['epoch'] = self.epoch
         fitlog.add_best_metric(eval_result)
     fitlog.add_metric(eval_result, step=self.step, epoch=self.epoch)
     if better_result:
         for key, eval_result in self._save_metrics.items():
             fitlog.add_best_metric(eval_result, name=key)
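
The two callback snippets above only show the on_valid_end hook and assume fitlog has already been initialised elsewhere in the training script. Below is a minimal sketch of that surrounding workflow; the log directory, hyper-parameter values, and the synthetic loss/accuracy numbers are placeholders, not taken from the examples.

import fitlog

fitlog.set_log_dir("logs/")                       # log directory prepared with `fitlog init` (placeholder path)
fitlog.add_hyper({"lr": 1e-3, "batch_size": 32})  # hyper-parameters of this run (placeholder values)

best_acc = 0.0
for epoch in range(10):
    train_loss = 1.0 / (epoch + 1)                # stand-in for the real training loss
    acc = 0.5 + 0.04 * epoch                      # stand-in for the real validation accuracy
    fitlog.add_loss(value=train_loss, name="loss", step=epoch)
    fitlog.add_metric(acc, name="acc", step=epoch, epoch=epoch)
    if acc > best_acc:
        best_acc = acc
        fitlog.add_best_metric(best_acc, name="acc")
fitlog.finish()                                   # mark the run as finished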
Example #3
File: train.py Project: FFTYYY/Poem
def train(model , train_data , test_data):
	train_iter = DataSetIter(train_data , batch_size = C.batch_size)
	test_iter  = DataSetIter(test_data  , batch_size = C.batch_size)

	loss_func = nn.CrossEntropyLoss(ignore_index = 0)
	optim = tc.optim.Adam(params = model.parameters() , lr = C.lr , weight_decay = C.weight_decay)	
	scheduler = get_cosine_schedule_with_warmup(
		optim , 
		num_warmup_steps = C.warmup ,
		num_training_steps = train_iter.num_batches * C.epoch_number , 
	)

	best_test_loss 	= -1
	best_test_epoch = -1
	best_step 		= -1
	try:
		for epoch_n in range(C.epoch_number):
			tra_loss = run(model , train_iter , loss_func , epoch_n , optim , scheduler , True)
			tes_loss = run(model , test_iter , loss_func , epoch_n , None , None , False)

			logger.log ("Epoch %d ended. Train loss = %.4f , Valid loss = %.4f" % (
				epoch_n , tra_loss , tes_loss ,
			))
			fitlog.add_metric(
				tes_loss , 
				step = train_iter.num_batches * (epoch_n + 1) , 
				epoch = epoch_n , 
				name = "valid loss"
			)

			if best_test_epoch < 0 or tes_loss < best_test_loss:
				best_test_loss = tes_loss
				best_test_epoch = epoch_n
				best_step = fitlog_loss_step["train loss"]

				fitlog.add_best_metric(best_test_loss , name = "loss")
				with open(C.model_save , "wb") as fil: # temporarily save the best model so far
					pickle.dump(model , fil)
				fitlog.add_hyper(name = "best_step" , value =  "%d / %d" % (
					best_step ,
					train_iter.num_batches * C.epoch_number , 
				))

	except KeyboardInterrupt: # manual early stop
		pass

	logger.log ("Train end.")
	logger.log ("Got best valid loss %.4f in epoch %d" % (best_test_loss , best_test_epoch))

	return model
Example #4
def valid_one_epoch(fold,
                    epoch,
                    model,
                    criterion,
                    optimizer,
                    dataloader,
                    device,
                    scheduler=None):
    global min_loss, max_dice, save_dir
    model.eval()

    img_labels, img_preds = [], []
    total_loss, length = .0, 0

    pbar = tqdm(dataloader)
    for step, (imgs, labels) in enumerate(pbar):
        imgs = imgs.to(device)
        labels = labels.to(device)

        preds = model(imgs)
        img_preds.append(preds.sigmoid())
        img_labels.append(labels)

    preds = torch.cat(img_preds)
    labels = torch.cat(img_labels).type_as(preds)

    dice = calc_dice(preds, labels, args.thersh)
    fitlog.add_metric({"val": {f"fold_{fold}_dice": dice}}, step=epoch)
    if not save_dir:
        save_dir = fitlog.get_log_folder(absolute=True)  # save the model under the corresponding fitlog folder
    if dice > max_dice:
        max_dice = dice
        fitlog.add_best_metric({"val": {f"fold_{fold}_dice": max_dice}})
        torch.save(
            model.state_dict(),
            f'{save_dir}/{args.structure}_{args.encoder}_fold{fold}_best.pth')

    print(f'fold {fold} epoch {epoch}, valid dice {dice:.4f}')
Example #5
def valid_one_epoch(fold,
                    epoch,
                    model,
                    criterion,
                    optimizer,
                    dataloader,
                    device,
                    scheduler=None):
    global min_loss, max_acc, save_dir
    model.eval()

    img_labels, img_preds = [], []
    total_loss, length = .0, 0

    pbar = tqdm(dataloader)
    for step, (imgs, labels) in enumerate(pbar):
        imgs = imgs.to(device)
        labels = labels.to(device)

        preds = model(imgs)
        # img_preds.append(torch.argmax(preds, dim=1).detach().cpu().numpy())
        img_preds.append((preds.sigmoid() > 0.5).detach().cpu().numpy())
        img_labels.append(labels.detach().cpu().numpy())

    img_preds = np.concatenate(img_preds)
    img_labels = np.concatenate(img_labels)
    acc = (img_preds == img_labels).mean()
    fitlog.add_metric({"val": {f"fold_{fold}_acc": acc}}, step=epoch)
    save_dir = fitlog.get_log_folder(absolute=True)
    if acc > max_acc:
        max_acc = acc
        fitlog.add_best_metric({"val": {f"fold_{fold}_acc": max_acc}})
        torch.save(model.state_dict(),
                   f'{save_dir}/{args.model}_fold{fold}_best.pth')

    print(f'fold {fold} epoch {epoch}, valid acc {acc:.4f}')
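
Examples #4 and #5 pass nested dicts to fitlog.add_metric and fitlog.add_best_metric so that per-fold results are recorded under a shared "val" group, whereas other examples log a plain value together with name=. A small sketch contrasting the two forms, with made-up accuracy values and a placeholder log directory:

import fitlog

fitlog.set_log_dir("logs/")                       # placeholder log directory

# flat form: a single metric column named "acc"
fitlog.add_metric(0.91, name="acc", step=0)

# nested dict form: per-fold metrics grouped under "val"
for fold, acc in enumerate([0.90, 0.92]):
    fitlog.add_metric({"val": {f"fold_{fold}_acc": acc}}, step=0)
    fitlog.add_best_metric({"val": {f"fold_{fold}_acc": acc}})

fitlog.finish()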
Example #6
def valid_one_epoch(fold,
                    epoch,
                    model,
                    loss_fn,
                    val_loader,
                    device,
                    scheduler=None,
                    schd_loss_update=False):
    global max_acc, min_loss

    model.eval()

    t = time.time()
    loss_sum = 0
    sample_num = 0
    image_preds_all = []
    image_targets_all = []

    pbar = tqdm(enumerate(val_loader), total=len(val_loader))
    for step, (imgs, image_labels) in pbar:
        imgs = imgs.to(device).float()
        image_labels = image_labels.to(device).long()

        image_preds = model(imgs)  #output = model(input)
        #print(image_preds.shape, exam_pred.shape)
        image_preds_all += [
            torch.argmax(image_preds, 1).detach().cpu().numpy()
        ]
        image_targets_all += [image_labels.detach().cpu().numpy()]

        loss = loss_fn(image_preds, image_labels)

        loss_sum += loss.item() * image_labels.shape[0]
        sample_num += image_labels.shape[0]

        if ((step + 1) % args.verbose == 0) or ((step + 1) == len(val_loader)):
            description = f'epoch {epoch} loss: {loss_sum/sample_num:.4f}'
            pbar.set_description(description)

    image_preds_all = np.concatenate(image_preds_all)
    image_targets_all = np.concatenate(image_targets_all)

    acc = (image_preds_all == image_targets_all).mean()
    if loss_sum < min_loss:
        min_loss = loss_sum
        fitlog.add_best_metric({'loss': min_loss})
        fitlog.add_best_metric({'loss_epoch': epoch})
    if acc > max_acc:
        max_acc = acc
        fitlog.add_best_metric({'acc': max_acc})
        fitlog.add_best_metric({'acc_epoch': epoch})
        torch.save(model.state_dict(),
                   '{}/{}_fold_{}_best'.format(save_dir, args.model, fold))

    print('validation multi-class accuracy = {:.4f}'.format(
        (image_preds_all == image_targets_all).mean()))

    if scheduler is not None:
        if schd_loss_update:
            scheduler.step(loss_sum / sample_num)
        else:
            scheduler.step()
Example #7
            train_one_epoch(epoch,
                            model,
                            criterion,
                            optimizer,
                            train_loader,
                            device,
                            scheduler=scheduler,
                            schd_batch_update=False)

            with torch.no_grad():
                valid_one_epoch(fold,
                                epoch,
                                model,
                                criterion,
                                val_loader,
                                device,
                                scheduler=None,
                                schd_loss_update=False)

            torch.save(model.state_dict(),
                       '{}/{}_fold_{}_last'.format(save_dir, args.model, fold))

        del model, optimizer, train_loader, val_loader, scaler, scheduler
        torch.cuda.empty_cache()
        acc.append(max_acc)

    if len(acc) > 1:
        nfold = len(acc)
        fitlog.add_best_metric({str(nfold) + 'fold': np.mean(acc)})
    fitlog.finish()
Example #8
    def _train(self, criterion, optimizer, train_data_loader, val_data_loader,
               test_data_loader):
        fitlog.add_hyper({
            "model_name": self.opt.model_name,
            "dataset": self.opt.dataset,
            'resplit': self.opt.resplit,
            "domain": self.opt.domain,
            "aug": self.opt.aug,
            "adv": self.opt.adv,
            "aux": self.opt.aux,
            "adv_aux": self.opt.adv_aux,
            'chg': self.opt.chg
        })

        max_val_acc = 0
        max_val_f1 = 0
        global_step = 0
        last_model_path = None
        # model_path =None
        path = None

        pgd = PGD(self.model)
        k = 3
        for epoch in range(self.opt.num_epoch):
            logger.info('>' * 100)
            logger.info('epoch: {}'.format(epoch))
            n_correct, n_total, loss_total = 0, 0, 0
            # switch model to training mode
            self.model.train()
            for i_batch, sample_batched in enumerate(train_data_loader):
                global_step += 1
                # clear gradient accumulators
                optimizer.zero_grad()

                inputs = [
                    sample_batched[col].to(self.opt.device)
                    for col in self.opt.inputs_cols
                ]
                targets = sample_batched['polarity'].to(self.opt.device)

                if self.opt.model_name in reg_list:
                    aux_cls_logeits, outputs, reg_can_loss, reg_aux_loss, bert_word_output, reg_chg_loss = self.model(
                        inputs, None)
                else:
                    outputs = self.model(inputs)
                    reg_can_loss = 0
                    reg_aux_loss = 0
                    reg_chg_loss = 0
                # print('outputs',outputs.shape)
                # print('targets',targets.shape)

                # print(outputs,'outputs')
                # print(targets,'polarity')

                loss_1 = criterion(outputs, targets)
                loss_2 = reg_can_loss
                loss_3 = reg_aux_loss
                loss_4 = reg_chg_loss

                weighted_loss_2 = loss_2 * self.opt.can
                weighted_loss_3 = loss_3 * self.opt.aux
                weighted_loss_4 = loss_4 * self.opt.chg

                loss = 1 * loss_1 + weighted_loss_2 + weighted_loss_3 + weighted_loss_4

                if self.opt.adv > 0:
                    # print(inputs.shape)
                    if self.opt.adv_aux == 1:
                        loss_adv = self._loss_adv(weighted_loss_3,
                                                  bert_word_output,
                                                  criterion,
                                                  inputs,
                                                  targets,
                                                  p_mult=self.opt.adv)
                    else:
                        loss_adv = self._loss_adv(loss,
                                                  bert_word_output,
                                                  criterion,
                                                  inputs,
                                                  targets,
                                                  p_mult=self.opt.adv)
                    loss += loss_adv
                else:
                    loss_adv = 0
                loss.backward()

                # pgd.backup_grad()
                #     for t in range(K):
                #         pgd.attack(is_first_attack=(t==0)) # add adversarial perturbation on the embedding; back up param.data on the first attack
                #         if t != K-1:
                #             model.zero_grad()
                #         else:
                #             pgd.restore_grad()
                #         loss_adv = model(batch_input, batch_label)
                #         loss_adv.backward() # back-propagate, accumulating the adversarial gradient on top of the normal gradient
                #     pgd.restore() # restore the embedding parameters

                optimizer.step()

                n_correct += (torch.argmax(outputs,
                                           -1) == targets).sum().item()
                # print(outputs.shape)
                # n_correct += (torch.argmax(aux_cls_logeits, -1) == 4*targets).sum().item()
                n_total += len(outputs)
                loss_total += loss.item() * len(outputs)
                if global_step % self.opt.log_step == 0:
                    train_acc = n_correct / n_total
                    train_loss = loss_total / n_total
                    logger.info(
                        'loss_total: {:.4f}, acc: {:.4f},loss_main: {:.4f},reg_can_loss: {:.4f},loss_adv: {:.4f},reg_aux_loss {:.4f},reg_chg_loss {:.4f}'
                        .format(train_loss, train_acc, loss_1, weighted_loss_2,
                                loss_adv, weighted_loss_3, weighted_loss_4))
                    fitlog.add_metric(
                        {
                            "Train": {
                                'loss_total: {:.4f}, acc: {:.4f},loss_main: {:.4f},reg_can_loss: {:.4f},loss_adv: {:.4f},reg_aux_loss {:.4f},reg_chg_loss {:.4f}'
                                .format(train_loss, train_acc, loss_1,
                                        weighted_loss_2, loss_adv,
                                        weighted_loss_3, weighted_loss_4)
                            }
                        },
                        step=global_step)
            val_acc, val_f1 = self._evaluate_acc_f1(val_data_loader)
            test_acc, test_f1 = self._evaluate_acc_f1(test_data_loader)

            logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format(
                val_acc, val_f1))
            logger.info('> test_acc: {:.4f}, test_f1: {:.4f}'.format(
                test_acc, test_f1))

            if val_acc > max_val_acc:
                max_val_acc = val_acc
                if not os.path.exists('state_dict'):
                    os.mkdir('state_dict')
                model_path = 'state_dict/{0}_{1}_domain-{2}_can{3}_aug{4}_adv{5}_aux{6}_val_acc{7}_resplit{8}'.format(
                    self.opt.model_name, self.opt.dataset, self.opt.domain,
                    self.opt.can, self.opt.aug, self.opt.adv, self.opt.aux,
                    round(val_acc, 4), self.opt.resplit)
                bert_path = 'state_dict/{0}_{1}_domain-{2}_can{3}_aug{4}_adv{5}_aux{6}_val_acc{7}_resplit{8}_bert'.format(
                    self.opt.model_name, self.opt.dataset, self.opt.domain,
                    self.opt.can, self.opt.aug, self.opt.adv, self.opt.aux,
                    round(val_acc, 4), self.opt.resplit)

                # fitlog.add_hyper({"model_name":self.opt.model_name,"dataset":self.opt.dataset,'resplit':self.opt.resplit,"domain":self.opt.domain,"aug":self.opt.aug,"adv":self.opt.adv,"aux":self.opt.aux})

                fitlog.add_metric(
                    {"val": {
                        "val_acc": val_acc,
                        "val_f1": val_f1
                    }},
                    step=global_step)
                fitlog.add_metric(
                    {"test": {
                        "test_acc": test_acc,
                        "test_f1": test_f1
                    }},
                    step=global_step)

                fitlog.add_best_metric(
                    {"val": {
                        "val_acc": val_acc,
                        "val_f1": val_f1
                    }})
                fitlog.add_best_metric(
                    {"test": {
                        "test_acc": test_acc,
                        "test_f1": test_f1
                    }})

                if last_model_path != None:
                    os.remove(last_model_path)
                    if self.opt.model_name not in ['lcf_bert']:
                        os.remove(last_bert_path)
                last_model_path = model_path
                last_bert_path = bert_path
                torch.save(self.model.state_dict(), model_path)
                if self.opt.model_name not in ['lcf_bert']:
                    torch.save(self.model.bert.state_dict(), bert_path)
                logger.info('>> saved: {}'.format(model_path))

                # max_val_f1 = val_f1
            if val_f1 > max_val_f1:
                max_val_f1 = val_f1
                # fitlog.add_metric(acc,name="Acc",step=step)

        return model_path
Example #9
    def fit(self):
        last_miou = .0  # record the best validation mIoU
        loss_step = 0  # step count
        for epoch in range(self.conf.epochs):
            train_loss = .0
            start = time.time()
            for i, (data, target) in enumerate(self.train_iter):
                gpu_datas = split_and_load(data, ctx_list=self.ctx)
                gpu_targets = split_and_load(target, ctx_list=self.ctx)
                with autograd.record():
                    loss_gpu = [
                        self.criterion(*self.net(gpu_data), gpu_target)
                        for gpu_data, gpu_target in zip(
                            gpu_datas, gpu_targets)
                    ]
                for loss in loss_gpu:
                    autograd.backward(loss)
                self.trainer.step(self.conf.bs_train)
                nd.waitall()
                loss_temp = .0
                for losses in loss_gpu:
                    loss_temp += losses.sum().asscalar()
                train_loss += (loss_temp / self.conf.bs_train)
                # log every n batch
                # add loss to draw curve, train_loss <class numpy.float64>
                interval = 5 if loss_step < 5000 else 50
                if (i % interval == 0) or (i + 1 == len(self.train_iter)):
                    fitlog.add_loss(name='loss',
                                    value=round(train_loss / (i + 1), 5),
                                    step=loss_step)
                    loss_step += 1
                    self.logger.info(
                        "Epoch %d, batch %d, training loss %.5f." %
                        (epoch, i, train_loss / (i + 1)))
            # log each epoch
            self.logger.info(
                ">>>>>> Epoch %d complete, time cost: %.1f sec. <<<<<<" %
                (epoch, time.time() - start))
            # validation each epoch
            if self.val:
                pixel_acc, mean_iou = self._validation()
                self.logger.info(
                    "Epoch %d validation, PixelAccuracy: %.4f, mIoU: %.4f." %
                    (epoch, pixel_acc, mean_iou))
                fitlog.add_metric(value=mean_iou, step=epoch, name='mIoU')
                fitlog.add_metric(value=pixel_acc, step=epoch, name='PA')
                if mean_iou > last_miou:
                    f_name = self._save_model(tag='best')
                    self.logger.info(
                        "Epoch %d mIoU: %.4f > %.4f(previous), save model: %s"
                        % (epoch, mean_iou, last_miou, f_name))
                    last_miou = mean_iou

        # save the final-epoch params
        f_name = self._save_model(tag='last')
        self.logger.info(">>>>>> Training complete, save model: %s. <<<<<<" %
                         f_name)
        # record
        fitlog.add_best_metric(value=round(last_miou, 4), name='mIoU')
        fitlog.add_other(value=self.id, name='record_id')
        fitlog.add_other(value=self.num_train, name='train')
        fitlog.add_other(value=self.num_val, name='val')
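
Besides metrics, Example #9 stores run metadata with fitlog.add_other and logs the smoothed training loss with fitlog.add_loss. A brief sketch of just those two calls, with placeholder values and a placeholder log directory:

import fitlog

fitlog.set_log_dir("logs/")                           # placeholder log directory

fitlog.add_loss(value=0.35421, name="loss", step=0)   # smoothed training loss at a given step
fitlog.add_other(value="exp-001", name="record_id")   # free-form run metadata (placeholder id)
fitlog.add_other(value=2913, name="train")            # e.g. number of training samples
fitlog.add_other(value=500, name="val")               # e.g. number of validation samples

fitlog.finish()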
Example #10
    def _train(self, criterion, optimizer):
        max_test_acc = 0
        max_test_f1 = 0
        global_step = 0
        continue_not_increase = 0
        for epoch in range(self.opt.num_epoch):
            print(">" * 100)
            print("epoch: ", epoch)
            n_correct, n_total = 0, 0
            increase_flag = False
            for i_batch, sample_batched in enumerate(self.train_data_loader):
                global_step += 1

                # switch model to training mode, clear gradient accumulators
                self.model.train()
                optimizer.zero_grad()

                inputs = [
                    sample_batched[col].to(self.opt.device)
                    for col in self.opt.inputs_cols
                ]
                targets = sample_batched["polarity"].to(self.opt.device)

                outputs = self.model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                if global_step % self.opt.log_step == 0:
                    n_correct += (torch.argmax(outputs,
                                               -1) == targets).sum().item()
                    n_total += len(outputs)
                    train_acc = n_correct / n_total

                    test_acc, test_f1 = self._evaluate_acc_f1()
                    ################fitlog code####################
                    fitlog.add_metric(test_acc, name="acc", step=global_step)
                    fitlog.add_metric(test_f1, name="f1", step=global_step)
                    ################fitlog code####################
                    if test_acc > max_test_acc:
                        increase_flag = True
                        fitlog.add_best_metric(test_acc, "acc")
                        max_test_acc = test_acc
                    if test_f1 > max_test_f1:
                        increase_flag = True
                        max_test_f1 = test_f1
                        fitlog.add_best_metric(max_test_f1, "f1")
                        if self.opt.save and test_f1 > self.global_f1:
                            self.global_f1 = test_f1
                            torch.save(
                                self.model.state_dict(),
                                "state_dict/" + self.opt.model_name + "_" +
                                self.opt.dataset + ".pkl",
                            )
                            print(">>> best model saved.")
                    print(
                        "loss: {:.4f}, acc: {:.4f}, test_acc: {:.4f}, test_f1: {:.4f}"
                        .format(loss.item(), train_acc, test_acc, test_f1))
            if increase_flag == False:
                if continue_not_increase >= self.opt.early_stop:
                    print("early stop.")
                    break
                continue_not_increase += 1
            else:
                continue_not_increase = 0
        return max_test_acc, max_test_f1
Example #11
def train(args, train_dataset, model, test_dataset):
    '''Train the model'''
    tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size
    train_sampler = RandomSampler(train_dataset)
    collate_fn = get_collate_fn(args)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    
    if args.embedding_type in ('bert', 'roberta'):
        optimizer = get_bert_optimizer(args, model)
    else:
        parameters = filter(lambda param: param.requires_grad, model.parameters())
        optimizer = torch.optim.Adam(parameters, lr=args.learning_rate)
        # optimizer = torch.optim.SGD(parameters, lr=args.learning_rate, momentum=0.9)

    # Train
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    print("Total steps:", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    all_eval_results = []
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    best_acc = 0
    best_f1 = 0
    # results, eval_loss = evaluate(args, test_dataset, model)
    with tqdm(total=args.num_train_epochs, desc='Epoch') as pbar:
    # for _ in train_iterator:
        for _ in range(int(args.num_train_epochs)):
            pbar.update()
        # epoch_iterator = tqdm(train_dataloader, desc='Iteration')
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(args.device) for t in batch)

                inputs, labels = get_input_from_batch(args, batch)
                logit = model(**inputs)
                loss = F.cross_entropy(logit, labels)

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.max_grad_norm)

                tr_loss += loss.item()
                # if (step + 1) % args.gradient_accumulation_steps == 0:
                    # scheduler.step()  # Update learning rate schedule
                optimizer.step()
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    results, eval_loss = evaluate(args, test_dataset, model)

                    all_eval_results.append(results)
                    if results['acc']>best_acc:
                        best_acc = results['acc']
                        best_f1 = results['f1']
                        pbar.write(f"Step:{global_step} acc:{round(best_acc, 4)}, f1:{round(best_f1, 4)}")
                        fitlog.add_best_metric({'acc':best_acc, 'f1':best_f1, 'step':global_step})

                    fitlog.add_metric(name='f1', value=results['f1'], step=global_step)
                    fitlog.add_metric(name='acc', value=results['acc'], step=global_step)
                    # for key, value in results.items():
                    #     tb_writer.add_scalar(
                    #         'eval_{}'.format(key), value, global_step)
                    # tb_writer.add_scalar('eval_loss', eval_loss, global_step)
                    # # tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    # tb_writer.add_scalar(
                    #     'train_loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    # logging_loss = tr_loss

                    # Save model checkpoint

                if args.max_steps > 0 and global_step > args.max_steps:
                    break  # the tqdm epoch_iterator was replaced by pbar above, so simply break
            if args.max_steps > 0 and global_step > args.max_steps:
                break

    tb_writer.close()
    return global_step, tr_loss/global_step, all_eval_results
Example #12
def main():
    args = set_config()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Prepare model
    # encoder = BertForQuestionAnswering.from_pretrained(args.bert_model)
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #
    # encoder.to(device)
    # encoder.eval()
    # #freeze bert
    # # for name, param in model.named_parameters():
    # #     if "bert" in name:
    # #         param.requires_grad = False
    #
    # model = GraphFusionNet(args)
    model = DFGN_Roberta.from_pretrained(r'E:\DATA\bert_pretrained\roberta-large', graph_config=args)
    model.to(device)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    global_step = 0

    if args.do_train:
        # load train data
        train_examples, train_features, train_graph = get_train_feature(args, args.do_train, tokenizer)
        train_examples_dict = example_dict(train_examples)
        train_data = DataIteratorPack(train_features, train_examples_dict, train_graph, args.train_batch_size, device,
                                      sent_limit=25, entity_limit=80,
                                      n_layers=args.n_layers, sequential=False)
        # (features, example_dict, graph_dict, bsz, device, sent_limit, entity_limit, n_layers = 2,
        # entity_type_dict = None, sequential = False,)
        # load dev data
        eval_examples, eval_features, eval_graph = get_train_feature(args, not args.do_train, tokenizer)
        eval_examples_dict = example_dict(eval_examples)
        eval_data = DataIteratorPack(eval_features, eval_examples_dict, eval_graph, args.predict_batch_size, device,
                                     sent_limit=25, entity_limit=80,
                                     n_layers=args.n_layers, sequential=False)
        with open(args.predict_file) as f:
            gold = json.load(f)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        cur_patience = 0
        VERBOSE_STEP = 100
        best_dev_F1 = None
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            # model.train()
            model.train()

            total_train_loss = [0] * 5

            for step, batch in enumerate(train_data):
                # batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering it-self
                input_ids = batch["context_idxs"]
                input_mask = batch["context_mask"]
                segment_ids = batch["segment_idxs"]

                start, end, sp, Type, softmask, ent, yp1, yp2 = model(input_ids, segment_ids, input_mask, batch=batch,
                                                                      return_yp=True, is_train=True)
                loss_list = compute_loss(batch, start, end, sp, Type, softmask, args)

                if args.gradient_accumulation_steps > 1:
                    loss_list = loss_list / args.gradient_accumulation_steps

                loss_list[0].backward()

                if (global_step + 1) % args.grad_accumulate_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()

                global_step += 1

                for i, l in enumerate(loss_list):
                    if not isinstance(l, int):
                        total_train_loss[i] += l.item()

                if global_step % VERBOSE_STEP == 0:
                    print("-- In Epoch{}: ".format(epoch))
                    for i, l in enumerate(total_train_loss):
                        print("Avg-LOSS{}/batch/step: {}".format(i, l / VERBOSE_STEP))
                    total_train_loss = [0] * 5

            train_data.refresh()
            if args.do_predict:

                eval_examples_dict = example_dict(eval_examples)
                eval_features_dict = example_dict(eval_features)

                logger.info("***** Running predictions *****")
                logger.info("  Num split examples = %d", len(eval_features))
                logger.info("  Batch size = %d", args.predict_batch_size)

                model.eval()
                all_results = []
                answer_dict = {}
                sp_dict = {}
                total_test_loss = [0] * 5
                logger.info("Start evaluating")
                for step, batch in enumerate(eval_data):
                    # batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering it-self
                    input_ids = batch["context_idxs"]
                    input_mask = batch["context_mask"]
                    segment_ids = batch["segment_idxs"]

                    if len(sp_dict) % 1000 == 0:
                        logger.info("Processing example: %d" % (len(all_results)))

                    with torch.no_grad():
                        start, end, sp, Type, softmask, ent, yp1, yp2 = model(input_ids, segment_ids, input_mask,
                                                                              batch=batch, return_yp=True)
                        # context_encoding = encoder(input_ids, segment_ids, input_mask)
                        #
                        # # loss_list = model(context_encoding, batch=batch)
                        # start, end, sp, Type, softmask, ent, yp1, yp2 = model(context_encoding, batch=batch,
                        #                                                       return_yp=True)
                        loss_list = compute_loss(batch, start, end, sp, Type, softmask, args)
                        Type = Type.argmax(dim=1)

                        # batch_start_logits, batch_end_logits, batch_types, sp = model(input_ids, segment_ids, input_mask, batch=batch)
                    for i, l in enumerate(loss_list):
                        if not isinstance(l, int):
                            total_test_loss[i] += l.item()

                    answer_dict_ = convert_to_tokens(eval_examples_dict, eval_features_dict, batch['ids'],
                                                     yp1.data.cpu().numpy().tolist(),
                                                     yp2.data.cpu().numpy().tolist(),
                                                     Type.cpu().numpy())

                    answer_dict.update(answer_dict_)
                    predict_support_np = torch.sigmoid(sp[:, :, 1]).data.cpu().numpy()
                    for i in range(predict_support_np.shape[0]):
                        cur_sp_pred = []
                        cur_id = batch['ids'][i]
                        for j in range(predict_support_np.shape[1]):
                            if j >= len(eval_examples_dict[cur_id].sent_names):
                                break
                            if predict_support_np[i, j] > args.sp_threshold:
                                cur_sp_pred.append(eval_examples_dict[cur_id].sent_names[j])
                        sp_dict.update({cur_id: cur_sp_pred})

                # for i, l in enumerate(total_train_loss):
                #     print("Avg-LOSS{}/batch/step: {}".format(i, l / len(eval_features)))

                prediction = {'answer': answer_dict, 'sp': sp_dict}
                output_answer_sp_file = os.path.join(args.output_dir, "predictions_answer_sp_{}.json".format(epoch))
                with open(output_answer_sp_file, 'w') as f:
                    json.dump(prediction, f)

                # record results
                metrics = eval(prediction, gold)
                for i, l in enumerate(total_train_loss):
                    metrics["LOSS{}".format(i)] = l / len(eval_features)
                    print("Avg-LOSS{}/batch/step: {}".format(i, l / len(eval_features)))

                fitlog.add_best_metric({"Test": metrics})

                metrics = evaluate(eval_examples_dict, answer_dict)
                print('hotpotqa | EM {:.4f} | F1 {:.4f}'.format(metrics['exact_match'], metrics['f1']))
                eval_data.refresh()

                dev_F1 = metrics['f1']
                if best_dev_F1 is None or dev_F1 > best_dev_F1:
                    best_dev_F1 = dev_F1
                    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

                    logger.info("model save in %s" % output_model_file)
                    # model_to_save.save_pretrained(output_model_file)
                    # tokenizer.save_pretrained(args.output_dir)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    cur_patience = 0

                    # model = AlbertForQuestionAnswering.from_pretrained(args.output_dir, force_download=True)
                    # # tokenizer = AlbertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
                    # model.to(device)
                else:
                    cur_patience += 1
                    if cur_patience >= 3:
                        # for param_group in optimizer.param_groups:
                        #    param_group['lr'] /= 2.0
                        # if param_group['lr'] < 1e-8:
                        #    stop_train = True
                        break
Example #13
     batch_size = score.size(0)
     embedding_size = 300
     score = score.cuda()
     data = data.cuda()
     predict = model(data)
     # print(predict.shape)
     loss = loss_function(predict, score)
     losss2.append(loss.item())
     acc2 += (score == torch.argmax(predict, -1)).cpu().sum().item()
     total2 += score.size(0)
 print("dev epoch %s accuracy is %s loss is %s " %
       (str(i), str(acc2 / total2), str(np.mean(losss2))))
 fitlog.add_metric(acc2 / total2, name="test_acc", step=i)
 if acc2 / total2 > best_acc:
     best_acc = acc2 / total2
     fitlog.add_best_metric({"dev": best_acc})
 losss2 = []
 acc2 = 0
 total2 = 0
 for num, (score, data) in tqdm(enumerate(test_loader)):
     batch_size = score.size(0)
     embedding_size = 300
     score = score.cuda()
     data = data.cuda()
     predict = model(data)
     loss = loss_function(predict, score)
     losss2.append(loss.item())
     acc2 += (score == torch.argmax(predict, -1)).cpu().sum().item()
     total2 += score.size(0)
     # for each in zip(score,predict,data):
     #     file.write(str(each)+"\n")
Example #14
                                                T_0=epochs,
                                                T_mult=1,
                                                eta_min=1e-6)
        scaler = GradScaler()
        for epoch in range(epochs):
            train_one_epoch(fold,
                            epoch,
                            model,
                            criterion,
                            optimizer,
                            trn_loader,
                            device,
                            scheduler=scheduler)
            valid_one_epoch(fold,
                            epoch,
                            model,
                            criterion,
                            optimizer,
                            val_loader,
                            device,
                            scheduler=None)
        print(f'fold {fold} max acc {max_acc}')
        acc.append(max_acc)

        if not args.nfold:
            break

    if args.nfold:
        fitlog.add_best_metric({"val": {"mean_acc": np.mean(acc)}})
    fitlog.finish()
Example #15
def main():
    # ====== preprocess ====== #
    args = preprocess()

    # ====== Loading dataset ====== #
    train_data, dev_data, test_data, joint_vocabs, parsing_vocabs = load_data(
        args.joint_input, args.parsing_input, args.batch_size,
        args.accum_steps, args.shuffle, args.num_workers, args.drop_last)
    # cross_labels_idx = generate_cross_labels_idx(vocabs['labels'])

    # ======= Preparing Model ======= #
    print("\nModel Preparing starts...")
    model = JointEncoderModel(
        joint_vocabs,
        parsing_vocabs,
        # cross_labels_idx,
        # Embedding
        args.subword,
        args.use_pos_tag,
        args.bert_path,
        args.transliterate,
        args.d_model,
        args.partition,
        args.pos_tag_emb_dropout,
        args.position_emb_dropout,
        args.bert_emb_dropout,
        args.emb_dropout,
        # Encoder
        args.layer_num,
        args.hidden_dropout,
        args.attention_dropout,
        args.dim_ff,
        args.nhead,
        args.kqv_dim,
        # classifier
        args.label_hidden,
        # loss
        args.lambda_scaler,
        args.alpha_scaler,
        args.language,
        args.device).cuda()
    # print(model, end='\n\n\n')
    optimizer = Optim(model, args.optim, args.lr, args.lr_fine_tune,
                      args.warmup_steps, args.lr_decay_factor,
                      args.weight_decay, args.clip_grad,
                      args.clip_grad_max_norm)
    optimizer.zero_grad()
    # if args.freeze_bert:
    #     optimizer.set_freeze_by_idxs([str(num) for num in range(0, config.freeze_bert_layers)], True)
    #     optimizer.free_embeddings()
    #     optimizer.freeze_pooler()
    #     print('freeze model of BERT %d layers' % config.freeze_bert_layers)

    # ========= Training ========= #
    print('Training starts...')
    start = time.time()
    steps, loss_value, total_batch_size = 1, 0., 0
    best_dev, best_test = None, None
    patience = args.patience
    for epoch_i in range(1, args.epoch):
        for batch_i, insts in enumerate(train_data, start=1):
            model.train()

            insts, batch_size, max_len = batch_filter(
                insts, args.language, args.DATASET_MAX_SNT_LENGTH)
            insts_list = batch_spliter(insts, max_len,
                                       args.BATCH_MAX_SNT_LENGTH)
            total_batch_size += batch_size
            for insts in insts_list:
                loss = model(insts)
                if loss.item() > 0.:
                    loss.backward()
                    loss_value += loss.item()
                    assert not isinstance(loss_value,
                                          torch.Tensor), 'GPU memory leak'

            if batch_i == args.accum_steps and not args.debug:
                args.visual_logger.visual_histogram(model,
                                                    steps // args.accum_steps)
            if steps % args.accum_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            if steps % (args.accum_steps * args.log_interval) == 0:
                print('[%d/%d], [%d/%d] Loss: %.05f' %
                      (epoch_i, args.epoch, batch_i // args.accum_steps,
                       len(train_data) // args.accum_steps,
                       loss_value / total_batch_size),
                      flush=True)
                visual_dic = {
                    'loss/train': loss_value,
                    'lr': optimizer.get_lr()[0]
                }
                if args.clip_grad:
                    visual_dic['norm'] = optimizer.get_dynamic_gard_norm()
                if not args.debug:
                    args.visual_logger.visual_scalars(
                        visual_dic, steps // args.accum_steps)
                loss_value, total_batch_size = 0., 0
                torch.cuda.empty_cache()
            if steps % (args.accum_steps * args.eval_interval) == 0:
                print('model evaluating starts...', flush=True)
                joint_fscore_dev, res_data_dev = eval_model(
                    model, dev_data, args.language,
                    args.DATASET_MAX_SNT_LENGTH, args.BATCH_MAX_SNT_LENGTH,
                    args.evalb_path, 'dev')
                joint_fscore_test, res_data_test = eval_model(
                    model, test_data, args.language,
                    args.DATASET_MAX_SNT_LENGTH, args.BATCH_MAX_SNT_LENGTH,
                    args.evalb_path, 'test')
                visual_dic = {
                    'F/parsing_dev': joint_fscore_dev.parsing_f,
                    'F/parsing_test': joint_fscore_test.parsing_f,
                    'F/ner_dev': joint_fscore_dev.ner_f,
                    'F/ner_test': joint_fscore_test.ner_f
                }
                if not args.debug:
                    args.visual_logger.visual_scalars(
                        visual_dic, steps // args.accum_steps)
                if best_dev is None or joint_fscore_dev.parsing_f > best_dev.parsing_f:
                    best_dev, best_test = joint_fscore_dev, joint_fscore_test
                    fitlog.add_best_metric({
                        'parsing_f_dev': best_dev.parsing_f,
                        'ner_f_test': best_test.ner_f
                    })
                    patience = args.patience
                    write_joint_data(args.save_path, res_data_dev, 'dev')
                    write_joint_data(args.save_path, res_data_test, 'test')
                    if args.save:
                        torch.save(
                            model.pack_state_dict(),
                            os.path.join(args.save_path,
                                         args.name + '.best.model.pt'))
                print('best performance:\ndev: %s\ntest: %s' %
                      (best_dev, best_test))
                print('model evaluating ends...', flush=True)
                del res_data_dev, res_data_test
                if args.debug:
                    exit(0)
            steps += 1

        if args.early_stop:
            patience -= 1
            if patience < 0:
                print('early stop')
                break

    # ====== postprocess ====== #
    postprocess(args, start)
Example #16
    return data


def run(path, force_reprocess, name="data.pkl"):

    if (not force_reprocess) and os.path.exists(name):
        with open(name, "rb") as fil:
            ret = pickle.load(fil)
    else:
        train_data = load(path)
        train_data = indexize(train_data)

        ret = vocab, train_data

    #pdb.set_trace()

    logger.log("vocab len:", len(ret[0]))
    logger.log(" data len:", len(ret[1]))

    with open("data.pkl", "wb") as fil:
        pickle.dump(ret, fil)
    return ret


if __name__ == "__main__":
    run(C.data_path, C.force_reprocess)

    fitlog.add_best_metric(2333, "test")
    fitlog.finish()
Example #17
                                                T_0=epochs,
                                                T_mult=1,
                                                eta_min=1e-6)
        scaler = GradScaler()
        for epoch in range(epochs):
            train_one_epoch(fold,
                            epoch,
                            model,
                            criterion,
                            optimizer,
                            trn_loader,
                            device,
                            scheduler=scheduler)
            valid_one_epoch(fold,
                            epoch,
                            model,
                            criterion,
                            optimizer,
                            val_loader,
                            device,
                            scheduler=None)
        print(f'fold {fold} max dice {max_dice}')
        dice.append(max_dice.cpu().numpy())

        if not args.nfold:
            break

    if args.nfold:
        fitlog.add_best_metric({"val": {"mean_dice": np.mean(dice)}})
    fitlog.finish()