def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()

    # initialize EMA (exponential moving average of the model weights)
    ema = EMA(model, 0.999)
    ema.register()

    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last improvement in dev loss
    flag = False  # set when there has been no improvement for too long
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            ema.update()         # after each optimizer step, update the EMA shadow weights
            if total_batch % 100 == 0:
                # periodically report performance on the training and dev sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                ema.apply_shadow()       # apply the shadow weights before evaluating and saving the model
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                ema.restore()  # restore the original weights before the next training step
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not fallen for more than require_improvement batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
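
Example 1 (and Example 7 below) relies on an EMA helper whose implementation is not shown. A minimal sketch of an exponential-moving-average class matching the register / update / apply_shadow / restore calls above; this is an assumption about the helper, not the original code:

class EMA:
    """Sketch: exponential moving average of model parameters."""

    def __init__(self, model, decay):
        self.model = model
        self.decay = decay
        self.shadow = {}   # EMA copies of the parameters
        self.backup = {}   # original parameters while shadow weights are applied

    def register(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                new_avg = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
                self.shadow[name] = new_avg.clone()

    def apply_shadow(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data
                param.data = self.shadow[name]

    def restore(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                param.data = self.backup[name]
        self.backup = {}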
Example 2
def train(config, model, train_iter, dev_iter, test_iter):
    param_optimizer = list(model.named_parameters())
    # parameters exempt from weight decay: bias and LayerNorm (add & norm) parameters of the transformer encoder
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        # decayed
        {'params':[p for n,p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay':0.01},
        # not decayed
        {'params':[p for n,p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
    ]

    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)

    total_batch = 0
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last improvement in dev loss
    flag = False      # set when there has been no improvement for too long
    model.train()

    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch+1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            if total_batch % 100 == 0:
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2%}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, {5}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, improve))
                model.train()
            total_batch = total_batch + 1
            if total_batch - last_improve > config.require_improvement:
                print('No improvement on the dev set for a long time, auto-stopping training')
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
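
BertAdam comes from the legacy pytorch_pretrained_bert package and bakes warmup into the optimizer. With the newer transformers library, roughly the same schedule (5% linear warmup followed by linear decay over all training steps) can be sketched with torch.optim.AdamW plus get_linear_schedule_with_warmup, reusing the names from the example above; note that AdamW, unlike BertAdam, does not clip gradients internally:

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

num_training_steps = len(train_iter) * config.num_epochs
optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.05 * num_training_steps),  # mirrors warmup=0.05 above
    num_training_steps=num_training_steps)

# inside the training loop, after loss.backward():
#     optimizer.step(); scheduler.step(); model.zero_grad()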
Example 3
    def train(self, config, model, train_iter, dev_iter, test_iter):
        start_time = time.time()
        model.train()
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
        # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config.learning_rate,
                             warmup=0.05,
                             t_total=len(train_iter) * config.num_epochs)
        total_batch = 0  # number of batches processed so far
        dev_best_loss = float('inf')
        last_improve = 0  # batch index of the last improvement in dev loss
        flag = False  # set when there has been no improvement for too long
        model.train()
        for epoch in range(config.num_epochs):
            print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
            for i, (trains, labels) in enumerate(train_iter):
                predicts, predicts_list = model(trains, labels, mode="train")
                losses = list(map(self._get_loss, predicts, labels))
                loss = torch.mean(torch.stack(losses))
                model.zero_grad()

                #loss = F.cross_entropy(losses, labels)
                loss.backward()
                optimizer.step()
                if total_batch % 1000 == 0:
                    dev_acc_lev1, dev_acc_lev2, dev_loss = self.evaluate(config, model, dev_iter)
                    if dev_loss < dev_best_loss:
                        dev_best_loss = dev_loss
                        torch.save(model.state_dict(), config.save_path)
                        improve = '*'
                        last_improve = total_batch
                    else:
                        improve = ''
                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Val Loss: {2:>5.2},  Val_lev1 Acc: {3:>6.2%},  Val_lev2 Acc: {4:>6.2%},  Time: {5} {6}'
                    print(msg.format(total_batch, loss.item(), dev_loss, dev_acc_lev1, dev_acc_lev2, time_dif, improve))
                    model.train()
                    if total_batch > 10 and total_batch % 90000 == 0:
                        self.test(config, model, train_iter)
                total_batch += 1
                if total_batch - last_improve > config.require_improvement:
                    # dev loss has not fallen for more than require_improvement batches; stop training
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break
            if flag:
                break
        self.test(config, model, test_iter)
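
Example 3 averages one loss per prediction level through a self._get_loss helper that is not shown. A small self-contained illustration of that torch.stack / torch.mean pattern; the per-level cross-entropy and the combine_level_losses name are assumptions made only for this demo:

import torch
import torch.nn.functional as F

def combine_level_losses(level_logits, level_labels):
    # one cross-entropy term per prediction level, averaged into a single scalar
    losses = [F.cross_entropy(logits, labels)
              for logits, labels in zip(level_logits, level_labels)]
    return torch.mean(torch.stack(losses))

# toy usage: two levels, batch of 4, with 3 and 5 classes respectively
logits = [torch.randn(4, 3, requires_grad=True), torch.randn(4, 5, requires_grad=True)]
labels = [torch.randint(0, 3, (4,)), torch.randint(0, 5, (4,))]
loss = combine_level_losses(logits, labels)
loss.backward()  # in a real model this backpropagates through both heads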
Example 4
def train(config, model, train_iter, dev_iter, test_iter):
    """
    Model training routine.
    :param config:
    :param model:
    :param train_iter:
    :param dev_iter:
    :param test_iter:
    :return:
    """
    start_time = time.time()
    # switch to training mode (enables BatchNormalization and dropout)
    model.train()
    # collect all of the model's named parameters
    param_optimizer = list(model.named_parameters())
    # parameters exempt from weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)

    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')  # best loss seen on the dev set
    last_improve = 0  # batch index of the last improvement in dev loss
    flag = False  # set when there has been no improvement for too long; training stops
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward(retain_graph=False)
            optimizer.step()
            if total_batch % 100 == 0:  # periodically report performance on the training and dev sets
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = utils.get_time_dif(start_time)
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2%}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, Time:{5} {6}'
                print(
                    msg.format(total_batch, loss.item(), train_acc, dev_loss,
                               dev_acc, time_dif, improve))
                model.train()
            total_batch = total_batch + 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not fallen for more than require_improvement batches; stop training
                print('No improvement on the dev set for a long time, auto-stopping training')
                flag = True
                break

        if flag:
            break
    test(config, model, test_iter)
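
Most of these examples call a get_time_dif utility that is only referenced through utils. A plausible minimal version, shown here as an assumption for completeness rather than the original helper:

import time
from datetime import timedelta

def get_time_dif(start_time):
    # elapsed wall-clock time since start_time, rounded to whole seconds
    end_time = time.time()
    return timedelta(seconds=int(round(end_time - start_time)))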
Example 5
def train(config, model, train_iter, dev_iter, test_iter):
    """
    Model training routine.
    :param config:
    :param model:
    :param train_iter:
    :param dev_iter:
    :param test_iter:
    :return:
    """
    start_time = time.time()
    # switch to training mode (enables BatchNormalization and dropout)
    model.train()
    # collect the model's named parameters
    param_optimizer = list(model.named_parameters())
    # parameters exempt from weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)

    # number of batches processed so far
    total_batch = 0
    # best loss seen on the dev set
    dev_best_loss = float('inf')
    # batch index of the last improvement in dev loss
    last_improve = 0
    # set when there has been no improvement for too long
    flag = False
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            # zero the gradients
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            # periodically report performance on the training and dev sets
            if total_batch % 100 == 0:
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predict)
                # evaluate on the dev set
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    # save the model checkpoint
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2%}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, Time:{5} {6}'
                print(
                    msg.format(total_batch, loss.item(), train_acc, dev_loss,
                               dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            # dev loss has not fallen for more than require_improvement batches; stop training
            if total_batch - last_improve > config.require_improvement:
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    print("开始测试模型")
    test(config, model, test_iter)
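
Every example calls evaluate(config, model, dev_iter) and expects an (accuracy, mean loss) pair back. A minimal sketch consistent with that usage, assuming the data iterator yields (inputs, labels) batches and defines __len__; this is not the original helper:

import numpy as np
import torch
import torch.nn.functional as F
from sklearn import metrics

def evaluate(config, model, data_iter):
    model.eval()
    loss_total = 0.0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss_total += F.cross_entropy(outputs, labels).item()
            predictions = torch.max(outputs, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels.cpu().numpy())
            predict_all = np.append(predict_all, predictions)
    acc = metrics.accuracy_score(labels_all, predict_all)
    return acc, loss_total / len(data_iter)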
Example 6
 def train_(self):
     batch_size = self.arg_parser.batch_size
     epochs = self.arg_parser.num_epochs
     self.model.train()
     iter_counter = 0
     # Adadelta, Adagrad, Adam, AdamW, SGD, SparseAdam, Adamax, ASGD, RMSprop, LBFGS, Rprop
     best_dev_loss = float("inf")
     best_dev_accuracy = 0
     train_loss_info = {}  # collect loss data to draw the loss curve
     train_loss_info["num_epochs"] = epochs
     train_loss_info["batch_size"] = batch_size
     if self.arg_parser.bert:
         param_optimizer = list(self.model.named_parameters())
         no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
         optimizer_grouped_parameters = [{
             'params': [
                 p for n, p in param_optimizer
                 if not any(nd in n for nd in no_decay)
             ],
             'weight_decay':
             0.01
         }, {
             'params': [
                 p for n, p in param_optimizer
                 if any(nd in n for nd in no_decay)
             ],
             'weight_decay':
             0.0
         }]
         optimizer = BertAdam(optimizer_grouped_parameters,
                              lr=self.lr,
                              warmup=0.05,
                              t_total=epochs * len(self.train_batch))
     else:
         optimizer = getattr(torch.optim, self.arg_parser.optimizer)(
             self.model.parameters(), lr=self.lr)
     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
         optimizer,
         mode="min",
         factor=self.lr_decay,
         patience=self.lr_patience,
         verbose=True)
     for epoch in range(epochs):
         logger.info(f"-" * 35 + ">" + f"Training {epoch}th epoch" + "<" +
                     "-" * 35)
         #optimizer = getattr(torch.optim, self.arg_parser.optimizer)(self.model.parameters(), lr = self.lr * (self.lr_decay ** epoch))
         last_epoch_dev_loss = float(
             "inf") if epoch == 0 else epoch_dev_loss
         last_epoch_dev_accuracy = 0 if epoch == 0 else epoch_dev_accuracy
         epoch_train_loss = 0
         epoch_train_accuracy = 0
         epoch_dev_loss = 0
         epoch_dev_accuracy = 0
         print_counter = 0
         for minibatch in self.train_batch:
             if self.arg_parser.bert:
                 input_x = minibatch[0]
                 masks = minibatch[2]
                 input_ = (input_x, masks)
             else:
                 input_ = minibatch[0]
             label_ = minibatch[1]
             output_ = self.model(input_)
             self.model.zero_grad()
             loss = F.cross_entropy(output_, label_)
             loss.backward()
             optimizer.step()
             iter_counter += 1
             if iter_counter % 100 == 0:
                 predict = output_.max(1)[1].cpu()
                 label_cpu = label_.cpu()
                 train_loss = loss.cpu().item()
                 train_loss = round(train_loss, 5)
                 #train_accuracy = round(accuracy_score(label_cpu, predict), 4)
                 train_accuracy = accuracy_score(label_cpu, predict)
                 dev_loss, dev_accuracy, dev_f1_macro, dev_f1_micro, dev_weighted = self.evaluation(
                     self.model, self.dev_batch)
                 epoch_train_loss += train_loss
                 epoch_train_accuracy += train_accuracy
                 epoch_dev_loss += dev_loss
                 epoch_dev_accuracy += dev_accuracy
                 logger.info(
                     f"Iter: {iter_counter}, train loss: {train_loss}, train accuracy: {train_accuracy}, val loss: {dev_loss}, val accuracy: {dev_accuracy}"
                 )
                 if self.unbalanced:
                     logger.info(
                         f"val F1 macro: {dev_f1_macro}, val F1 micro: {dev_f1_micro}, val F1 weighted: {dev_weighted}"
                     )
                 self.model.train()
                 if dev_loss < best_dev_loss and dev_accuracy > best_dev_accuracy:
                     best_dev_loss = dev_loss
                     best_dev_accuracy = dev_accuracy
                     #logger.info(f"Best validation loss updated: {best_dev_loss}")
                     torch.save(self.model, self.model_save_path)
                 else:
                     self.iter_patience += 1
                 print_counter += 1
         print_counter = max(print_counter, 1)  # guard against epochs with fewer than 100 iterations
         epoch_train_loss = round(epoch_train_loss / print_counter, 5)
         epoch_train_accuracy = round(epoch_train_accuracy / print_counter, 5)
         epoch_dev_loss = round(epoch_dev_loss / print_counter, 5)
         epoch_dev_accuracy = round(epoch_dev_accuracy / print_counter, 5)
         scheduler.step(epoch_dev_loss)
         logger.info(
             f"{epoch}th epoch finished, val epoch loss:{epoch_dev_loss}, val epoch accuracy:{epoch_dev_accuracy}"
         )
         self.EarlyStopping(epoch_dev_loss, epoch_dev_accuracy,
                            last_epoch_dev_loss, last_epoch_dev_accuracy)
         if self.EarlyStopping_triggered:
             logger.info("=" * 70)
             logger.info(
                 f"Early stopping triggered after {epoch + 1} epochs, calculating test accuracy..."
             )
             break
     if not self.EarlyStopping_triggered:
         logger.info(
             "Training finished after all epochs, calculating test accuracy...")
     self.evaluate_test()
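
Example 6 delegates the stop decision to a self.EarlyStopping method and a self.EarlyStopping_triggered flag that are not shown. One way such a method could look, purely as an illustrative assumption (the patience and epochs_without_improvement attributes are hypothetical):

def EarlyStopping(self, dev_loss, dev_accuracy, last_dev_loss, last_dev_accuracy):
    # count epochs in which neither the dev loss nor the dev accuracy improved
    if dev_loss >= last_dev_loss and dev_accuracy <= last_dev_accuracy:
        self.epochs_without_improvement = getattr(self, 'epochs_without_improvement', 0) + 1
    else:
        self.epochs_without_improvement = 0
    # trip the flag once the streak reaches the patience threshold
    if self.epochs_without_improvement >= getattr(self, 'patience', 3):
        self.EarlyStopping_triggered = True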
Example 7
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()

    ema = EMA(model, 0.999)
    ema.register()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias',
                'LayerNorm.weight']  # the official BERT code exempts these three parameter types from weight decay
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')  # positive infinity
    last_improve = 0  # batch index of the last improvement in dev loss
    flag = False  # set when there has been no improvement for too long
    dev_f1_score = []
    model.train()

    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(
                train_iter):  # trains, labels ==>  (x, seq_len, mask), y
            outputs = model(trains)
            loss = F.cross_entropy(outputs, labels)
            loss = loss / config.acc_grad  # scale the loss for gradient accumulation
            loss.backward()
            if (i + 1) % config.acc_grad == 0:  # gradient accumulation: step only every acc_grad batches
                optimizer.step()
                ema.update()
                model.zero_grad()
            if total_batch % 100 == 0:
                # every 100 batches, report performance on the training and dev sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_f1 = metrics.f1_score(true, predic, average='macro')
                ema.apply_shadow()
                dev_f1, dev_loss = evaluate(config, model, dev_iter)
                dev_f1_score.append(dev_f1)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)  # single-GPU checkpoint
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                ema.restore()
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train F1: {2:>6.2%},  Val Loss: {3:>5.2},  Val F1: {4:>6.2%},  Time: {5} {6}'
                print(
                    msg.format(total_batch, loss.item(), train_f1, dev_loss,
                               dev_f1, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not fallen for more than require_improvement batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        # print(logits_res)
        print('Epoch {} Average F1-Score: {}'.format(epoch + 1,
                                                     np.mean(dev_f1_score)))
        if flag:
            break

    test(config, model, test_iter)
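
All of these loops split parameters into decayed and non-decayed groups with the same any(nd in n for nd in no_decay) name filter. A tiny self-contained demo of what that filter matches; the two-layer module here is made up purely for illustration:

import torch.nn as nn

layer = nn.Sequential()
layer.add_module('dense', nn.Linear(8, 8))
layer.add_module('LayerNorm', nn.LayerNorm(8))

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
for name, _ in layer.named_parameters():
    decayed = not any(nd in name for nd in no_decay)
    print(name, '-> weight_decay=0.01' if decayed else '-> weight_decay=0.0')

# dense.weight     -> weight_decay=0.01
# dense.bias       -> weight_decay=0.0   ('bias' matches)
# LayerNorm.weight -> weight_decay=0.0   ('LayerNorm.weight' matches)
# LayerNorm.bias   -> weight_decay=0.0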
Example 8
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last improvement in dev loss
    flag = False  # set when there has been no improvement for too long
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 5 == 0:
                # periodically report performance on the training and dev sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                train_f1 = metrics.f1_score(true, predic, average='macro')
                dev_acc, dev_loss, dev_f1 = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                print(
                    'Iter:{}, Train Acc:{:.2f}, Train Loss:{:.2f}, Train F1:{:.2f}'
                    .format(total_batch, train_acc, loss.item(), train_f1))
                print('Val Loss:{:.2f}, Val Acc:{:.2f}, Val F1:{:.2f}'.format(
                    dev_loss, dev_acc, dev_f1))
                print('Time used:', time_dif)

                print('----' * 10)
                print('\n')
                # msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},Train f1: {2:>6.2%}, Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Val f1: {4:>6.2%},Time: {5} {6}'
                # print(msg.format(total_batch, loss.item(), train_acc, train_f1, dev_loss, dev_acc, dev_f1, time_dif, improve))

                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not fallen for more than require_improvement batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
Example 9
def train(config, model, train_iter, dev_iter):
    """
    Model training routine.
    :param config:
    :param model:
    :param train_iter:
    :param dev_iter:
    :return:
    """
    start_time = time.time()
    # switch the model to training mode
    model.train()
    # collect all named parameters
    param_optimizer = list(model.named_parameters())
    # define the parameters exempt from weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.epochs)

    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')  # best loss seen on the dev set so far
    last_improve = 0  # batch index of the last improvement
    flag = False  # stop flag: set when there has been no improvement for too long

    for epoch in range(config.epochs):
        print('Epoch [{}/{}]'.format(epoch+1, config.epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()

            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            if total_batch % 50 == 0:  # every 50 batches, report performance on the training and dev sets
                torch.cuda.empty_cache()
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                score = metrics.accuracy_score(true, predict)

                dev_acc, dev_loss = evaluate(config, model, dev_iter)

                if dev_best_loss > dev_loss:
                    dev_best_loss = dev_loss

                    torch.save(model.state_dict(), config.model_save)
                    improve = '+'
                    last_improve = total_batch
                else:
                    improve = ''
                time_idf = utils.get_time_idf(start_time)
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train ACC:{2:>6.2%}, ' \
                      'Val Loss:{3:>5.2}, Val ACC:{4:>6.2%}, Time:{5}  {6}'
                print(msg.format(total_batch, loss.item(), score, dev_loss, dev_acc, time_idf, improve))
                model.train()
            total_batch = total_batch + 1

            if total_batch - last_improve > config.require_improvement:
                # dev loss has not fallen for more than require_improvement batches; stop training
                print('No improvement on the dev set for a long time, stopping training')
                flag = True
                break
        if flag:
            break
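
All of these loops save the best checkpoint with torch.save(model.state_dict(), config.save_path), and the final test step is expected to load it back before scoring. A minimal sketch of such a test helper, assuming an evaluate function like the one sketched after Example 5; this is not the original test implementation:

import torch

def test(config, model, test_iter):
    # reload the best checkpoint saved during training, then score the test set
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    test_acc, test_loss = evaluate(config, model, test_iter)
    print('Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'.format(test_loss, test_acc))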
Example 10
def train(config, model, train_iter, dev_iter, test_iter):
    """
    Train model
    """

    # get the model's parameters
    param_optimizer = list(model.named_parameters())
    # parameters that should not be decayed
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n,p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay':config.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
    ]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                        schedule = None,
                        lr = config.learning_rate,
                        warmup=config.warmup,
                        t_total=len(train_iter) * config.num_epochs)


    config.hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    config.hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    optimizer = config.hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
    start_time = time.time()
    # activate BatchNormalization & dropout
    model.train()

    # stop flag: set when there has been no improvement for too long
    flag = False
    # number of batches processed so far
    total_batch = 0
    # best loss seen on the dev set
    dev_best_loss = float('inf')
    # batch index of the last time the dev loss decreased
    last_improve = 0

    for epoch in range(config.num_epochs):
        if config.hvd.rank() == 0:
            print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (token_ids, label, seq_len, mask) in enumerate(train_iter):
            
            token_ids = token_ids.to(config.device)
            label_gpu = label.to(config.device)
            seq_len = seq_len.to(config.device)
            mask = mask.to(config.device)

            outputs = model(token_ids, seq_len, mask)
            model.zero_grad()
            loss = F.cross_entropy(outputs, label_gpu)
            loss.backward()
            optimizer.step()
            # every n batches, evaluate on the dev set
            if total_batch % 100 == 0:
                # take the class with the highest score
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(label, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    if config.hvd.rank() == 0:
                        torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = utils.get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                if config.hvd.rank() == 0:
                    print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch = total_batch + 1
            if total_batch - last_improve > config.require_improvement:
                # No improvement for too long (longer than config.require_improvement)
                print('No improvement for too long. Stop training automatically.')
                flag = True
                break

        if flag:
            break
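
Example 10 assumes a Horovod handle on config.hvd and that distributed initialization happened elsewhere in the script. The usual horovod.torch setup looks roughly like this; the config attributes shown are assumptions about that surrounding code:

import torch
import horovod.torch as hvd

hvd.init()                                  # start Horovod; one process per GPU
torch.cuda.set_device(hvd.local_rank())     # bind this process to its local GPU
config.hvd = hvd
config.device = torch.device('cuda', hvd.local_rank())
# the learning rate is often scaled by the number of workers:
# config.learning_rate = base_learning_rate * hvd.size()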
Example 11
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    TIMESTAMP = "{}_bs{}_lr{}_ps{}_{:%Y_%m_%d_%H_%M_%S/}".format(
        config.model_name, config.batch_size, config.learning_rate,
        config.pad_size, datetime.now())
    writer = SummaryWriter('/data-output/{}'.format(TIMESTAMP))
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    # optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=0.05)

    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # dev_best_loss = float('inf')
    dev_best_acc = 0
    last_improve = 0  # batch index of the last improvement in dev loss
    flag = False  # set when there has been no improvement for too long
    model.train()
    sum_batch = 0
    for epoch in range(config.num_epochs):
        total_batch = 0
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        pbar = tqdm(total=len(train_iter))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            # loss = FocalLoss(gamma=config.gamma, num_class=config.num_classes)(outputs, labels)
            # loss.backward()
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            # if i % 2 == 1:
            #     optimizer.step()
            #     model.zero_grad()

            optimizer.step()
            # model.zero_grad()

            if total_batch % 50 == 0 and total_batch != 0:

                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_acc > dev_best_acc:
                    dev_best_acc = dev_acc
                    # torch.save(model.state_dict(), config.save_path)
                    # torch.save(model.state_dict(), 'out/model/epoch{}_{}_pytorch_model.bin'.format(epoch, config.model_name))
                    torch.save(
                        model.state_dict(),
                        'out/model/best_{}_pytorch_model.bin'.format(
                            config.model_name))
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                # time_dif = get_time_dif(start_time)
                print(
                    '\n Iter: {},  Train Loss: {:.3f}  Train Acc: {:.3f}%  Val Loss: {:.3f}  Val Acc: {:.3f}% {} '
                    .format(total_batch, loss.item(), train_acc * 100,
                            dev_loss, dev_acc * 100, improve))

                writer.add_scalar('Loss/train', loss.item(), sum_batch)
                writer.add_scalar('Loss/dev', dev_loss, sum_batch)
                writer.add_scalar('Acc/train', train_acc, sum_batch)
                writer.add_scalar('Acc/dev', dev_acc, sum_batch)
                writer.flush()

                model.train()

            pbar.update(1)
            sum_batch += 1
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        pbar.close()

        true = labels.data.cpu()
        predic = torch.max(outputs.data, 1)[1].cpu()
        train_acc = metrics.accuracy_score(true, predic)
        dev_acc, dev_loss = evaluate(config, model, dev_iter)

        writer.add_scalar('Loss/train', loss.item(), sum_batch)
        writer.add_scalar('Loss/dev', dev_loss, sum_batch)
        writer.add_scalar('Acc/train', train_acc, sum_batch)
        writer.add_scalar('Acc/dev', dev_acc, sum_batch)
        writer.flush()

        print(
            '\n Epoch{},  Train Loss: {:.3f} Train Acc: {:.3f} Val Loss: {:.3f}  Val Acc: {:.3f}  '
            .format(epoch + 1, loss.item(), train_acc, dev_loss, dev_acc))
        torch.save(model.state_dict(),
                   'out/model/{}_pytorch_model.bin'.format(config.model_name))

        if flag:
            break

    # torch.save(model.state_dict(), 'out/model/{}_pytorch_model.bin'.format(config.model_name))
    predict(config, model, test_iter, dev_acc)

    writer.close()
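
Example 11 uses NVIDIA Apex (amp.initialize / amp.scale_loss with opt_level O1). With PyTorch's built-in torch.cuda.amp, the equivalent mixed-precision step can be sketched as follows, reusing model, optimizer, and train_iter from above; this is an alternative, not the original code:

import torch
import torch.nn.functional as F

scaler = torch.cuda.amp.GradScaler()
for trains, labels in train_iter:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        outputs = model(trains)
        loss = F.cross_entropy(outputs, labels)
    scaler.scale(loss).backward()   # backward on the scaled loss
    scaler.step(optimizer)          # unscales gradients, then calls optimizer.step()
    scaler.update()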
Example 12
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last improvement in dev loss
    flag = False  # set when there has been no improvement for too long
    model.train()

    # initialize PGD adversarial training
    pgd = PGD(model)
    K = 3  # number of attack steps per batch

    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            # model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            pgd.backup_grad()
            # adversarial training
            for t in range(K):
                pgd.attack(is_first_attack=(t == 0))  # add an adversarial perturbation to the embeddings; back up param.data on the first attack
                if t != K - 1:
                    model.zero_grad()
                else:
                    pgd.restore_grad()
                outputs_adv = model(trains)
                loss_adv = F.cross_entropy(outputs_adv, labels)
                loss_adv.backward()  # backpropagate, accumulating the adversarial gradient on top of the normal gradient
            pgd.restore()            # restore the embedding parameters

            optimizer.step()
            model.zero_grad()
            if total_batch % 100 == 0:
                # periodically report performance on the training and dev sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not fallen for more than require_improvement batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
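
The PGD helper used in this last example (attack / restore / backup_grad / restore_grad) is not shown. A minimal sketch of the widely used embedding-space PGD adversarial-training helper this pattern follows; the emb_name filter and the epsilon / alpha hyperparameters are assumptions:

import torch

class PGD:
    def __init__(self, model, emb_name='embedding', epsilon=1.0, alpha=0.3):
        self.model = model
        self.emb_name = emb_name    # substring identifying the embedding parameters
        self.epsilon = epsilon      # radius of the perturbation ball
        self.alpha = alpha          # step size of each attack
        self.emb_backup = {}
        self.grad_backup = {}

    def attack(self, is_first_attack=False):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                if is_first_attack:
                    self.emb_backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = self.alpha * param.grad / norm
                    param.data.add_(r_at)
                    param.data = self.project(name, param.data)

    def restore(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                param.data = self.emb_backup[name]
        self.emb_backup = {}

    def project(self, name, data):
        # keep the perturbed embedding inside an epsilon-ball around the backup
        r = data - self.emb_backup[name]
        if torch.norm(r) > self.epsilon:
            r = self.epsilon * r / torch.norm(r)
        return self.emb_backup[name] + r

    def backup_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                self.grad_backup[name] = param.grad.clone()

    def restore_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                param.grad = self.grad_backup[name]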