def test_model(test_dataset):
    num_test = len(test_dataset)
    test_useful_end_idx = get_useful_end_idx(sequence_length, num_test)
    test_idx = []
    for i in test_useful_end_idx:
        for j in range(sequence_length):
            test_idx.append(i - j * srate)
    test_idx.reverse()
    test_loader = DataLoader(
        test_dataset,
        batch_size=test_batch_size,
        sampler=SeqSampler(test_dataset, test_idx),
        # sampler=test_idx,
        num_workers=0,
        pin_memory=False)
    model = res34_tcn()
    model = DataParallel(model)
    model.load_state_dict(torch.load(model_name))
    # model = model.module
    # model = DataParallel(model)

    if use_gpu:
        model = model.cuda()
    # model = DataParallel(model)
    # model = model.module

    model.eval()

    all_preds_s = []

    num = 0
    with torch.no_grad():
        for data in test_loader:
            num = num + 1
            inputs, _, kdatas = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                kdatas = Variable(kdatas.cuda())
            else:
                inputs = Variable(inputs)
                kdatas = Variable(kdatas)

            outputs_s = model.forward(inputs, kdatas)

            #outputs_s = outputs_s[-1, (sequence_length - 1):: sequence_length]
            outputs_s = outputs_s[-1]
            outputs_s = F.softmax(outputs_s, dim=-1)

            _, preds_s = torch.max(outputs_s.data, -1)

            for j in range(preds_s.shape[0]):
                all_preds_s.append(preds_s[j].data.item())

    return all_preds_s
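SeqSampler is not shown in these examples; it only has to hand the DataLoader the precomputed frame indices in order. A minimal sketch of such a sampler (an assumption, not the original implementation):

from torch.utils.data import Sampler

class SeqSampler(Sampler):
    """Yield a fixed, precomputed list of dataset indices in order."""

    def __init__(self, data_source, idx):
        super().__init__(data_source)
        self.idx = idx

    def __iter__(self):
        return iter(self.idx)

    def __len__(self):
        return len(self.idx)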
Example #2
def test_model(test_dataset, test_num_each):
    num_test = len(test_dataset)
    test_useful_start_idx = get_useful_start_idx(sequence_length,
                                                 test_num_each)

    num_test_we_use = len(test_useful_start_idx)
    # num_test_we_use = 804
    # num_test_we_use = len(test_useful_start_idx) // (test_batch_size // sequence_length) * (
    #     test_batch_size // sequence_length)

    test_we_use_start_idx = test_useful_start_idx[0:num_test_we_use]

    test_idx = []
    for i in range(num_test_we_use):
        for j in range(sequence_length):
            test_idx.append(test_we_use_start_idx[i] + j)

    num_test_all = len(test_idx)

    print('num test start idx : {:6d}'.format(len(test_useful_start_idx)))
    print('last idx test start: {:6d}'.format(test_useful_start_idx[-1]))
    print('num of test dataset: {:6d}'.format(num_test))
    print('num of test we use : {:6d}'.format(num_test_we_use))
    print('num of all test use: {:6d}'.format(num_test_all))

    test_loader = DataLoader(test_dataset,
                             batch_size=test_batch_size,
                             sampler=test_idx,
                             num_workers=workers,
                             pin_memory=False)
    model = resnet_lstm()
    model = DataParallel(model)
    model.load_state_dict(torch.load(model_name))

    if use_gpu:
        model = model.cuda()
    # multi-GPU inference should work directly on the DataParallel-wrapped model
    # model = model.module            # needs testing
    criterion = nn.CrossEntropyLoss(size_average=False)

    model.eval()
    test_loss = 0.0
    test_corrects = 0
    test_start_time = time.time()

    all_preds = []

    for data in test_loader:
        inputs, labels_1, labels_2 = data
        labels_2 = labels_2[(sequence_length - 1)::sequence_length]

        if use_gpu:
            inputs = Variable(inputs.cuda(), volatile=True)
            labels = Variable(labels_2.cuda(), volatile=True)
        else:
            inputs = Variable(inputs, volatile=True)
            labels = Variable(labels_2, volatile=True)

        if crop_type == 0 or crop_type == 1:
            outputs = model.forward(inputs)
        elif crop_type == 5:
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs = model.forward(inputs)
            outputs = outputs.view(5, -1, 7)
            outputs = torch.mean(outputs, 0)
        elif crop_type == 10:
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs = model.forward(inputs)
            outputs = outputs.view(10, -1, 7)
            outputs = torch.mean(outputs, 0)

        outputs = outputs[sequence_length - 1::sequence_length]
        _, preds = torch.max(outputs.data, 1)
        for i in range(len(preds)):
            all_preds.append(preds[i])
        print(len(all_preds))
        loss = criterion(outputs, labels)
        test_loss += loss.data[0]
        test_corrects += torch.sum(preds == labels.data)

    test_elapsed_time = time.time() - test_start_time
    test_accuracy = test_corrects / num_test_we_use
    test_average_loss = test_loss / num_test_we_use

    print('type of all_preds:', type(all_preds))
    print('leng of all preds:', len(all_preds))
    save_test = int("{:4.0f}".format(test_accuracy * 10000))
    pred_name = model_pure_name + '_test_' + str(save_test) + '_crop_' + str(
        crop_type) + '.pkl'

    with open(pred_name, 'wb') as f:
        pickle.dump(all_preds, f)
    print('test elapsed: {:2.0f}m{:2.0f}s'
          ' test loss: {:4.4f}'
          ' test accu: {:.4f}'.format(test_elapsed_time // 60,
                                      test_elapsed_time % 60,
                                      test_average_loss, test_accuracy))
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each)
    #print('train_useful_start_idx ',train_useful_start_idx )
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)
    #print('test_useful_start_idx ', val_useful_start_idx)

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    # print('num_train_we_use',num_train_we_use) #92166
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # print('num_val_we_use', num_val_we_use)
    # num_train_we_use = 8000
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]  # start positions of the training sequences
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    np.random.seed(0)
    np.random.shuffle(train_we_use_start_idx)  # shuffle the sequence start positions in place
    train_idx = []
    for i in range(num_train_we_use):  # one entry per training sequence
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j * srate)  # frame indices of the training data; each frame is one sample
    # print('train_idx',train_idx)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j * srate)
    # print('val_idx',val_idx)

    num_train_all = float(len(train_idx))
    num_val_all = float(len(val_idx))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(int(num_train_all)))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(int(num_val_all)))

    val_loader = DataLoader(
        val_dataset,
        batch_size=val_batch_size,
        # sampler=val_idx,
        sampler=SeqSampler(val_dataset, val_idx),
        num_workers=workers,
        pin_memory=False
    )
    model = res34_tcn()
    if use_gpu:
        model = model.cuda()

    model = DataParallel(model)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # model.parameters() and model.state_dict() both expose the network parameters: the former is usually passed to the optimizer, the latter is used when saving the model
    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy = 0.0
    correspond_train_acc = 0.0

    record_np = np.zeros([epochs, 4])

    for epoch in range(epochs):
        np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)  # reshuffle the sequence start positions each epoch
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j * srate)

        train_loader = DataLoader(
            train_dataset,
            batch_size=train_batch_size,
            sampler=SeqSampler(train_dataset, train_idx),
            num_workers=workers,
            pin_memory=False
        )

        model.train()
        train_loss = 0.0
        train_corrects = 0
        train_start_time = time.time()
        num = 0
        train_num = 0
        for data in train_loader:
            num = num + 1
            # inputs, labels_phase = data
            inputs, labels_phase, kdata = data
            if use_gpu:
                inputs = Variable(inputs.cuda())  # Variable wraps a tensor whose values keep changing during training (legacy autograd API)
                labels = Variable(labels_phase.cuda())
                kdatas = Variable(kdata.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_phase)
                kdatas = Variable(kdata)
            optimizer.zero_grad()  # reset gradients to zero, i.e. clear d(loss)/d(weight) from the previous step
            # outputs = model.forward(inputs)  # forward pass
            outputs = model.forward(inputs, kdatas)
            #outputs = F.softmax(outputs, dim=-1)
            _, preds = torch.max(outputs.data, -1)  # .data returns the tensor inside a Variable; torch.max(x, -1) gives the max values along the last dim and their indices
            #_, yp = torch.max(y.data, 1)
            #print(yp)
            # print(yp.shape)
            print(num)
            print(preds)
            print(labels)


            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.data
            train_corrects += torch.sum(preds == labels.data)
            train_num += labels.shape[0]
            print(train_corrects.cpu().numpy() / train_num)
            if train_corrects.cpu().numpy() / train_num > 0.75:
                torch.save(copy.deepcopy(model.state_dict()), 'test.pth')  # state_dict() saves only the network parameters (fast and memory-light)

        train_elapsed_time = time.time() - train_start_time

        #train_accuracy1 = train_corrects1.cpu().numpy() / train_num
        train_accuracy = train_corrects.cpu().numpy() / train_num
        train_average_loss = train_loss / train_num

        # begin eval
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_num = 0
        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_phase, kdata = data
            #inputs, labels_phase = data
            #labels_phase = labels_phase[(sequence_length - 1)::sequence_length]
            #kdata = kdata[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_phase.cuda())
                kdatas = Variable(kdata.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_phase)
                kdatas = Variable(kdata)

            if crop_type == 0 or crop_type == 1:
                #outputs = model.forward(inputs)
                outputs = model.forward(inputs, kdatas)
            elif crop_type == 5:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs = model.forward(inputs, kdatas)
                # outputs = model.forward(inputs)
                outputs = outputs.view(5, -1, 3)
                outputs = torch.mean(outputs, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs = model.forward(inputs, kdatas)
                #outputs = model.forward(inputs)
                outputs = outputs.view(10, -1, 3)
                outputs = torch.mean(outputs, 0)

            #outputs = outputs[sequence_length - 1::sequence_length]

            _, preds = torch.max(outputs.data, -1)
            #_, yp = torch.max(y.data, 1)
            print(num)
            print(preds)
            print(labels)


            loss = criterion(outputs, labels)
            #loss = 0.05 * loss1 + 0.15 * loss2 + 0.3 * loss3 + 0.5 * loss4
            #loss = 0.05 * loss1 + 0.1 * loss2 + 0.25 * loss3 + 0.6 * loss4
            val_loss += loss.data
            val_corrects += torch.sum(preds == labels.data)
            val_num += labels.shape[0]
        val_elapsed_time = time.time() - val_start_time
        val_accuracy = val_corrects.cpu().numpy() / val_num
        val_average_loss = val_loss / val_num
        print('epoch: {:4d}'
              ' train in: {:2.0f}m{:2.0f}s'
              ' train loss: {:4.4f}'
              ' train accu: {:.4f}'
              ' valid in: {:2.0f}m{:2.0f}s'
              ' valid loss: {:4.4f}'
              ' valid accu: {:.4f}'
              .format(epoch,
                      train_elapsed_time // 60,
                      train_elapsed_time % 60,
                      train_average_loss,
                      train_accuracy,
                      val_elapsed_time // 60,
                      val_elapsed_time % 60,
                      val_average_loss,
                      val_accuracy))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss)

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            correspond_train_acc = train_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        if val_accuracy == best_val_accuracy:
            if train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())


        record_np[epoch, 0] = train_accuracy
        record_np[epoch, 1] = train_average_loss
        record_np[epoch, 2] = val_accuracy
        record_np[epoch, 3] = val_average_loss
        np.save(str(epoch) + '.npy', record_np)

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(best_val_accuracy, correspond_train_acc))

    save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
    save_train = int("{:4.0f}".format(correspond_train_acc * 10000))
    model_name = "tcn" \
                 + "_epoch_" + str(epochs) \
                 + "_length_" + str(sequence_length) \
                 + "_opt_" + str(optimizer_choice) \
                 + "_mulopt_" + str(multi_optim) \
                 + "_flip_" + str(use_flip) \
                 + "_crop_" + str(crop_type) \
                 + "_batch_" + str(train_batch_size) \
                 + "_train_" + str(save_train) \
                 + "_val_" + str(save_val) \
                 + ".pth"

    torch.save(best_model_wts, model_name)

    record_name = "tcn" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train_" + str(save_train) \
                  + "_val_" + str(save_val) \
                  + ".npy"
    np.save(record_name, record_np)
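The flattened train_idx drives the sampler: every sequence_length consecutive entries belong to one clip, which is why the batch size is normally chosen as a multiple of sequence_length. A small worked illustration of the index expansion (values made up):

sequence_length = 4
srate = 1
train_we_use_start_idx = [0, 7, 23]           # one start position per sampled clip
train_idx = [s + j * srate
             for s in train_we_use_start_idx
             for j in range(sequence_length)]
print(train_idx)                              # [0, 1, 2, 3, 7, 8, 9, 10, 23, 24, 25, 26]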
Example #4
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length,
                                                  train_num_each)

    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 4
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)

    num_train_all = len(train_idx)
    num_val_all = len(val_idx)

    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))

    train_loader = DataLoader(train_dataset,
                              batch_size=train_batch_size,
                              sampler=train_idx,
                              num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=val_idx,
                            num_workers=workers,
                            pin_memory=False)
    model = multi_lstm_4loss()
    sig_f = nn.Sigmoid()

    if use_gpu:
        model = model.cuda()
        sig_f = sig_f.cuda()
    model = DataParallel(model)
    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)
    criterion_2 = nn.CrossEntropyLoss(size_average=False)

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(),
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_adjust_lr,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_tool.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_tool1.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_tool2.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_phase1.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_phase2.parameters(),
                    'lr': learning_rate
                },
            ],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_adjust_lr,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_tool.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_tool1.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_tool2.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_phase1.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc_phase2.parameters(),
                    'lr': learning_rate
                },
            ],
                                   lr=learning_rate / 10)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy_1 = 0.0
    best_val_accuracy_2 = 0.0  # judge by accu2
    correspond_train_acc_1 = 0.0
    correspond_train_acc_2 = 0.0

    record_np = np.zeros([epochs, 8])

    for epoch in range(epochs):
        # np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)

        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=train_idx,
                                  num_workers=workers,
                                  pin_memory=False)

        model.train()
        train_loss_11 = 0.0
        train_loss_12 = 0.0
        train_loss_21 = 0.0
        train_loss_22 = 0.0
        train_corrects_11 = 0
        train_corrects_12 = 0
        train_corrects_21 = 0
        train_corrects_22 = 0

        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels_1 = Variable(labels_1.cuda())
                labels_2 = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels_1 = Variable(labels_1)
                labels_2 = Variable(labels_2)

            optimizer.zero_grad()

            outputs_11, outputs_12, outputs_21, outputs_22 = model.forward(
                inputs)

            _, preds_12 = torch.max(outputs_12.data, 1)
            _, preds_22 = torch.max(outputs_22.data, 1)

            sig_out_11 = sig_f(outputs_11.data)
            sig_out_21 = sig_f(outputs_21.data)

            preds_11 = torch.ByteTensor(sig_out_11.cpu() > 0.5)
            preds_11 = preds_11.long()
            train_corrects_11 += torch.sum(preds_11 == labels_1.data.cpu())
            preds_21 = torch.ByteTensor(sig_out_21.cpu() > 0.5)
            preds_21 = preds_21.long()
            train_corrects_21 += torch.sum(preds_21 == labels_1.data.cpu())

            labels_1 = Variable(labels_1.data.float())
            loss_11 = criterion_1(outputs_11, labels_1)
            loss_21 = criterion_1(outputs_21, labels_1)

            loss_12 = criterion_2(outputs_12, labels_2)
            loss_22 = criterion_2(outputs_22, labels_2)
            loss = loss_11 + loss_12 + loss_21 + loss_22
            loss.backward()
            optimizer.step()

            train_loss_11 += loss_11.data[0]
            train_loss_12 += loss_12.data[0]
            train_loss_21 += loss_21.data[0]
            train_loss_22 += loss_22.data[0]
            train_corrects_12 += torch.sum(preds_12 == labels_2.data)
            train_corrects_22 += torch.sum(preds_22 == labels_2.data)

        train_elapsed_time = time.time() - train_start_time
        train_accuracy_11 = train_corrects_11 / num_train_all / 7
        train_accuracy_21 = train_corrects_21 / num_train_all / 7
        train_accuracy_12 = train_corrects_12 / num_train_all
        train_accuracy_22 = train_corrects_22 / num_train_all
        train_average_loss_11 = train_loss_11 / num_train_all / 7
        train_average_loss_21 = train_loss_21 / num_train_all / 7
        train_average_loss_12 = train_loss_12 / num_train_all
        train_average_loss_22 = train_loss_22 / num_train_all

        # begin eval

        model.eval()
        val_loss_11 = 0.0
        val_loss_12 = 0.0
        val_loss_21 = 0.0
        val_loss_22 = 0.0
        val_corrects_11 = 0
        val_corrects_12 = 0
        val_corrects_21 = 0
        val_corrects_22 = 0

        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
            labels_2 = labels_2[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda(), volatile=True)
                labels_1 = Variable(labels_1.cuda(), volatile=True)
                labels_2 = Variable(labels_2.cuda(), volatile=True)
            else:
                inputs = Variable(inputs, volatile=True)
                labels_1 = Variable(labels_1, volatile=True)
                labels_2 = Variable(labels_2, volatile=True)

            # if crop_type == 0 or crop_type == 1:
            #     outputs_1, outputs_2 = model.forward(inputs)
            # elif crop_type == 5:
            #     inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            #     inputs = inputs.view(-1, 3, 224, 224)
            #     outputs_1, outputs_2 = model.forward(inputs)
            #     outputs_1 = outputs_1.view(5, -1, 7)
            #     outputs_1 = torch.mean(outputs_1, 0)
            #     outputs_2 = outputs_2.view(5, -1, 7)
            #     outputs_2 = torch.mean(outputs_2, 0)
            # elif crop_type == 10:
            #     inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            #     inputs = inputs.view(-1, 3, 224, 224)
            #     outputs_1, outputs_2 = model.forward(inputs)
            #     outputs_1 = outputs_1.view(10, -1, 7)
            #     outputs_1 = torch.mean(outputs_1, 0)
            #     outputs_2 = outputs_2.view(10, -1, 7)
            #     outputs_2 = torch.mean(outputs_2, 0)
            outputs_11, outputs_12, outputs_21, outputs_22 = model.forward(
                inputs)
            outputs_12 = outputs_12[sequence_length - 1::sequence_length]
            outputs_22 = outputs_22[sequence_length - 1::sequence_length]

            _, preds_12 = torch.max(outputs_12.data, 1)
            _, preds_22 = torch.max(outputs_22.data, 1)

            sig_out_11 = sig_f(outputs_11.data)
            sig_out_21 = sig_f(outputs_21.data)

            preds_11 = torch.ByteTensor(sig_out_11.cpu() > 0.5)
            preds_11 = preds_11.long()
            val_corrects_11 += torch.sum(preds_11 == labels_1.data.cpu())
            preds_21 = torch.ByteTensor(sig_out_21.cpu() > 0.5)
            preds_21 = preds_21.long()
            val_corrects_21 += torch.sum(preds_21 == labels_1.data.cpu())

            labels_1 = Variable(labels_1.data.float())
            loss_11 = criterion_1(outputs_11, labels_1)
            loss_21 = criterion_1(outputs_21, labels_1)

            loss_12 = criterion_2(outputs_12, labels_2)
            loss_22 = criterion_2(outputs_22, labels_2)

            val_loss_11 += loss_11.data[0]
            val_loss_12 += loss_12.data[0]
            val_loss_21 += loss_21.data[0]
            val_loss_22 += loss_22.data[0]
            val_corrects_12 += torch.sum(preds_12 == labels_2.data)
            val_corrects_22 += torch.sum(preds_22 == labels_2.data)

        val_elapsed_time = time.time() - val_start_time
        val_accuracy_11 = val_corrects_11 / num_val_all / 7
        val_accuracy_21 = val_corrects_21 / num_val_all / 7
        val_accuracy_12 = val_corrects_12 / num_val_we_use
        val_accuracy_22 = val_corrects_22 / num_val_we_use
        val_average_loss_11 = val_loss_11 / num_val_all / 7
        val_average_loss_21 = val_loss_21 / num_val_all / 7
        val_average_loss_12 = val_loss_12 / num_val_we_use
        val_average_loss_22 = val_loss_22 / num_val_we_use

        print('epoch: {:4d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train accu_11: {:.4f}'
              ' train accu_21: {:.4f}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid accu_11: {:.4f}'
              ' valid accu_21: {:.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_accuracy_11, train_accuracy_21, val_elapsed_time // 60,
                  val_elapsed_time % 60, val_accuracy_11, val_accuracy_21))
        print('epoch: {:4d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train accu_12: {:.4f}'
              ' train accu_22: {:.4f}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid accu_12: {:.4f}'
              ' valid accu_22: {:.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_accuracy_12, train_accuracy_22, val_elapsed_time // 60,
                  val_elapsed_time % 60, val_accuracy_12, val_accuracy_22))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss_11 +
                                      val_average_loss_12 +
                                      val_average_loss_21 +
                                      val_average_loss_22)
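The tool-presence head in this example is scored as a multi-label problem: the logits are passed through a sigmoid, thresholded at 0.5, and compared element-wise with the binary labels, which is why the correct counts are later divided by the 7 tool classes. A minimal sketch with made-up values:

import torch

logits = torch.tensor([[2.0, -1.0, 0.3],
                       [-0.5, 1.5, -2.0]])
labels = torch.tensor([[1, 0, 0],
                       [0, 1, 0]])
preds = (torch.sigmoid(logits) > 0.5).long()   # per-class 0/1 predictions
correct = torch.sum(preds == labels).item()    # element-wise matches
accuracy = correct / labels.numel()            # i.e. correct / (num samples * num classes)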
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/config.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='vocab/vocab.txt',
                        type=str,
                        required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/train.txt',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='directory for the tokenized corpus')
    parser.add_argument('--epochs',
                        default=1,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warm-up steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every this many steps; set to a multiple of gradient_accumulation')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the sliding window over the training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length',
                        default=128,
                        type=int,
                        required=False,
                        help='minimum article length to include')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output directory')
    parser.add_argument('--pretrained_model',
                        default='model/',
                        type=str,
                        required=False,
                        help='pretrained model to start training from')
    parser.add_argument('--writer_dir',
                        default='tensorboard_summary/',
                        type=str,
                        required=False,
                        help='TensorBoard log directory')
    parser.add_argument('--segment', action='store_true', help='segment Chinese by word')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json',
                        default="tokenizations/encoder.json",
                        type=str,
                        help="encoder.json")
    parser.add_argument('--vocab_bpe',
                        default="tokenizations/vocab.bpe",
                        type=str,
                        help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass (keep gradients enabled so loss.backward() can update the model)
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss',
                                         loss.item() * gradient_accumulation,
                                         overall_step)
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute, step + 1, piece_num,
                            epoch + 1, running_loss * gradient_accumulation /
                            (log_step / gradient_accumulation)))
                    running_loss = 0
                overall_step += 1
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
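Each tokenized piece above is cut into overlapping windows of n_ctx tokens with step stride, plus one final window anchored at the end of the piece. A toy run of that slicing logic (made-up token ids):

tokens = list(range(10))                       # pretend these are 10 token ids
n_ctx, stride = 4, 3
samples, start_point = [], 0
while start_point < len(tokens) - n_ctx:
    samples.append(tokens[start_point:start_point + n_ctx])
    start_point += stride
if start_point < len(tokens):
    samples.append(tokens[len(tokens) - n_ctx:])
print(samples)                                 # [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]]

Example #6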
def test_model(test_dataset, test_num_each):
    num_test = len(test_dataset)
    test_count = 0
    for i in range(len(test_num_each)):
        test_count += test_num_each[i]

    test_useful_start_idx = get_useful_start_idx(sequence_length, test_num_each)

    num_test_we_use = len(test_useful_start_idx)
    # strictly this should be floored to a multiple of the GPU count, but every test sample is kept so nothing is skipped
    # num_test_we_use = 804

    test_we_use_start_idx = test_useful_start_idx[0:num_test_we_use]

    test_idx = []
    for i in range(num_test_we_use):
        for j in range(sequence_length):
            test_idx.append(test_we_use_start_idx[i] + j)

    num_test_all = len(test_idx)

    print('num test start idx : {:6d}'.format(len(test_useful_start_idx)))
    print('last idx test start: {:6d}'.format(test_useful_start_idx[-1]))
    print('num of test dataset: {:6d}'.format(num_test))
    print('num of test we use : {:6d}'.format(num_test_we_use))
    print('num of all test use: {:6d}'.format(num_test_all))

    test_loader = DataLoader(
        test_dataset,
        batch_size=test_batch_size,
        sampler=test_idx,
        num_workers=1,
        pin_memory=False
    )
    model = multi_lstm()
    model = DataParallel(model)
    model.load_state_dict(torch.load(model_name))
    # model = model.module
    # model = DataParallel(model)

    if use_gpu:
        model = model.cuda()
    # model = DataParallel(model)
    # model = model.module
    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)
    criterion_2 = nn.CrossEntropyLoss(size_average=False)
    sig_f = nn.Sigmoid()

    model.eval()
    test_loss_1 = 0.0
    test_loss_2 = 0.0
    test_corrects_2 = 0

    test_start_time = time.time()
    all_preds_1 = []
    all_labels_1 = []
    all_preds_2 = []

    for data in test_loader:
        inputs, labels_1, labels_2 = data

        # labels_1 = labels_1[(sequence_length - 1)::sequence_length]
        #labels_2 = labels_2[(sequence_length - 1)::sequence_length]
        if use_gpu:
            inputs = Variable(inputs.cuda(), volatile=True)
            labels_1 = Variable(labels_1.cuda(), volatile=True)
            #labels_2 = Variable(labels_2.cuda(), volatile=True)
        else:
            inputs = Variable(inputs, volatile=True)
            labels_1 = Variable(labels_1, volatile=True)
            #labels_2 = Variable(labels_2, volatile=True)

        if crop_type == 0 or crop_type == 1:
            outputs_1 = model.forward(inputs)
        elif crop_type == 5:
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs_1 = model.forward(inputs)
            outputs_1 = outputs_1.view(5, -1, 7)
            outputs_1 = torch.mean(outputs_1, 0)
            #outputs_2 = outputs_2.view(5, -1, 7)
            #outputs_2 = torch.mean(outputs_2, 0)
        elif crop_type == 10:
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs_1 = model.forward(inputs)
            outputs_1 = outputs_1.view(10, -1, 7)
            outputs_1 = torch.mean(outputs_1, 0)
            #outputs_2 = outputs_2.view(10, -1, 7)
            #outputs_2 = torch.mean(outputs_2, 0)

        # outputs_1 = outputs_1[sequence_length-1::sequence_length]
        #outputs_2 = outputs_2[sequence_length-1::sequence_length]

        #_, preds_2 = torch.max(outputs_2.data, 1)

        for i in range(len(outputs_1)):
            all_preds_1.append(outputs_1[i].data.cpu().numpy().tolist())
            all_labels_1.append(labels_1[i].data.cpu().numpy().tolist())
        # for i in range(len(preds_2)):
        #     all_preds_2.append(preds_2[i])
        print('preds_1: {:6d}'.format(len(all_preds_1)))

        # labels_1 = Variable(labels_1.data.float())
        # loss_1 = criterion_1(outputs_1, labels_1)

        # test_loss_1 += loss_1.data[0]
        #loss_2 = criterion_2(outputs_2, labels_2)
        #test_loss_2 += loss_2.data[0]
        #test_corrects_2 += torch.sum(preds_2 == labels_2.data)

    all_preds_1_cor = []
    all_labels_1_cor = []
    cor_count = 0
    for i in range(len(test_num_each)):
        for j in range(cor_count, cor_count + test_num_each[i] - (sequence_length - 1)):
            if j == cor_count:
                for k in range(sequence_length-1):
                    all_preds_1_cor.append(all_preds_1[sequence_length * j + k])
                    all_labels_1_cor.append(all_labels_1[sequence_length * j + k])
            all_preds_1_cor.append(all_preds_1[sequence_length * j + sequence_length - 1])
            all_labels_1_cor.append(all_labels_1[sequence_length * j + sequence_length - 1])
        cor_count += test_num_each[i] + 1 - sequence_length

    print('all_preds_1 : {:6d}'.format(len(all_preds_1)))
    print('all_labels_1: {:6d}'.format(len(all_labels_1)))
    print('cor_preds_1 : {:6d}'.format(len(all_preds_1_cor)))
    print('cor_labels_1: {:6d}'.format(len(all_labels_1_cor)))

    pt_preds_1 = torch.from_numpy(np.asarray(all_preds_1_cor, dtype=np.float32))
    pt_labels_1 = torch.from_numpy(np.asarray(all_labels_1_cor, dtype=np.float32))
    pt_labels_1 = Variable(pt_labels_1, requires_grad=False)
    pt_preds_1 = Variable(pt_preds_1, requires_grad=False)
    loss_1 = criterion_1(pt_preds_1, pt_labels_1)
    test_loss_1 += loss_1.data[0]

    pt_labels_1 = pt_labels_1.data
    pt_preds_1 = pt_preds_1.data
    sig_out = sig_f(pt_preds_1)
    preds_cor = torch.ByteTensor(sig_out > 0.5)
    preds_cor = preds_cor.long()
    pt_labels_1 = pt_labels_1.long()
    test_corrects_1 = torch.sum(preds_cor == pt_labels_1)

    test_elapsed_time = time.time() - test_start_time
    test_accuracy_1 = test_corrects_1 / num_test / 7
    #test_accuracy_2 = test_corrects_2 / num_test_we_use
    test_average_loss_1 = test_loss_1 / num_test / 7
    #test_average_loss_2 = test_loss_2 / num_test_we_use

    print('preds_1 num: {:6d}'.format(len(all_preds_1_cor)))

    save_test1 = int("{:4.0f}".format(test_accuracy_1 * 10000))
    #save_test2 = int("{:4.0f}".format(test_accuracy_2 * 10000))

    pred_1_name = model_pure_name + '_test1_' + str(save_test1) + '_crop_' + str(crop_type) + '.pkl'
    #pred_2_name = model_pure_name + '_test2_' + str(save_test2) + '_crop_' + str(crop_type) + '.pkl'

    with open(pred_1_name, 'wb') as f:
        pickle.dump(all_preds_1_cor, f)
    # with open(pred_2_name, 'wb') as f:
    #     pickle.dump(all_preds_2, f)

    print('test completed in:'
          ' {:2.0f}m{:2.0f}s'
          ' test loss_1: {:4.4f}'
          ' test accu_1: {:.4f}'
          .format(test_elapsed_time // 60,
                  test_elapsed_time % 60,
                  test_average_loss_1,
                  test_accuracy_1))
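The post-processing loop above de-overlaps the per-window predictions: for each video it keeps the first sequence_length - 1 frames of the first window and then the last frame of every window, leaving exactly one prediction per frame. A toy trace with made-up entries:

sequence_length = 3
test_num_each = [5]                            # one video with 5 frames
all_preds_1 = ['w0f0', 'w0f1', 'w0f2',         # window starting at frame 0
               'w1f1', 'w1f2', 'w1f3',         # window starting at frame 1
               'w2f2', 'w2f3', 'w2f4']         # window starting at frame 2
all_preds_1_cor, cor_count = [], 0
for n in test_num_each:
    for j in range(cor_count, cor_count + n - (sequence_length - 1)):
        if j == cor_count:
            for k in range(sequence_length - 1):
                all_preds_1_cor.append(all_preds_1[sequence_length * j + k])
        all_preds_1_cor.append(all_preds_1[sequence_length * j + sequence_length - 1])
    cor_count += n + 1 - sequence_length
print(all_preds_1_cor)                         # ['w0f0', 'w0f1', 'w0f2', 'w1f3', 'w2f4']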
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='cuda visible devices')
    parser.add_argument('--model_config',
                        default='config/model_config.json',
                        type=str,
                        required=False,
                        help='path of the model configuration file')
    parser.add_argument('--tokenizer_path',
                        default='data/vocabs.txt',
                        type=str,
                        required=False,
                        help='path of the vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/samples.json',
                        type=str,
                        required=False,
                        help='path of the samples file')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='save the tokenized samples file to this dir')
    parser.add_argument(
        '--raw',
        action='store_true',
        help='tokenize before training; not needed if the data was already tokenized with the same configuration'
    )
    parser.add_argument('--epochs', default=24, type=int, required=False)
    parser.add_argument('--batch_size', default=16, type=int, required=False)
    parser.add_argument('--lr', default=2e-4, type=float, required=False)
    parser.add_argument('--warmup_steps',
                        default=4000,
                        type=int,
                        required=False)
    parser.add_argument('--log_step',
                        default=4000,
                        type=int,
                        required=False,
                        help='period of reporting loss')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='save the model to this dir')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='pre-trained model dir')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999

    if torch.cuda.is_available():
        device = 'cuda'
        print(torch.cuda.get_device_name(0))
    else:
        device = 'cpu'
        print(device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir
    # tb_writer is used in the training loop below; a default SummaryWriter location is assumed here
    tb_writer = SummaryWriter(log_dir='tensorboard_summary/')
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    n_ctx=n_ctx)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')

    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0),
              'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])

    total_steps = int(full_len / n_ctx * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")

        device_ids = []
        for i in args.device.split(','):
            try:
                print(torch.cuda.get_device_name(int(i)))
                device_ids.append(int(i))
            except:
                pass
        model = DataParallel(model, device_ids=device_ids)
        multi_gpu = True
    print('starting training')
    overall_step = 0
    running_loss = 0

    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0),
              'r') as f:
        line = f.read().strip()
    tokens = line.split()
    tokens = [int(token) for token in tokens]
    start_point = 0
    samples = []

    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += n_ctx
    if start_point < len(tokens):
        samples.append(tokens[len(tokens) - n_ctx:])

    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))

        samples2 = copy.deepcopy(samples)
        random.shuffle(samples2)

        for step in range(len(samples2) // batch_size):  # drop last
            #  prepare data
            batch = samples2[step * batch_size:(step + 1) * batch_size]
            batch_inputs = torch.tensor(batch).long().to(device)
            #  forward pass
            outputs = model.forward(input_ids=batch_inputs,
                                    labels=batch_inputs)
            loss, logits = outputs[:2]

            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            #  loss backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            #  optimizer step
            if (overall_step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            if (overall_step + 1) % log_step == 0:
                tb_writer.add_scalar('loss',
                                     loss.item() * gradient_accumulation,
                                     overall_step)
                print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute, step + 1, epoch + 1,
                    running_loss * gradient_accumulation /
                    (log_step / gradient_accumulation)))
                running_loss = 0
            overall_step += 1

        print('saving model for epoch {}'.format(epoch + 1))
        temp_epoch = (epoch + 1) % 2  # save disk space

        if not os.path.exists(output_dir + 'model_epoch{}'.format(temp_epoch)):
            os.mkdir(output_dir + 'model_epoch{}'.format(temp_epoch))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(temp_epoch))
        #torch.save(scheduler, output_dir + 'model_epoch{}/scheduler.pt'.format(temp_epoch))
        #torch.save(optimizer, output_dir + 'model_epoch{}/optimizer.pt'.format(temp_epoch))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
Example #8
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length,
                                                  train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)
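    # get_useful_start_idx returns, for every video, the frame indices from
    # which a full window of `sequence_length` consecutive frames can start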

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 8000
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    #    np.random.seed(0)
    # np.random.shuffle(train_we_use_start_idx)
    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)

    num_train_all = len(train_idx)
    num_val_all = len(val_idx)
    print('num of train dataset: {:6d}'.format(num_train))
    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))

    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=SeqSampler(val_dataset, val_idx),
                            num_workers=workers,
                            pin_memory=False)
    model = resnet_lstm()
    if use_gpu:
        model = DataParallel(model)
        model.to(device)

    criterion = nn.CrossEntropyLoss(size_average=False)

    optimizer = None
    exp_lr_scheduler = None

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(),
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                # sgd_step: assumed step interval from the config (StepLR needs
                # a positive step_size)
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                # sgd_step: assumed step interval from the config (StepLR needs
                # a positive step_size)
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                   lr=learning_rate / 10)

    best_model_wts = copy.deepcopy(model.module.state_dict())
    best_val_accuracy = 0.0
    correspond_train_acc = 0.0
    best_epoch = 0

    record_np = np.zeros([epochs, 4])

    for epoch in range(epochs):
        # np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)

        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=SeqSampler(train_dataset, train_idx),
                                  num_workers=workers,
                                  pin_memory=False)

        # Sets the module in training mode.
        model.train()
        train_loss = 0.0
        train_corrects = 0
        batch_progress = 0.0
        train_start_time = time.time()
        for data in train_loader:
            optimizer.zero_grad()
            # free cached GPU memory
            torch.cuda.empty_cache()

            if use_gpu:
                inputs, labels = data[0].to(device), data[1].to(device)
                labels = labels[(sequence_length - 1)::sequence_length]
            else:
                inputs, labels = data[0], data[1]
                labels = labels[(sequence_length - 1)::sequence_length]
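            # keep only the label of the last frame in each window of
            # `sequence_length` frames; the model output is subsampled the same
            # way below so predictions and labels line up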

            inputs = inputs.view(-1, sequence_length, 3, 224, 224)

            outputs = model.forward(inputs)
            outputs = outputs[sequence_length - 1::sequence_length]

            _, preds = torch.max(outputs.data, 1)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.data.item()
            batch_corrects = torch.sum(preds == labels.data)
            train_corrects += batch_corrects

            batch_acc = float(
                batch_corrects) / train_batch_size * sequence_length

            batch_progress += 1
            if batch_progress * train_batch_size >= num_train_all:
                percent = 100.0
                print('Batch progress: %s [%d/%d] Batch acc:%.2f' %
                      (str(percent) + '%', num_train_all, num_train_all,
                       batch_acc),
                      end='\n')
            else:
                percent = round(
                    batch_progress * train_batch_size / num_train_all * 100, 2)
                print('Batch progress: %s [%d/%d] Batch acc:%.2f' %
                      (str(percent) + '%', batch_progress * train_batch_size,
                       num_train_all, batch_acc),
                      end='\r')

        train_elapsed_time = time.time() - train_start_time
        train_accuracy = float(train_corrects) / float(
            num_train_all) * sequence_length
        train_average_loss = train_loss / num_train_all * sequence_length

        # Sets the module in evaluation mode.
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_start_time = time.time()
        val_progress = 0

        with torch.no_grad():
            for data in val_loader:
                # free cached GPU memory
                torch.cuda.empty_cache()
                if use_gpu:
                    inputs, labels = data[0].to(device), data[1].to(device)
                    labels = labels[(sequence_length - 1)::sequence_length]
                else:
                    inputs, labels = data[0], data[1]
                    labels = labels[(sequence_length - 1)::sequence_length]
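                # below: for crop_type 5/10 every frame arrives as 5/10 crops,
                # so reshape, run each crop through the network, and average the
                # class scores across crops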

                if crop_type == 0 or crop_type == 1:
                    inputs = inputs.view(-1, sequence_length, 3, 224, 224)
                    outputs = model.forward(inputs)
                elif crop_type == 5:
                    inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                    inputs = inputs.view(-1, 3, 224, 224)
                    outputs = model.forward(inputs)
                    outputs = outputs.view(5, -1, 7)
                    outputs = torch.mean(outputs, 0)
                elif crop_type == 10:
                    inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                    inputs = inputs.view(-1, 3, 224, 224)
                    outputs = model.forward(inputs)
                    outputs = outputs.view(10, -1, 7)
                    outputs = torch.mean(outputs, 0)

                outputs = outputs[sequence_length - 1::sequence_length]

                _, preds = torch.max(outputs.data, 1)

                loss = criterion(outputs, labels)
                val_loss += loss.data.item()
                val_corrects += torch.sum(preds == labels.data)

                val_progress += 1
                if val_progress * val_batch_size >= num_val_all:
                    percent = 100.0
                    print('Val progress: %s [%d/%d]' %
                          (str(percent) + '%', num_val_all, num_val_all),
                          end='\n')
                else:
                    percent = round(
                        val_progress * val_batch_size / num_val_all * 100, 2)
                    print('Val progress: %s [%d/%d]' %
                          (str(percent) + '%', val_progress * val_batch_size,
                           num_val_all),
                          end='\r')

        val_elapsed_time = time.time() - val_start_time
        val_accuracy = float(val_corrects) / float(num_val_we_use)
        val_average_loss = val_loss / num_val_we_use
        print('epoch: {:4d}'
              ' train in: {:2.0f}m{:2.0f}s'
              ' train loss: {:4.4f}'
              ' train accu: {:.4f}'
              ' valid in: {:2.0f}m{:2.0f}s'
              ' valid loss: {:4.4f}'
              ' valid accu: {:.4f}'.format(epoch, train_elapsed_time // 60,
                                           train_elapsed_time % 60,
                                           train_average_loss, train_accuracy,
                                           val_elapsed_time // 60,
                                           val_elapsed_time % 60,
                                           val_average_loss, val_accuracy))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss)

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            correspond_train_acc = train_accuracy
            best_model_wts = copy.deepcopy(model.module.state_dict())
            best_epoch = epoch
        if val_accuracy == best_val_accuracy:
            if train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.module.state_dict())
                best_epoch = epoch

        record_np[epoch, 0] = train_accuracy
        record_np[epoch, 1] = train_average_loss
        record_np[epoch, 2] = val_accuracy
        record_np[epoch, 3] = val_average_loss

        save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
        save_train = int("{:4.0f}".format(correspond_train_acc * 10000))
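        # accuracies are encoded as 4-digit integers (e.g. 0.8765 -> 8765) so
        # they can be embedded in the checkpoint and record file names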
        model_name = "lstm" \
                     + "_epoch_" + str(best_epoch) \
                     + "_length_" + str(sequence_length) \
                     + "_opt_" + str(optimizer_choice) \
                     + "_mulopt_" + str(multi_optim) \
                     + "_flip_" + str(use_flip) \
                     + "_crop_" + str(crop_type) \
                     + "_batch_" + str(train_batch_size) \
                     + "_train_" + str(save_train) \
                     + "_val_" + str(save_val) \
                     + ".pth"

        torch.save(best_model_wts, model_name)
        print("best_epoch", str(best_epoch))

        record_name = "lstm" \
                      + "_epoch_" + str(best_epoch) \
                      + "_length_" + str(sequence_length) \
                      + "_opt_" + str(optimizer_choice) \
                      + "_mulopt_" + str(multi_optim) \
                      + "_flip_" + str(use_flip) \
                      + "_crop_" + str(crop_type) \
                      + "_batch_" + str(train_batch_size) \
                      + "_train_" + str(save_train) \
                      + "_val_" + str(save_val) \
                      + ".npy"
        np.save(record_name, record_np)

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(
        best_val_accuracy, correspond_train_acc))
Example #9
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_idx = [i for i in range(num_train)]
    np.random.seed(0)
    np.random.shuffle(train_idx)
    val_idx = [i for i in range(num_val)]

    print('num of train dataset: {:6d}'.format(num_train))
    print('num of valid dataset: {:6d}'.format(num_val))

    train_loader = DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        sampler=train_idx,
        num_workers=workers,
        pin_memory=False
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=val_batch_size,
        sampler=val_idx,
        num_workers=workers,
        pin_memory=False
    )

    # model = models.resnet50(pretrained=True)
    # num_ftrs = model.fc.in_features
    # model.fc = nn.Linear(num_ftrs, 7)
    model = pure_resnet()
    if use_gpu:
        model = model.cuda()
    model = DataParallel(model)
    criterion = nn.CrossEntropyLoss(size_average=False)
    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
            exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters())
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {'params': model.module.share.parameters()},
                {'params': model.module.fc1.parameters(), 'lr': 1e-3},
            ], lr=1e-4, momentum=0.9)

            exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {'params': model.module.share.parameters()},
                {'params': model.module.fc1.parameters(), 'lr': 1e-3},
            ], lr=1e-4)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy = 0.0
    correspond_train_acc = 0.0

    all_info = []
    all_train_accuracy = []
    all_train_loss = []
    all_val_accuracy = []
    all_val_loss = []

    for epoch in range(epochs):

        train_idx = [i for i in range(num_train)]
        np.random.seed(0)
        np.random.shuffle(train_idx)

        train_loader = DataLoader(
            train_dataset,
            batch_size=train_batch_size,
            sampler=train_idx,
            num_workers=workers,
            pin_memory=False
        )

        model.train()
        train_loss = 0.0
        train_corrects = 0
        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_2)
            optimizer.zero_grad()  # if the optimizer was built from net.parameters(), this is equivalent to net.zero_grad()

            outputs = model.forward(inputs)

            _, preds = torch.max(outputs.data, 1)

            loss = criterion(outputs, labels)
            # print(loss)
            loss.backward()
            # count +=1
            optimizer.step()
            train_loss += loss.data.item()
            train_corrects += torch.sum(preds == labels.data)
            # print(train_corrects)
        train_elapsed_time = time.time() - train_start_time
        train_accuracy = float(train_corrects) / num_train
        train_average_loss = train_loss / num_train

        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_2)
            outputs = model.forward(inputs)
            _, preds = torch.max(outputs.data, 1)

            loss = criterion(outputs, labels)
            val_loss += loss.data.item()
            val_corrects += torch.sum(preds == labels.data)
            # print(val_corrects)
        val_elapsed_time = time.time() - val_start_time
        val_accuracy = float(val_corrects) / num_val
        val_average_loss = val_loss / num_val
        print('epoch: {:4d}'
              ' train in: {:2.0f}m{:2.0f}s'
              ' train loss: {:4.4f}'
              ' train accu: {:.4f}'
              ' valid in: {:2.0f}m{:2.0f}s'
              ' valid loss: {:4.4f}'
              ' valid accu: {:.4f}'
              .format(epoch,
                      train_elapsed_time // 60,
                      train_elapsed_time % 60,
                      train_average_loss,
                      train_accuracy,
                      val_elapsed_time // 60,
                      val_elapsed_time % 60,
                      val_average_loss,
                      val_accuracy))

        if optimizer_choice == 0:
            exp_lr_scheduler.step(val_average_loss)

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            correspond_train_acc = train_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy == best_val_accuracy:
            if train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())

        all_train_loss.append(train_average_loss)
        all_train_accuracy.append(train_accuracy)
        all_val_loss.append(val_average_loss)
        all_val_accuracy.append(val_accuracy)

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(best_val_accuracy, correspond_train_acc))
    save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
    save_train = int("{:4.0f}".format(correspond_train_acc * 10000))
    model_name = "phase" \
                 + "_epoch_" + str(epochs) \
                 + "_opt_" + str(optimizer_choice) \
                 + "_mulopt_" + str(multi_optim) \
                 + "_flip_" + str(use_flip) \
                 + "_crop_" + str(crop_type) \
                 + "_batch_" + str(train_batch_size) \
                 + "_train_" + str(save_train) \
                 + "_val_" + str(save_val) \
                 + ".pth"
    torch.save(best_model_wts, model_name)
    all_info.append(all_train_accuracy)
    all_info.append(all_train_loss)
    all_info.append(all_val_accuracy)
    all_info.append(all_val_loss)
    record_name = "phase" \
                  + "_epoch_" + str(epochs) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train_" + str(save_train) \
                  + "_val_" + str(save_val) \
                  + ".pkl"
    with open(record_name, 'wb') as f:
        pickle.dump(all_info, f)
    print()
Example #10
def test_model(test_dataset, test_num_each):
    num_test = len(test_dataset)
    test_useful_start_idx = get_useful_start_idx(sequence_length,
                                                 test_num_each)

    num_test_we_use = len(test_useful_start_idx)

    test_we_use_start_idx = test_useful_start_idx[0:num_test_we_use]

    test_idx = []
    for i in range(num_test_we_use):
        for j in range(sequence_length):
            test_idx.append(test_we_use_start_idx[i] + j)

    num_test_all = len(test_idx)

    print('num test start idx : {:6d}'.format(len(test_useful_start_idx)))
    print('last idx test start: {:6d}'.format(test_useful_start_idx[-1]))
    print('num of test dataset: {:6d}'.format(num_test))
    print('num of test we use : {:6d}'.format(num_test_we_use))
    print('num of all test use: {:6d}'.format(num_test_all))
    # TODO sampler

    test_loader = DataLoader(test_dataset,
                             batch_size=test_batch_size,
                             sampler=SeqSampler(test_dataset, test_idx),
                             num_workers=workers)

    model = resnet_lstm()
    print(model)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
    model.load_state_dict(torch.load(model_name))
    model = DataParallel(model)

    if use_gpu:
        model.to(device)
    # multi-GPU inference should work directly here
    # model = model.module            # needs testing
    criterion = nn.CrossEntropyLoss(size_average=False)

    model.eval()
    test_loss = 0.0
    test_corrects = 0
    test_start_time = time.time()

    all_preds = []
    all_preds_score = []

    with torch.no_grad():

        for data in test_loader:

            # free cached GPU memory
            torch.cuda.empty_cache()

            if use_gpu:
                inputs, labels = data[0].to(device), data[1].to(device)
                labels = labels[(sequence_length - 1)::sequence_length]
            else:
                inputs, labels = data[0], data[1]
                labels = labels[(sequence_length - 1)::sequence_length]

            inputs = inputs.view(-1, sequence_length, 3, 224, 224)
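            # inputs now has shape (num_clips, sequence_length, 3, 224, 224)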

            if crop_type == 0 or crop_type == 1 or crop_type == 2 or crop_type == 3:
                outputs = model.forward(inputs)
            elif crop_type == 5:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs = model.forward(inputs)
                outputs = outputs.view(5, -1, 7)
                outputs = torch.mean(outputs, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs = model.forward(inputs)
                outputs = outputs.view(10, -1, 7)
                outputs = torch.mean(outputs, 0)

            outputs = outputs[sequence_length - 1::sequence_length]
            # softmax over the class dimension to get per-class scores
            sm = nn.Softmax(dim=1)
            probs = sm(outputs)
            possibility, preds = torch.max(probs.data, 1)
            print("possibility:", possibility)

            for i in range(len(preds)):
                all_preds.append(preds[i])
            for i in range(len(possibility)):
                all_preds_score.append(possibility[i])
            print("all_preds length:", len(all_preds))
            print("all_preds_score length:", len(all_preds_score))
            loss = criterion(outputs, labels)
            # TODO: this is tied to the batch size
            # old API: test_loss += loss.data[0]; loss.data.item() is used below
            print("preds:", preds.data.cpu())
            print("labels:", labels.data.cpu())

            test_loss += loss.data.item()
            test_corrects += torch.sum(preds == labels.data)
            print("test_corrects:", test_corrects)

    test_elapsed_time = time.time() - test_start_time
    test_accuracy = float(test_corrects) / float(num_test_we_use)
    test_average_loss = test_loss / num_test_we_use

    print('type of all_preds:', type(all_preds))
    print('leng of all preds:', len(all_preds))
    save_test = int("{:4.0f}".format(test_accuracy * 10000))
    pred_name = model_pure_name + '_test_' + str(save_test) + '_crop_' + str(
        crop_type) + '.pkl'
    pred_score_name = model_pure_name + '_test_' + str(
        save_test) + '_crop_' + str(crop_type) + '_score' + '.pkl'

    with open(pred_name, 'wb') as f:
        pickle.dump(all_preds, f)
    with open(pred_score_name, 'wb') as f:
        pickle.dump(all_preds_score, f)
    print('test elapsed: {:2.0f}m{:2.0f}s'
          ' test loss: {:4.4f}'
          ' test accu: {:.4f}'.format(test_elapsed_time // 60,
                                      test_elapsed_time % 60,
                                      test_average_loss, test_accuracy))
Example #11
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    if if_load_old == True:
        pdb.set_trace()
        print("please choose the previous one")
        time_cur = '1586310709.4848218'
    else:
        time_cur = time.time()

    writer = SummaryWriter(summary_dir + str(time_cur))
    logger = utils.get_log('log/' + str(time_cur) + '.txt')

    # num_train = len(train_dataset)
    # num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length,
                                                  train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)
    # train_idx = []
    # for i in range(num_train_we_use):
    #     for j in range(sequence_length):
    #         train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(len(val_useful_start_idx)):
        for j in range(sequence_length):
            val_idx.append(val_useful_start_idx[i] + j)

    # num_train_all = len(train_idx)
    num_val_all = len(val_idx)
    # print('num of train dataset: {:6d}'.format(num_train))
    # print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    # print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    # print('num of train we use : {:6d}'.format(num_train_we_use))
    # print('num of all train use: {:6d}'.format(num_train_all))
    # print('num of valid dataset: {:6d}'.format(num_val))
    # print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    # print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    # print('num of valid we use : {:6d}'.format(num_val_we_use))
    # print('num of all valid use: {:6d}'.format(num_val_all))

    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=SeqSampler(val_dataset, val_idx),
                            num_workers=workers,
                            pin_memory=True)

    #select data to train
    X = train_useful_start_idx
    select_num = math.floor(len(X) * quary_portion)  #every time choose 10%
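    # active-learning bookkeeping: X holds every candidate window start index,
    # `selected` / `unselected` partition X, and mask[i] == 0 marks X[i] as
    # already selected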
    if is_first_selection is True:
        pdb.set_trace()
        print("this is first selectin!!!! please check your parameter in .sh")
        import random
        mask = [1 for n in range(0, len(X))]
        selected = random.sample(X, select_num)
        for i in range(len(X)):
            if X[i] in selected:
                mask[i] = 0
        unselected = [X[i] for i in range(len(X)) if X[i] not in selected]
        save_select_data(save_select_txt_path, selected, unselected, mask,
                         time_cur)
    else:
        # load_select_data return: data['selected'],data['unselected'],data['mask']
        selected, unselected, mask = load_select_data(
            os.path.join(save_select_txt_path, json_name))
        if select_chose == 'non_local':
            print("this is non_local select")
            test_idx = []
            for i in range(len(unselected)):
                for j in range(sequence_length):
                    test_idx.append(unselected[i] + j)
            num_test_all = len(test_idx)
            subset = Subset(train_dataset, test_idx)
            selected, unselected, mask = non_local_select(
                val_model_path, subset, sequence_length, X, select_num,
                selected, unselected, mask)
        elif select_chose == 'DBN':
            print("this is DBN select")
            test_idx = []
            for i in range(len(unselected)):
                for j in range(sequence_length):
                    test_idx.append(unselected[i] + j)
            num_test_all = len(test_idx)
            subset = Subset(train_dataset, test_idx)
            selected, unselected, mask = DBN_select(val_model_path, subset,
                                                    sequence_length, X,
                                                    select_num, selected,
                                                    unselected, mask)
        elif select_chose == 'random':
            print("this is random select")
            test_idx = []
            for i in range(len(unselected)):
                for j in range(sequence_length):
                    test_idx.append(unselected[i] + j)
            num_test_all = len(test_idx)
            selected, unselected, mask = random_select_data(
                X, select_num, selected, unselected, mask)
            pdb.set_trace()
            selected = [
                selected[i] for i in range(len(selected))
                if selected[i] in test_idx
            ]
        else:
            print(
                "just using the previously saved selection to train, without selecting new data"
            )
            # pdb.set_trace()
        if is_save_json is True:
            save_select_data(save_select_txt_path, selected, unselected, mask,
                             time_cur)
    pdb.set_trace()
    # save_dir = save_dir_base + '/' + str(time_cur) + '_' + str(learning_rate) + '_tbs' + str(train_batch_size) \
    # + '_seq' + str(sequence_length) + '_opt' + str(optimizer_choice) + '_crop' + str(crop_type) + '_adjlr' \
    #  + '_adamgamma' + str(adamgamma) + '_adamstep' + str(adam_step) + '_weight_decay' + str(adamweightdecay) + '_block_num' + str(block_num)
    if train_mode == 'RESLSTM' or train_mode == 'RESLSTM_DBN':
        save_dir = save_dir_base + '/' + str(train_mode) + '/' + str(time_cur) + 'txtname' + json_name + '_' + str(learning_rate) + '_tbs' + str(train_batch_size) \
        + '_seq' + str(sequence_length) + '_opt' + str(optimizer_choice) + '_crop' + str(crop_type)  \
        + '_sgdstep' + str(sgd_step) + '_sgd_gamma' + str(sgd_gamma) + '_sgd_adjust_lr' + str(sgd_adjust_lr)+ '_weight_decay' + str(weight_decay)
    elif train_mode == 'RESLSTM_NOLOCAL' or train_mode == 'RESLSTM_NOLOCAL_dropout0.2':
        save_dir = save_dir_base + '/' + str(train_mode) + '/' + str(time_cur) + 'txtname' + json_name + '_' + str(learning_rate) + '_tbs' + str(train_batch_size) \
        + '_seq' + str(sequence_length) + '_opt' + str(optimizer_choice) + '_crop' + str(crop_type)  \
         + '_adamgamma' + str(adamgamma) + '_adamstep' + str(adam_step) + '_adamweightdecay' + str(adamweightdecay) + '_block_num' + str(block_num)

    if if_load_old == True:
        # Check if a checkpoint is in there
        if len([name for name in os.listdir(save_dir)]) > 0:
            print("Loading old model")
        else:
            print("nothing to load")
            pdb.set_trace()

    else:
        os.makedirs(save_dir)

    if train_mode == 'RESLSTM':
        model = resnet_lstm()
    elif train_mode == 'RESLSTM_NOLOCAL':
        model = resnet_lstm_nonlocal()
    elif train_mode == 'RESLSTM_NOLOCAL_dropout0.2':
        model = resnet_lstm_nonlocal()
        chk = 'results_ResLSTM_Nolocal/RESLSTM_NOLOCAL/1572847215.642195txtname42974_1572767025.1601517.json_0.0005_tbs400_seq10_opt1_crop0_adamgamma0.1_adamstep3_adamweightdecay0.0001_block_num1/checkpoint_best-23.pt'
        print("Restoring: ", chk)
        # Load
        state = torch.load(chk)
        # newdict = {}
        # for k,v in state['state_dict'].items():
        #     if k[0:7] != 'module.':
        #         name = 'module.' + k
        #         newdict[name] = v
        #     else:
        #         newdict[k] = v

        model.load_state_dict(state['state_dict'])
    elif train_mode == 'RESLSTM_DBN':
        model = resnet_lstm_dropout()
    else:
        print("not implemented")
        pdb.set_trace()
    # print (model)
    # pdb.set_trace()
    if use_gpu:
        model = DataParallel(model)
        model.to(device)

    criterion = nn.CrossEntropyLoss(size_average=False)

    optimizer = None
    exp_lr_scheduler = None

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(),
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            # optimizer = optim.Adam([
            #     {'params': model.module.share.parameters()},
            #     {'params': model.module.lstm.parameters(), 'lr': learning_rate},
            #     {'params': model.module.fc.parameters(), 'lr': learning_rate},
            # ], lr=learning_rate / 10)
            optim_params = list(
                filter(lambda p: p.requires_grad, model.parameters()))
            print('Optimizing %d parameters' % len(optim_params))
            optimizer = optim.Adam(optim_params,
                                   lr=learning_rate,
                                   weight_decay=adamweightdecay)
            exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                   step_size=adam_step,
                                                   gamma=adamgamma)

    #check if need load old weigth, optimizer
    if if_load_old:
        # Find last, not last best checkpoint
        files = glob(save_dir + '/*')
        global_steps = np.zeros([len(files)])
        for i in range(len(files)):
            # Use meta files to find the highest index
            if 'best' in files[i]:
                continue
            if 'checkpoint-' not in files[i]:
                continue
            # Extract global step
            nums = [int(s) for s in re.findall(r'\d+', files[i])]
            global_steps[i] = nums[-1]
        # Create path with maximum global step found
        chkPath = save_dir + '/checkpoint-' + str(int(
            np.max(global_steps))) + '.pt'
        print("Restoring: ", chkPath)
        # Load
        state = torch.load(chkPath)
        # Initialize model and optimizer
        newdict = {}
        for k, v in state['state_dict'].items():
            if k[0:7] != 'module.':
                name = 'module.' + k
                newdict[name] = v
            else:
                newdict[k] = v
        model.load_state_dict(newdict)
        # model.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])
        # pdb.set_trace()
        start_epoch = state['epoch']
        best_epoch = int(np.max(global_steps))
        best_val_accuracy = state['best_val_accuracy']
        correspond_train_acc = state['correspond_train_acc']
    else:
        start_epoch = 1
        best_epoch = -1
        best_val_accuracy = 0.0
        correspond_train_acc = 0.0
        if sv_init_model is not None:
            print("Restoring supervised model: ", sv_init_model)
            # Load
            state = torch.load(sv_init_model)
            # Initialize model and optimizer
            newdict = {}
            for k, v in state['state_dict'].items():
                if k[0:7] != 'module.':
                    name = 'module.' + k
                    newdict[name] = v
                else:
                    newdict[k] = v
            model.load_state_dict(newdict)

    best_model_wts = copy.deepcopy(model.module.state_dict())

    for epoch in range(start_epoch, epochs + 1):
        np.random.shuffle(selected)
        train_idx = []
        for i in range(len(selected)):
            for j in range(sequence_length):
                train_idx.append(selected[i] + j)
        num_train_all = len(train_idx)
        # subset = Subset(train_dataset,train_idx)
        # train_loader = DataLoader(
        #     subset,
        #     batch_size=train_batch_size,
        #     sampler=SeqSampler(subset, train_idx),
        #     num_workers=workers,
        #     pin_memory=True
        # )
        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=SeqSampler(train_dataset, train_idx),
                                  num_workers=workers,
                                  pin_memory=True)
        # pdb.set_trace()
        # Sets the module in training mode.
        model.train()
        train_loss = 0.0
        train_corrects = 0
        batch_progress = 0.0
        train_start_time = time.time()
        for data in train_loader:
            optimizer.zero_grad()
            # torch.cuda.empty_cache()
            with torch.set_grad_enabled(True):
                if use_gpu:
                    inputs, labels = data[0].to(device), data[1].to(device)
                    labels = labels[(sequence_length - 1)::sequence_length]
                else:
                    inputs, labels = data[0], data[1]
                    labels = labels[(sequence_length - 1)::sequence_length]
                # pdb.set_trace()
                inputs = inputs.view(-1, sequence_length, 3, 224, 224)
                # pdb.set_trace()
                outputs = model.forward(inputs)
                # pdb.set_trace()
                outputs = outputs[sequence_length - 1::sequence_length]
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.data.item()
                batch_corrects = torch.sum(preds == labels.data)
                train_corrects += batch_corrects

                batch_acc = float(
                    batch_corrects) / train_batch_size * sequence_length

                batch_progress += 1
                if batch_progress * train_batch_size >= num_train_all:
                    percent = 100.0
                    print('Batch progress: %s [%d/%d] Batch acc:%.2f' %
                          (str(percent) + '%', num_train_all, num_train_all,
                           batch_acc),
                          end='\n')
                else:
                    percent = round(
                        batch_progress * train_batch_size / num_train_all *
                        100, 2)
                    print('Batch progress: %s [%d/%d] Batch acc:%.2f' %
                          (str(percent) + '%', batch_progress *
                           train_batch_size, num_train_all, batch_acc),
                          end='\r')

        train_elapsed_time = time.time() - train_start_time
        train_accuracy = float(train_corrects) / float(
            num_train_all) * sequence_length
        train_average_loss = train_loss / num_train_all * sequence_length

        # Sets the module in evaluation mode.
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_start_time = time.time()
        val_progress = 0

        with torch.no_grad():
            for data in val_loader:
                # torch.cuda.empty_cache()
                if use_gpu:
                    inputs, labels = data[0].to(device), data[1].to(device)
                    labels = labels[(sequence_length - 1)::sequence_length]
                else:
                    inputs, labels = data[0], data[1]
                    labels = labels[(sequence_length - 1)::sequence_length]

                if crop_type == 0 or crop_type == 1:
                    inputs = inputs.view(-1, sequence_length, 3, 224, 224)
                    outputs = model.forward(inputs)

                elif crop_type == 5:
                    inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                    inputs = inputs.view(-1, 3, 224, 224)
                    outputs = model.forward(inputs)
                    outputs = outputs.view(5, -1, 7)
                    outputs = torch.mean(outputs, 0)
                elif crop_type == 10:
                    inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                    inputs = inputs.view(-1, 3, 224, 224)
                    outputs = model.forward(inputs)
                    outputs = outputs.view(10, -1, 7)
                    outputs = torch.mean(outputs, 0)

                outputs = outputs[sequence_length - 1::sequence_length]

                _, preds = torch.max(outputs.data, 1)

                loss = criterion(outputs, labels)
                val_loss += loss.data.item()
                val_corrects += torch.sum(preds == labels.data)

                val_progress += 1
                if val_progress * val_batch_size >= num_val_all:
                    percent = 100.0
                    print('Val progress: %s [%d/%d]' %
                          (str(percent) + '%', num_val_all, num_val_all),
                          end='\n')
                else:
                    percent = round(
                        val_progress * val_batch_size / num_val_all * 100, 2)
                    print('Val progress: %s [%d/%d]' %
                          (str(percent) + '%', val_progress * val_batch_size,
                           num_val_all),
                          end='\r')

        val_elapsed_time = time.time() - val_start_time
        val_accuracy = float(val_corrects) / float(
            num_val_all) * sequence_length
        val_average_loss = val_loss / num_val_all * sequence_length
        write_dict = {
            "train_loss": train_average_loss,
            "val_loss": val_average_loss,
            "train_accuracy": train_accuracy,
            "val_accuracy": val_accuracy
        }
        writer.add_scalars('scalar', write_dict, epoch)

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss)

        if optimizer_choice == 1:
            exp_lr_scheduler.step()

        if val_accuracy >= best_val_accuracy:
            oldBestInd = best_epoch
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.module.state_dict())
                best_epoch = epoch
            elif train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.module.state_dict())
                best_epoch = epoch
            # Delete the previously best checkpoint once a new best exists
            if best_epoch == epoch and os.path.isfile(
                    save_dir + '/checkpoint_best-' + str(oldBestInd) + '.pt'):
                os.remove(save_dir + '/checkpoint_best-' + str(oldBestInd) +
                          '.pt')
            # Save currently best model
            state = {
                'epoch': epoch,
                'state_dict': best_model_wts,
                'optimizer': optimizer.state_dict(),
                'best_val_accuracy': best_val_accuracy,
                'correspond_train_acc': correspond_train_acc
            }
            torch.save(state,
                       save_dir + '/checkpoint_best-' + str(epoch) + '.pt')

        # If it's not a new best, still save the current model and delete the previous (non-best) checkpoint
        # Save current model
        state = {
            'epoch': epoch,
            'state_dict': model.module.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_val_accuracy': best_val_accuracy,
            'correspond_train_acc': correspond_train_acc
        }
        torch.save(state, save_dir + '/checkpoint-' + str(epoch) + '.pt')
        # Delete last one
        if os.path.isfile(save_dir + '/checkpoint-' + str(epoch - 1) + '.pt'):
            os.remove(save_dir + '/checkpoint-' + str(epoch - 1) + '.pt')

        logger.info("\n")
        logger.info('Epoch: %d/%d (%d h %d m %d s)' %
                    (epoch, epochs, int(train_elapsed_time / 3600),
                     int(np.mod(train_elapsed_time, 3600) / 60),
                     int(np.mod(np.mod(train_elapsed_time, 3600), 60))) +
                    time.strftime("%d.%m.-%H:%M:%S", time.localtime()))
        logger.info('validation time: %d h %d m %d s' %
                    (int(val_elapsed_time / 3600),
                     int(np.mod(val_elapsed_time, 3600) / 60),
                     int(np.mod(np.mod(val_elapsed_time, 3600), 60))) +
                    time.strftime("%d.%m.-%H:%M:%S", time.localtime()))
        logger.info("training loss: %6f" % train_average_loss)
        logger.info("validation loss: %6f" % val_average_loss)
        logger.info("train accu: %6f" % train_accuracy)
        logger.info("validation accu: %6f" % val_accuracy)
        logger.info("best val accu: %6f at Epoch %d" %
                    (best_val_accuracy, best_epoch))
        logger.info("best corresponding train accu: %6f" %
                    correspond_train_acc)
    writer.close()
    def train(self):
        if not self.pretrained_model:
            model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
                config=self.model_config)
        else:
            model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
                self.pretrained_model)
        model.train()
        model.to(self.device)
        # count the number of model parameters
        num_parameters = 0
        parameters = model.parameters()
        for parameter in parameters:
            num_parameters += parameter.numel()
        self.print_and_log('number of model parameters = {}'.format(num_parameters))

        if self.do_tokenize:
            self.print_and_log("开始加载训练集")
            self.tokenize_and_save()
            self.print_and_log("训练集加载完毕")

        full_len = 0
        for i in range(self.split_num):
            with open(
                    self.tokenized_data_path +
                    'tokenized_train_{}.txt'.format(i), 'r') as f:
                full_len += len(
                    [int(item) for item in f.read().strip().split()])
        sample_num = int(full_len / self.stride)
        epoch_steps = int(full_len / self.stride / self.batch_size /
                          self.gradient_accumulation)
        total_steps = int(full_len / self.stride * self.epochs /
                          self.batch_size / self.gradient_accumulation)
        self.print_and_log('number of samples = {}'.format(sample_num))
        self.print_and_log('steps per epoch = {}'.format(epoch_steps))
        self.print_and_log('total steps = {}'.format(total_steps))

        optimizer = pytorch_transformers.AdamW(model.parameters(),
                                               lr=self.lr,
                                               correct_bias=True)
        scheduler = pytorch_transformers.WarmupLinearSchedule(
            optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)

        if self.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=self.fp16_opt_level)

        if torch.cuda.device_count() > 1:
            model = DataParallel(model)
            multi_gpu = True
        else:
            multi_gpu = False

        overall_step = 0
        running_loss = 0
        for epoch in range(self.epochs):
            self.print_and_log('epoch {}'.format(epoch + 1))
            now = datetime.now()
            self.print_and_log('time: {}'.format(now))
            optimizer.zero_grad()
            split_indices = np.linspace(0,
                                        self.split_num - 1,
                                        self.split_num,
                                        dtype=np.int32)
            random.shuffle(split_indices)
            for split_index in split_indices:
                with open(
                        self.tokenized_data_path +
                        'tokenized_train_{}.txt'.format(split_index),
                        'r') as f:
                    line = f.read().strip()
                all_ids = line.split()
                all_ids = [int(x) for x in all_ids]
                start_point = 0
                samples = []
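                # slide a window of n_ctx tokens over the file with step
                # `stride`; when stride < n_ctx, consecutive samples overlap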
                while start_point < len(all_ids) - self.n_ctx:
                    samples.append(all_ids[start_point:start_point +
                                           self.n_ctx])
                    start_point += self.stride
                random.shuffle(samples)
                for i in range(len(samples) // self.batch_size):  # drop last
                    batch = samples[i * self.batch_size:(i + 1) *
                                    self.batch_size]
                    batch_labels = torch.tensor(batch, dtype=torch.long).to(
                        self.device)
                    batch_inputs = torch.tensor(batch, dtype=torch.long).to(
                        self.device)
                    outputs = model.forward(input_ids=batch_inputs,
                                            labels=batch_labels)
                    loss, logits = outputs[:2]

                    if multi_gpu:
                        loss = loss.mean()

                    if self.gradient_accumulation > 1:
                        loss = loss / self.gradient_accumulation

                    #  loss backward
                    if self.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                            torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer),
                                self.max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       self.max_grad_norm)

                    if (i + 1) % self.gradient_accumulation == 0:
                        running_loss += loss.item()
                        optimizer.step()
                        optimizer.zero_grad()
                        scheduler.step()
                        overall_step += 1

                    if (overall_step +
                            1) % self.log_step == 0 and running_loss != 0:
                        self.print_and_log(
                            'now time: {}:{}. Step {} of epoch {}, loss {}'.
                            format(
                                datetime.now().hour,
                                datetime.now().minute, overall_step + 1,
                                epoch + 1, running_loss *
                                self.gradient_accumulation / self.log_step))
                        running_loss = 0

            if not os.path.exists(self.output_dir +
                                  'model_epoch{}'.format(epoch + 1)):
                os.makedirs(self.output_dir +
                            'model_epoch{}'.format(epoch + 1))
            # unwrap DataParallel before grabbing the underlying transformer
            gpt2_model = model.module if hasattr(model, 'module') else model
            model_to_save = gpt2_model.transformer
            model_to_save.save_pretrained(self.output_dir +
                                          'model_epoch{}'.format(epoch + 1))
            # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
            # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))

            then = datetime.now()
            self.print_and_log('time: {}'.format(then))
            self.print_and_log('time for one epoch: {}'.format(then - now))

        self.print_and_log('training finished')
        self.f_log.close()
        if not os.path.exists(self.output_dir + 'final_model'):
            os.makedirs(self.output_dir + 'final_model')
        # unwrap DataParallel before grabbing the underlying transformer
        gpt2_model = model.module if hasattr(model, 'module') else model
        model_to_save = gpt2_model.transformer
        model_to_save.save_pretrained(self.output_dir + 'final_model')
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 8000
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]  # start positions of the training sequences
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    np.random.seed(0)
    np.random.shuffle(train_we_use_start_idx)
    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j * srate)  # frame indices for training; each image is one sample

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j * srate)

    num_train_all = float(len(train_idx))
    num_val_all = float(len(val_idx))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(int(num_train_all)))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(int(num_val_all)))

    val_loader = DataLoader(
        val_dataset,
        batch_size=val_batch_size,
        # sampler=val_idx,
        sampler=SeqSampler(val_dataset, val_idx),
        num_workers=workers,
        pin_memory=False
    )
    model = resnet_lstm()
    if use_gpu:
        model = model.cuda()

    model = DataParallel(model)
    criterion = nn.CrossEntropyLoss()
    '''
    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, dampening=dampening,
                                  weight_decay=weight_decay, nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
            ], lr=learning_rate / 10, momentum=momentum, dampening=dampening,
                weight_decay=weight_decay, nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {'params': model.module.share.parameters()},
                {'params': model.module.lstm.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
            ], lr=learning_rate / 10)
    '''
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy = 0.0
    correspond_train_acc = 0.0

    record_np = np.zeros([epochs, 4])

    for epoch in range(epochs):
        np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j * srate)

        train_loader = DataLoader(
            train_dataset,
            batch_size=train_batch_size,
            sampler=SeqSampler(train_dataset, train_idx),
            num_workers=workers,
            pin_memory=False
        )

        model.train()
        train_loss = 0.0
        train_corrects = 0
        train_start_time = time.time()
        num = 0
        train_num = 0
        for data in train_loader:
            num = num + 1
            #inputs, labels_phase, kdata = data
            inputs, labels_phase = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_phase.cuda())
                #kdatas = Variable(kdata.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_phase)
                #kdatas = Variable(kdata)
            optimizer.zero_grad()
            #outputs = model.forward(inputs, kdatas)
            outputs = model.forward(inputs)
            # nn.CrossEntropyLoss expects raw logits, so compute the loss before softmax
            loss = criterion(outputs, labels)
            outputs = F.softmax(outputs, dim=1)
            _, preds = torch.max(outputs.data, 1)
            print(num)
            print(preds)
            print(labels)

            loss.backward()
            optimizer.step()
            train_loss += loss.data
            train_corrects += torch.sum(preds == labels.data)
            train_num += labels.shape[0]
            print(train_corrects.cpu().numpy() / train_num)
            if train_corrects.cpu().numpy() / train_num > 0.75:
                torch.save(copy.deepcopy(model.state_dict()), 'test.pth')

        train_elapsed_time = time.time() - train_start_time
        train_accuracy = train_corrects.cpu().numpy() / train_num
        train_average_loss = train_loss / train_num

        # begin eval
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_num = 0
        val_start_time = time.time()
        for data in val_loader:
            #inputs, labels_phase, kdata = data
            inputs, labels_phase = data
            #labels_phase = labels_phase[(sequence_length - 1)::sequence_length]
            #kdata = kdata[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_phase.cuda())
                #kdatas = Variable(kdata.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_phase)
                #kdatas = Variable(kdata)

            if crop_type == 0 or crop_type == 1:
                #outputs = model.forward(inputs, kdatas)
                outputs = model.forward(inputs)
            elif crop_type == 5:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                #outputs = model.forward(inputs, kdatas)
                outputs = model.forward(inputs)
                outputs = outputs.view(5, -1, 3)
                outputs = torch.mean(outputs, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                #outputs = model.forward(inputs, kdatas)
                outputs = model.forward(inputs)
                outputs = outputs.view(10, -1, 3)
                outputs = torch.mean(outputs, 0)

            #outputs = outputs[sequence_length - 1::sequence_length]

            _, preds = torch.max(outputs.data, 1)
            print(num)
            print(preds)
            print(labels)
            loss = criterion(outputs, labels)
            val_loss += loss.data
            val_corrects += torch.sum(preds == labels.data)
            val_num += labels.shape[0]
        val_elapsed_time = time.time() - val_start_time
        val_accuracy = val_corrects.cpu().numpy() / val_num
        val_average_loss = val_loss / val_num
        print('epoch: {:4d}'
              ' train in: {:2.0f}m{:2.0f}s'
              ' train loss: {:4.4f}'
              ' train accu: {:.4f}'
              ' valid in: {:2.0f}m{:2.0f}s'
              ' valid loss: {:4.4f}'
              ' valid accu: {:.4f}'
              .format(epoch,
                      train_elapsed_time // 60,
                      train_elapsed_time % 60,
                      train_average_loss,
                      train_accuracy,
                      val_elapsed_time // 60,
                      val_elapsed_time % 60,
                      val_average_loss,
                      val_accuracy))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss)

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            correspond_train_acc = train_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        if val_accuracy == best_val_accuracy:
            if train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())

        record_np[epoch, 0] = train_accuracy
        record_np[epoch, 1] = train_average_loss
        record_np[epoch, 2] = val_accuracy
        record_np[epoch, 3] = val_average_loss
        np.save(str(epoch) + '.npy', record_np)

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(best_val_accuracy, correspond_train_acc))

    save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
    save_train = int("{:4.0f}".format(correspond_train_acc * 10000))
    model_name = "lstm" \
                 + "_epoch_" + str(epochs) \
                 + "_length_" + str(sequence_length) \
                 + "_opt_" + str(optimizer_choice) \
                 + "_mulopt_" + str(multi_optim) \
                 + "_flip_" + str(use_flip) \
                 + "_crop_" + str(crop_type) \
                 + "_batch_" + str(train_batch_size) \
                 + "_train_" + str(save_train) \
                 + "_val_" + str(save_val) \
                 + ".pth"

    torch.save(best_model_wts, model_name)

    record_name = "lstm" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train_" + str(save_train) \
                  + "_val_" + str(save_val) \
                  + ".npy"
    np.save(record_name, record_np)
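
# The DataLoaders above pass a precomputed index list through SeqSampler so that frames
# are read in exactly the sequence order built in train_idx/val_idx. SeqSampler itself is
# not shown in this excerpt; a minimal sampler with that behavior (assuming it only needs
# to replay the given indices) might look like the sketch below.
from torch.utils.data import Sampler


class SeqSamplerSketch(Sampler):
    """Yields a fixed list of dataset indices in the given order."""

    def __init__(self, data_source, idx):
        self.data_source = data_source
        self.idx = idx

    def __iter__(self):
        return iter(self.idx)

    def __len__(self):
        return len(self.idx)

# usage: DataLoader(train_dataset, batch_size=train_batch_size,
#                   sampler=SeqSamplerSketch(train_dataset, train_idx))
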
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='path to the raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=64, type=int, required=False, help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='number of warm-up steps')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report the loss every this many steps; set to a multiple of gradient_accumulation')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the sliding window over the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to include')
    parser.add_argument('--max_length', default=256, type=int, required=False, help='maximum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the pretrained model to start from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard log directory')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe")
    parser.add_argument('--max_steps_perEpoch_perPiece', default=1000000, type=int, required=False)
    parser.add_argument('--steps_savemodel', default=10000, type=int, required=False, help='save the model every this many steps')
    parser.add_argument('--padding', action='store_true', help='pad inputs to a fixed length')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    #os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    padding = args.padding
    max_length = args.max_length
    #tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))
    multi_gpu = False
    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    #scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
    #                                                      t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True
    print('starting training')
    step_loss = 0
    running_loss = 10
    loss_ = 10
    data_iter = iterData(args.tokenized_data_path, rate=1.0, batch_size=batch_size, epochs=epochs)
    step = 0
    epoch0 = -1
    while True:
        data = next(data_iter)
        if data == '__STOP__':
            break
        epoch, epochs, idx_file, nb_files, batch_inputs = data
        random.shuffle(batch_inputs)
        batch_inputs = torch.tensor(batch_inputs).long().to(device)
        #  forward pass
        outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
        loss, logits = outputs[:2]
        #  get loss
        if multi_gpu:
            loss = loss.mean()
        if gradient_accumulation > 1:
            loss = loss / gradient_accumulation
        #  loss backward
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        #  optimizer step
        if (step + 1) % gradient_accumulation == 0:
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            step_loss += 1
            #scheduler.step()
        if (step + 1) % log_step == 0:
            loss_ = running_loss * gradient_accumulation / (log_step / gradient_accumulation)
            print('now time: {}:{}. step: {}, progress-innerEpoch: {}/{}, progress-outerEpoch: {}/{}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute,
                    step+1,
                    idx_file+1,
                    nb_files,
                    epoch + 1,
                    epochs,
                    loss_))
            running_loss = 0
        if step % args.steps_savemodel == 0:
            print('saving model for epoch {}'.format(epoch + 1))
            output_dir_ = output_dir + 'model_epoch{}_step{}_loss-{}'.format(epoch + 1, step, '%0.2f' % loss_)
            if not os.path.exists(output_dir_):
                os.mkdir(output_dir_)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir_)
        step += 1
        if epoch != epoch0:
            if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
                os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
            epoch0 = epoch
            print('epoch {} finished'.format(epoch + 1))
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
    print('training finished')
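
# The training loop above consumes a generator produced by iterData: it yields
# (epoch, epochs, idx_file, nb_files, batch_inputs) tuples and finally the sentinel
# '__STOP__'. iterData is not defined in this excerpt; a minimal generator with the same
# interface, assuming each tokenized piece is a flat list of token ids, could look like:
def iter_data_sketch(pieces, n_ctx, batch_size, epochs):
    for epoch in range(epochs):
        for idx_file, tokens in enumerate(pieces):
            # cut the piece into fixed-length, non-overlapping windows
            samples = [tokens[i:i + n_ctx]
                       for i in range(0, len(tokens) - n_ctx + 1, n_ctx)]
            for b in range(len(samples) // batch_size):  # drop the last partial batch
                yield epoch, epochs, idx_file, len(pieces), samples[b * batch_size:(b + 1) * batch_size]
    yield '__STOP__'

# usage: gen = iter_data_sketch([list(range(1024))], n_ctx=128, batch_size=4, epochs=1)
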
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='path to the raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warm-up steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report the loss every this many steps')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='stride of the sliding window over the training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed precision training')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=1,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output directory')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='path of the pretrained model to start from')
    parser.add_argument('--segment', action='store_true', help='tokenize Chinese at the word level')
    '''
    Configuration overrides -------------------------------------------------------
    '''
    args = parser.parse_args()
    args.device = '1'
    args.batch_size = 5
    from tokenizations import tokenization
    proj_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    vocab_file_path = "tokenizations/clue-vocab.txt"
    # encode text with the vocabulary shipped with the pretrained model
    text = '我是一个人'
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path,
                                           do_lower_case=True)
    line = tokenization.convert_to_unicode(text)
    bert_tokens = tokenizer.tokenize(line)
    encoded = tokenizer.convert_tokens_to_ids(bert_tokens)

    # Note how the dataset is prepared below.
    args.raw = True
    args.raw_data_path = '172166.txt'  # the "-small" suffix marks the small version
    args.epochs = 200
    args.output_dir = 'model/'  # the final model ends up under output_dir + 'final_model'
    args.num_pieces = 10  # number of pieces to split the corpus into
    from pre_data_byOnlyOneBook import get_data as get_data
    name2 = args.raw_data_path.split('.')[0]
    get_data(name2 + '.txt', name2 + '.json')
    # The generated 166893.json can then be used directly.
    '''
    ------------------------------------------------------------------------------
    '''

    #--------------- configuration done
    print('args:\n' + args.__repr__())
    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)  # important: n_ctx in this config sets the sequence length
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    # full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path,
                                                do_lower_case=True)
    '''
    full_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True)
    '''
    '''
    Alternatively, the GPT-2 tokenizer could be used directly.
    '''

    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir
    #  'data/tokenized/'  holds the encoded corpus
    if raw:
        print('building files')
        build_files(raw_data_path=name2 + '.json',
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    import math
    total_steps = math.ceil(full_len / stride * epochs / batch_size /
                            gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(),
                                   lr=lr,
                                   correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        loss_save = []
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:  # n_ctx is the context length
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):  # append one final window aligned to the end
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range((len(samples) // batch_size) + 1):  # one extra iteration to cover the tail

                #  prepare data
                # break once the index runs past the end, i.e. the remainder cannot form a full batch
                if step * batch_size > len(samples) - 1:
                    break
                batch = samples[step * batch_size:(step + 1) * batch_size]

                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass: the inputs and labels are identical here, which looks odd at first
                '''
                For comparison, the CTRL model prepares its data like this:

                    flag_input, inputs = numericalize(domain+tokenized_train_text[i:i+seq_length])  # note the domain token is prepended to the input
                    flag_output, outputs = numericalize(tokenized_train_text[i:i+seq_length+1])  # CTRL takes i:j as input and i:j+1 as output

                Background on this data layout:
                https://www.cnblogs.com/wwj99/p/12503545.html

                For GPT-2 the samples and labels really are the same; the model shifts the labels internally.
                '''
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (step + 1) % log_step == 0:
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(datetime.now().hour,
                                datetime.now().minute,
                                (step + 1) // gradient_accumulation, piece_num,
                                epoch + 1, running_loss / log_step))
                    loss_save.append(running_loss / log_step)
                    running_loss = 0
            piece_num += 1
        #-------------- check whether to stop training early
        last = np.array(loss_save[-10:])
        avg1 = last.mean()
        # stop once the last ten logged losses all lie within 3% of their mean
        # and the most recent loss is small enough:
        tmp = np.all(last >= avg1 * 0.97) and np.all(last <= avg1 * 1.03)
        if len(last) >= 10 and tmp and loss_save[-1] < 0.05:
            break


#--------------------

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.makedirs(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
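
# Example #15 above slices each tokenized piece into overlapping windows of length n_ctx
# with a fixed stride, then appends one final window aligned to the end of the piece.
# The same logic as a small standalone helper (the function name is illustrative):
def make_windows(tokens, n_ctx, stride):
    samples = []
    start_point = 0
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += stride
    if start_point < len(tokens):
        samples.append(tokens[len(tokens) - n_ctx:])  # final window, aligned to the end
    return samples

# usage: make_windows(list(range(10)), n_ctx=4, stride=3) -> [[0,1,2,3], [3,4,5,6], [6,7,8,9]]
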
Example #16
def test_model(test_dataset, test_num_each):
    num_test = len(test_dataset)
    test_useful_start_idx = get_useful_start_idx(sequence_length, test_num_each)

    num_test_we_use = len(test_useful_start_idx)

    test_we_use_start_idx = test_useful_start_idx[0:num_test_we_use]

    test_idx = []
    for i in range(num_test_we_use):
        for j in range(sequence_length):
            test_idx.append(test_we_use_start_idx[i] + j)

    num_test_all = len(test_idx)

    print('num test start idx : {:6d}'.format(len(test_useful_start_idx)))
    print('last idx test start: {:6d}'.format(test_useful_start_idx[-1]))
    print('num of test dataset: {:6d}'.format(num_test))
    print('num of test we use : {:6d}'.format(num_test_we_use))
    print('num of all test use: {:6d}'.format(num_test_all))


    test_loader = DataLoader(test_dataset,
                             batch_size=test_batch_size,
                             sampler=SeqSampler(test_dataset, test_idx),
                             num_workers=workers)

    # model = i3_res50_nl_new_test(400)
    # model = i3_res50_nl_new_test_1block(400)
    model = resnet_lstm_nonlocal()
    # num_ftrs = model.fc.in_features
    # model.fc = nn.Linear(num_ftrs, class_num) 
    print(model)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!") 
    # handle checkpoints saved from a DataParallel model, whose keys carry a 'module.' prefix
    state = torch.load(model_name)
    newdict = {}
    for k, v in state['state_dict'].items():
        if k[0:7] == 'module.':
            name = k[7:]
            newdict[name] = v
        else:
            newdict[k] = v
    model.load_state_dict(newdict)
    model = DataParallel(model)

    if use_gpu:
        model.to(device)

    criterion = nn.CrossEntropyLoss(size_average=False)

    model.eval()
    test_loss = 0.0
    test_corrects = 0
    test_start_time = time.time()

    all_preds = []
    pth_blobs = {}
    # f = open('./possibility.txt', 'a')

    with torch.no_grad():

        for data in test_loader:
            
            # torch.cuda.empty_cache()            
            
            if use_gpu:
                inputs, labels = data[0].to(device), data[1].to(device)
                labels = labels[(sequence_length - 1)::sequence_length]
            else:
                inputs, labels = data[0], data[1]
                labels = labels[(sequence_length - 1)::sequence_length]

            if crop_type == 0 or crop_type == 1:
                inputs = inputs.view(-1, sequence_length, 3, 224, 224)
                outputs = model.forward(inputs)
       
            
            outputs = outputs[sequence_length - 1::sequence_length]

            _, preds = torch.max(outputs.data, 1)

            for i in range(len(preds)):
                all_preds.append(preds[i])
            print("all_preds length:",len(all_preds))
            loss = criterion(outputs, labels)
            test_loss += loss.data.item()
            test_corrects += torch.sum(preds == labels.data)

            print("preds:",preds)
            print("labels:",labels.data)
            # pdb.set_trace()
            test_loss += loss.data.item()
            print("test_corrects:",test_corrects)
            # f.write("preds:"+str(preds.cpu().numpy()))
            # f.write('\t')
            # f.write("labels:" + str(labels.data.cpu().numpy()))
            # f.write('\t')
            # f.write("possibility:" + str(possibility.cpu().numpy()))
            # f.write('\n')

    # f.close()

    test_elapsed_time = time.time() - test_start_time
    test_accuracy = float(test_corrects) / float(num_test_we_use)
    test_average_loss = test_loss / num_test_we_use

    # print('type of all_preds:', type(all_preds))
    # print('leng of all preds:', len(all_preds))
    save_test = int("{:4.0f}".format(test_accuracy * 10000))
    pred_name = model_pure_name + '_test_' + str(save_test) + '_crop_' + str(crop_type) + '.pkl'

    with open(pred_name, 'wb') as f:
        pickle.dump(all_preds, f)
    print('test elapsed: {:2.0f}m{:2.0f}s'
          ' test loss: {:4.4f}'
          ' test accu: {:.4f}'
          .format(test_elapsed_time // 60,
                  test_elapsed_time % 60,
                  test_average_loss, test_accuracy))
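
# The test code above has to load a checkpoint that may have been saved from a
# DataParallel-wrapped model, whose parameter names carry a 'module.' prefix. The same
# key-renaming step as a reusable helper (the helper name is illustrative):
def strip_module_prefix(state_dict):
    cleaned = {}
    for k, v in state_dict.items():
        cleaned[k[7:] if k.startswith('module.') else k] = v
    return cleaned

# usage: model.load_state_dict(strip_module_prefix(torch.load(model_name)['state_dict']))
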
Example #17
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length,
                                                  train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 8000
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    #    np.random.seed(0)
    # np.random.shuffle(train_we_use_start_idx)
    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)

    num_train_all = len(train_idx)
    num_val_all = len(val_idx)
    print('num of train dataset: {:6d}'.format(num_train))
    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))

    train_loader = DataLoader(train_dataset,
                              batch_size=train_batch_size,
                              sampler=train_idx,
                              num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=val_idx,
                            num_workers=workers,
                            pin_memory=False)
    model = resnet_lstm_dp()
    if use_gpu:
        model = model.cuda()

    model = DataParallel(model)
    criterion = nn.CrossEntropyLoss(size_average=False)

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(),
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_adjust_lr,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_adjust_lr,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                   lr=learning_rate / 10)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy = 0.0
    correspond_train_acc = 0.0

    all_info = []
    all_train_accuracy = []
    all_train_loss = []
    all_val_accuracy = []
    all_val_loss = []

    for epoch in range(epochs):
        # np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)

        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=train_idx,
                                  num_workers=workers,
                                  pin_memory=False)

        model.train()
        train_loss = 0.0
        train_corrects = 0
        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_2)
            optimizer.zero_grad()
            outputs = model.forward(inputs)
            _, preds = torch.max(outputs.data, 1)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.data.item()
            train_corrects += torch.sum(preds == labels.data)
        train_elapsed_time = time.time() - train_start_time
        train_accuracy = train_corrects / num_train_all
        train_average_loss = train_loss / num_train_all

        # begin eval
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
            labels_2 = labels_2[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels_2)

            if crop_type == 0 or crop_type == 1:
                outputs = model.forward(inputs)
            elif crop_type == 5:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs = model.forward(inputs)
                outputs = outputs.view(5, -1, 7)
                outputs = torch.mean(outputs, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs = model.forward(inputs)
                outputs = outputs.view(10, -1, 7)
                outputs = torch.mean(outputs, 0)

            outputs = outputs[sequence_length - 1::sequence_length]

            _, preds = torch.max(outputs.data, 1)

            loss = criterion(outputs, labels)
            val_loss += loss.data.item()
            val_corrects += torch.sum(preds == labels.data)
        val_elapsed_time = time.time() - val_start_time
        val_accuracy = val_corrects / num_val_we_use
        val_average_loss = val_loss / num_val_we_use
        print('epoch: {:4d}'
              ' train in: {:2.0f}m{:2.0f}s'
              ' train loss: {:4.4f}'
              ' train accu: {:.4f}'
              ' valid in: {:2.0f}m{:2.0f}s'
              ' valid loss: {:4.4f}'
              ' valid accu: {:.4f}'.format(epoch, train_elapsed_time // 60,
                                           train_elapsed_time % 60,
                                           train_average_loss, train_accuracy,
                                           val_elapsed_time // 60,
                                           val_elapsed_time % 60,
                                           val_average_loss, val_accuracy))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss)

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            correspond_train_acc = train_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        if val_accuracy == best_val_accuracy:
            if train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())
        all_train_loss.append(train_average_loss)
        all_train_accuracy.append(train_accuracy)
        all_val_loss.append(val_average_loss)
        all_val_accuracy.append(val_accuracy)

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(
        best_val_accuracy, correspond_train_acc))

    save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
    save_train = int("{:4.0f}".format(correspond_train_acc * 10000))
    model_name = "lstm" \
                 + "_epoch_" + str(epochs) \
                 + "_length_" + str(sequence_length) \
                 + "_opt_" + str(optimizer_choice) \
                 + "_mulopt_" + str(multi_optim) \
                 + "_flip_" + str(use_flip) \
                 + "_crop_" + str(crop_type) \
                 + "_batch_" + str(train_batch_size) \
                 + "_train_" + str(save_train) \
                 + "_val_" + str(save_val) \
                 + ".pth"

    torch.save(best_model_wts, model_name)

    all_info.append(all_train_accuracy)
    all_info.append(all_train_loss)
    all_info.append(all_val_accuracy)
    all_info.append(all_val_loss)

    record_name = "lstm" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train_" + str(save_train) \
                  + "_val_" + str(save_val) \
                  + ".pkl"

    with open(record_name, 'wb') as f:
        pickle.dump(all_info, f)
    print()
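
# The index construction above relies on get_useful_start_idx(sequence_length, num_each),
# which is not defined in this excerpt. Judging from how its output is used, it should
# return, for a dataset of several videos concatenated back to back (num_each frames per
# video), every global index where a window of sequence_length frames still fits inside a
# single video. A sketch under that assumption:
def get_useful_start_idx_sketch(sequence_length, num_each):
    start_idx = []
    offset = 0
    for n in num_each:
        for i in range(n - sequence_length + 1):
            start_idx.append(offset + i)
        offset += n
    return start_idx

# usage: get_useful_start_idx_sketch(3, [5, 4]) -> [0, 1, 2, 5, 6]
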
Example #18
def train_process():
    global global_step

    summary_writer = tensorboardX.SummaryWriter(
        log_dir=config.result_sub_folder, comment=config.comment)
    train_tfs = compose(
        [rotate_y(),
         rand_scale(),
         rand_translate(),
         jitter(),
         normalize()])

    test_tfs = normalize()
    scene_tfs = compose([normalize(), to_tensor()])
    # prepare data
    print("config.dataset")
    if config.dataset == "ModelNet40":
        train_set = ModelNet40(partition='train', transforms=train_tfs)
        valid_set = ModelNet40(partition='test', transforms=test_tfs)
    elif config.dataset == "Mnist":
        train_set = Mnist(partition='train')
        valid_set = Mnist(partition='test')
    elif config.dataset == "ScanNet":
        train_set = ScanNet(partition='train', transforms=train_tfs)
        valid_set = ScanNet(partition='test', transforms=test_tfs)
    elif config.dataset == "ModelNet10":
        train_set = ModelNet10(partition='train')
        valid_set = ModelNet10(partition='test')
    elif config.dataset == "S3DIS":
        train_set = S3DIS(partition='train', transforms=train_tfs)
        valid_set = S3DIS(partition='test', transforms=test_tfs)
        scene_set = S3DIS(partition='data/zero_0.h5', transforms=test_tfs)
    elif config.dataset == "ShapeNetParts":
        train_set = ShapeNetPart(partition='trainval', transforms=train_tfs)
        valid_set = ShapeNetPart(partition='test', transforms=test_tfs)
    elif config.dataset == "Cifar10":
        train_set = Cifar10(partition='train')
        valid_set = Cifar10(partition='test')

    else:
        raise NotImplementedError
    train_loader = DataLoader(train_set,
                              batch_size=config.train.batch_size,
                              shuffle=True,
                              num_workers=config.num_workers,
                              drop_last=True)

    valid_loader = DataLoader(valid_set,
                              batch_size=config.validation.batch_size,
                              shuffle=False,
                              num_workers=config.num_workers,
                              drop_last=False)
    if config.dataset == "S3DIS":
        scene_loader = DataLoader(scene_set,
                                  batch_size=config.validation.batch_size,
                                  shuffle=False,
                                  num_workers=config.num_workers,
                                  drop_last=False)

    print('train set size: {}'.format(len(train_set)))
    print('valid set size: {}'.format(len(valid_set)))
    if config.dataset == "S3DIS":
        print('scene set size: {}'.format(len(scene_set)))

    # prepare model
    net = create_model(config.base_model).to(config.device)

    # prepare optimizer
    if config.train.optimizer == 'SGD':
        optimizer = optim.SGD(net.parameters(),
                              config.train.learning_rate_base,
                              momentum=config.train.momentum)
    elif config.train.optimizer == 'ADAM':
        optimizer = optim.Adam(net.parameters(),
                               lr=config.train.learning_rate_base,
                               eps=1e-08,
                               weight_decay=1e-4)
    else:
        raise NotImplementedError

    net = DataParallel(net)
    if config.train.resume:
        model_recorder = ModelRecorder(config.resume_ckpt_file,
                                       optimizer,
                                       summary_writer=summary_writer)
    else:
        model_recorder = ModelRecorder(config.ckpt_file,
                                       optimizer,
                                       summary_writer=summary_writer)
    start_epoch = 0
    if config.train.resume:
        if not config.task == "seg":
            start_epoch = model_recorder.resume(net.module,
                                                optimizer,
                                                from_measurement='acc')
        else:
            start_epoch = model_recorder.resume(net.module,
                                                optimizer,
                                                from_measurement='iou')
        if config.train.resume_epoch is not None:
            start_epoch = config.train.resume_epoch
            print("Force resume at {}".format(start_epoch))
        else:
            print("Resume at {}".format(start_epoch))

    # prepare the criterion
    criterion = nn.CrossEntropyLoss()

    # start to train
    for epoch in range(start_epoch, config.train.num_epochs):
        lr = config.train.learning_rate_base * (math.pow(
            config.train.decay_rate, epoch // 10))
        if lr < config.train.learning_rate_min:
            lr = config.train.learning_rate_min
        for g in optimizer.param_groups:
            g['lr'] = lr
        summary_writer.add_scalar('Learning rate', lr, global_step=epoch)

        if config.task == "seg":
            training_loss, training_acc, avg_per_class_acc, train_ious = train_epoch(
                train_loader, net, criterion, optimizer, epoch)
            summary_writer.add_scalar('Training Loss',
                                      training_loss,
                                      global_step=epoch)
            summary_writer.add_scalar('Training Accuracy',
                                      training_acc,
                                      global_step=epoch)
            summary_writer.add_scalar('Training Average Precision ',
                                      avg_per_class_acc,
                                      global_step=epoch)
            summary_writer.add_scalar('Training IOUs ',
                                      train_ious,
                                      global_step=epoch)
        else:
            training_loss, training_acc = train_epoch(train_loader, net,
                                                      criterion, optimizer,
                                                      epoch)
            summary_writer.add_scalar('Training Accuracy',
                                      training_acc,
                                      global_step=epoch)
            summary_writer.add_scalar('Training Loss',
                                      training_loss,
                                      global_step=epoch)

        if (epoch % config.validation.step_val
                == 0) or (epoch == config.train.num_epochs - 1):
            with torch.no_grad():
                if config.task == "seg":
                    validation_loss, validation_acc, avg_per_class_acc, val_ious = evaluate(
                        valid_loader, net, html_path="training_output")
                    summary_writer.add_scalar('Validation Loss',
                                              validation_loss,
                                              global_step=epoch)
                    summary_writer.add_scalar('Validation Accuracy',
                                              validation_acc,
                                              global_step=epoch)
                    summary_writer.add_scalar('Validation Average Precision ',
                                              avg_per_class_acc,
                                              global_step=epoch)
                    summary_writer.add_scalar('Validation IOUs ',
                                              val_ious,
                                              global_step=epoch)
                    if config.dataset == "ScanNet":
                        net.eval()
                        print('Scene Validation')
                        y_true = []
                        y_pred = []
                        sample_num = 2048
                        max_point_num = 8192
                        batch_size = math.ceil(max_point_num / sample_num)

                        indices_batch_indices = np.tile(
                            np.reshape(np.arange(batch_size),
                                       (batch_size, 1, 1)), (1, sample_num, 1))

                        data_h5 = h5py.File("zero_0.h5", 'r+')
                        data = data_h5['data'][...].astype(np.float32)
                        data_num = data_h5['data_num'][...].astype(np.int32)
                        data_labels_seg = data_h5['label_seg'][...].astype(
                            np.int64)
                        data_h5.close()
                        batch_num = data.shape[0]

                        labels_pred = np.full((batch_num, max_point_num),
                                              -1,
                                              dtype=np.int32)
                        confidences_pred = np.zeros((batch_num, max_point_num),
                                                    dtype=np.float32)

                        for batch_idx in range(batch_num):
                            if batch_idx % 10 == 0:
                                print('{}-Processing {} of {} batches.'.format(
                                    datetime.now(), batch_idx, batch_num))
                            points_batch = data[batch_idx]

                            point_num = data_num[batch_idx]
                            seg_np = (data_labels_seg[batch_idx])[:point_num]
                            y_true.append(seg_np.reshape(-1, 1))
                            tile_num = math.ceil(
                                (sample_num * batch_size) / point_num)

                            indices_shuffle = np.tile(np.arange(point_num),
                                                      tile_num)[0:sample_num *
                                                                batch_size]
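                            # the tiled indices cover every point at least once; shuffle them before reshaping into (batch_size, sample_num) chunks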
                            np.random.shuffle(indices_shuffle)
                            input_points = scene_tfs(
                                (points_batch[indices_shuffle]).reshape(
                                    (batch_size, sample_num,
                                     -1))).to(config.device)

                            seg_probs = net.forward(input_points)
                            probs_2d = np.reshape(
                                seg_probs.detach().cpu().numpy(),
                                (sample_num * batch_size, -1))
                            predictions = [(-1, 0.0)] * point_num
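                            # per-point voting: keep the (label, confidence) pair with the highest confidence over all sampled copies of the point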
                            for idx in range(sample_num * batch_size):

                                point_idx = indices_shuffle[idx]
                                probs = probs_2d[idx, :]
                                confidence = np.amax(probs)
                                label = np.argmax(probs)
                                if confidence > predictions[point_idx][1]:
                                    predictions[point_idx] = [
                                        label, confidence
                                    ]

                            pred_np = np.array(predictions)[:, 0]
                            y_pred.append(pred_np.reshape(-1, 1))

                        print(
                            metrics.classification_report(
                                np.concatenate(y_true, axis=0),
                                np.concatenate(y_pred, axis=0)))
                else:
                    validation_loss, acc = evaluate(valid_loader, net)
                    summary_writer.add_scalar('Validation Accuracy',
                                              acc,
                                              global_step=epoch)
                    summary_writer.add_scalar('Validation Loss',
                                              validation_loss,
                                              global_step=epoch)
            if config.task == "seg":
                model_recorder.add(
                    epoch, net,
                    dict(acc=validation_acc,
                         iou=val_ious,
                         avg_acc=avg_per_class_acc))
            else:
                model_recorder.add(epoch, net, dict(acc=acc))
            model_recorder.print_curr_stat()

    print('\nTrain Finished: {}'.format(
        time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())))
Example #19
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 800
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)

    num_train_all = len(train_idx)
    num_val_all = len(val_idx)

    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))

    train_loader = DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        sampler=train_idx,
        num_workers=workers,
        pin_memory=False
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=val_batch_size,
        sampler=val_idx,
        num_workers=workers,
        pin_memory=False
    )

    model = multi_lstm()
    model = DataParallel(model)
    model.load_state_dict(torch.load(
        'cnn_lstm_epoch_25_length_4_opt_1_mulopt_1_flip_0_crop_1_batch_200_train1_9998_train2_9987_val1_9731_val2_8752.pth'))

    kl_fc_p2t = nn.Linear(7, 7)
    kl_fc_t2p = nn.Linear(7, 7)
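    # two learnable 7x7 linear maps between the 7 tool logits and the 7 phase logits (phase-to-tool and tool-to-phase)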

    # freeze the earlier network layers and learn only the two KL transfer matrices
    for param in model.module.parameters():
        param.requires_grad = False
    for param in kl_fc_p2t.parameters():
        param.requires_grad = True
    for param in kl_fc_t2p.parameters():
        param.requires_grad = True

    if use_gpu:
        model = model.cuda()
        kl_fc_p2t = kl_fc_p2t.cuda()
        kl_fc_t2p = kl_fc_t2p.cuda()

    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)
    criterion_2 = nn.CrossEntropyLoss(size_average=False)
    sigmoid = nn.Sigmoid()
    if use_gpu:
        sigmoid = sigmoid.cuda()

    if optimizer_choice == 0:
        optimizer = optim.SGD([{'params': kl_fc_p2t.parameters()},
                               {'params': kl_fc_t2p.parameters()}], lr=learning_rate, momentum=momentum,
                              dampening=dampening,
                              weight_decay=weight_decay, nesterov=use_nesterov)
        if sgd_adjust_lr == 0:
            exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_step, gamma=sgd_gamma)
        elif sgd_adjust_lr == 1:
            exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    elif optimizer_choice == 1:
        optimizer = optim.Adam([{'params': kl_fc_p2t.parameters()},
                                {'params': kl_fc_t2p.parameters()}], lr=learning_rate)

    best_val_accuracy_1 = 0.0
    best_val_accuracy_2 = 0.0
    correspond_train_acc_1 = 0.0
    correspond_train_acc_2 = 0.0

    kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy())
    kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy())

    record_np = np.zeros([epochs, 8])

    for epoch in range(epochs):
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)

        train_loader = DataLoader(
            train_dataset,
            batch_size=train_batch_size,
            sampler=train_idx,
            num_workers=workers,
            pin_memory=False
        )
        # train
        train_loss_1 = 0.0
        train_loss_2 = 0.0
        train_corrects_1 = 0
        train_corrects_2 = 0

        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels_1 = Variable(labels_1.cuda())
                labels_2 = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels_1 = Variable(labels_1)
                labels_2 = Variable(labels_2)

            optimizer.zero_grad()

            outputs_1, outputs_2 = model.forward(inputs)
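            # map each task's logits through the learned transfer matrix and average them with the other task's raw logits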

            kl_output_1 = kl_fc_t2p(outputs_1)
            kl_output_2 = kl_fc_p2t(outputs_2)

            outputs_1 = (kl_output_2 + outputs_1) / 2
            outputs_2 = (kl_output_1 + outputs_2) / 2

            _, preds_2 = torch.max(outputs_2.data, 1)

            # count correct tool (multi-label) predictions
            sig_out = sigmoid(outputs_1.data)
            if use_gpu:
                preds_1 = torch.cuda.ByteTensor(sig_out > 0.5)
            else:
                preds_1 = torch.ByteTensor(sig_out > 0.5)
            preds_1 = preds_1.long()
            train_corrects_1 += torch.sum(preds_1 == labels_1.data)
            labels_1 = Variable(labels_1.data.float())

            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)

            loss = loss_1 + loss_2
            loss.backward()
            optimizer.step()
            train_loss_1 += loss_1.data[0]
            train_loss_2 += loss_2.data[0]
            train_corrects_2 += torch.sum(preds_2 == labels_2.data)

        train_elapsed_time = time.time() - train_start_time
        train_accuracy_1 = train_corrects_1 / num_train_all / 7
        train_accuracy_2 = train_corrects_2 / num_train_all
        train_average_loss_1 = train_loss_1 / num_train_all / 7
        train_average_loss_2 = train_loss_2 / num_train_all

        # begin eval
        val_loss_1 = 0.0
        val_loss_2 = 0.0
        val_corrects_1 = 0
        val_corrects_2 = 0

        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
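            # phase labels: keep only the last frame of each sequence for validation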
            labels_2 = labels_2[(sequence_length - 1):: sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda(), volatile=True)
                labels_1 = Variable(labels_1.cuda(), volatile=True)
                labels_2 = Variable(labels_2.cuda(), volatile=True)
            else:
                inputs = Variable(inputs, volatile=True)
                labels_1 = Variable(labels_1, volatile=True)
                labels_2 = Variable(labels_2, volatile=True)

            outputs_1, outputs_2 = model.forward(inputs)

            kl_output_1 = kl_fc_t2p(outputs_1)
            kl_output_2 = kl_fc_p2t(outputs_2)
            outputs_1 = (kl_output_2 + outputs_1) / 2
            outputs_2 = (kl_output_1 + outputs_2) / 2

            outputs_2 = outputs_2[sequence_length - 1::sequence_length]
            _, preds_2 = torch.max(outputs_2.data, 1)

            sig_out = sigmoid(outputs_1.data)
            if use_gpu:
                preds_1 = torch.cuda.ByteTensor(sig_out > 0.5)
            else:
                preds_1 = torch.ByteTensor(sig_out > 0.5)
            preds_1 = preds_1.long()
            val_corrects_1 += torch.sum(preds_1 == labels_1.data)

            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)

            val_loss_1 += loss_1.data[0]
            val_loss_2 += loss_2.data[0]
            val_corrects_2 += torch.sum(preds_2 == labels_2.data)

        val_elapsed_time = time.time() - val_start_time
        val_accuracy_1 = val_corrects_1 / (num_val_all * 7)
        val_accuracy_2 = val_corrects_2 / num_val_we_use
        val_average_loss_1 = val_loss_1 / (num_val_all * 7)
        val_average_loss_2 = val_loss_2 / num_val_we_use

        print('epoch: {:4d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train accu_1: {:.4f}'
              ' train accu_2: {:.4f}'
              ' train loss_1: {:4.4f}'
              ' train loss_2: {:4.4f}'
              .format(epoch,
                      train_elapsed_time // 60,
                      train_elapsed_time % 60,
                      train_accuracy_1,
                      train_accuracy_2,
                      train_average_loss_1,
                      train_average_loss_2))
        print('epoch: {:4d}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid accu_1: {:.4f}'
              ' valid accu_2: {:.4f}'
              ' valid loss_1: {:4.4f}'
              ' valid loss_2: {:4.4f}'
              .format(epoch,
                      val_elapsed_time // 60,
                      val_elapsed_time % 60,
                      val_accuracy_1,
                      val_accuracy_2,
                      val_average_loss_1,
                      val_average_loss_2))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2)

        if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95:
            best_val_accuracy_2 = val_accuracy_2
            best_val_accuracy_1 = val_accuracy_1
            correspond_train_acc_1 = train_accuracy_1
            correspond_train_acc_2 = train_accuracy_2
            kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy())
            kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy())
        elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95:
            if val_accuracy_1 > best_val_accuracy_1:
                correspond_train_acc_1 = train_accuracy_1
                correspond_train_acc_2 = train_accuracy_2
                kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy())
                kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy())
            elif val_accuracy_1 == best_val_accuracy_1:
                if train_accuracy_2 > correspond_train_acc_2:
                    correspond_train_acc_2 = train_accuracy_2
                    correspond_train_acc_1 = train_accuracy_1
                    kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy())
                    kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy())
                elif train_accuracy_2 == correspond_train_acc_2:
                    if train_accuracy_1 > correspond_train_acc_1:
                        correspond_train_acc_1 = train_accuracy_1
                        kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy())
                        kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy())

        record_np[epoch, 0] = train_accuracy_1
        record_np[epoch, 1] = train_accuracy_2
        record_np[epoch, 2] = train_average_loss_1
        record_np[epoch, 3] = train_average_loss_2
        record_np[epoch, 4] = val_accuracy_1
        record_np[epoch, 5] = val_accuracy_2
        record_np[epoch, 6] = val_average_loss_1
        record_np[epoch, 7] = val_average_loss_2

    print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format(best_val_accuracy_1, correspond_train_acc_1))
    print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format(best_val_accuracy_2, correspond_train_acc_2))
    save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000))
    save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000))
    save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000))
    save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000))
    public_name = "train_klave" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train1_" + str(save_train_1) \
                  + "_train2_" + str(save_train_2) \
                  + "_val1_" + str(save_val_1) \
                  + "_val2_" + str(save_val_2)

    record_name = public_name + ".npy"
    np.save(record_name, record_np)

    np.save('fc_p2t', kl_fc_p2t_np)
    np.save('fc_t2p', kl_fc_t2p_np)
Example #20
            start_point += args.stride
        if start_point < len(tokens):
            samples.append(tokens[len(tokens) - n_ctx:])
        random.shuffle(samples)

        ## prepare data
        for step in range(len(samples) // args.batch_size):
            batch = samples[step * args.batch_size:(step + 1) *
                            args.batch_size]
            batch_inputs = []
            for ids in batch:
                int_ids = [int(x) for x in ids]
                batch_inputs.append(int_ids)
            batch_inputs = torch.tensor(batch_inputs).long().cuda()
            ##forward pass
            outputs = model.forward(input_ids=batch_inputs,
                                    labels=batch_inputs)
            loss, logits = outputs[:2]

            ##get loss
            #if  multi_gpu:
            #     loss = loss.mean()
            if args.gradient_accumulation > 1:
                loss = loss / args.gradient_accumulation

            ## loss backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            ## optimizer step
            if (overall_step + 1) % args.gradient_accumulation == 0:
Example #21
def main():
    if raw:
        print('building files')
        build_files(data_path=raw_data_path)
        print('files built')

    model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
        config=model_config)
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))
    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            running_loss = 0
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
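            # slide a window of n_ctx tokens over the tokenized piece with the given stride to build training samples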
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    scheduler.step()
                    optimizer.step()
                    optimizer.zero_grad()
                if (step + 1) % log_step == 0:
                    print('step {} of piece {} of epoch {}, loss {}'.format(
                        (step + 1) // gradient_accumulation, piece_num,
                        epoch + 1,
                        running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
Example #22
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length,
                                                  train_num_each)

    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 800
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)

    num_train_all = len(train_idx)
    num_val_all = len(val_idx)

    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))

    train_loader = DataLoader(train_dataset,
                              batch_size=train_batch_size,
                              sampler=train_idx,
                              num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=val_idx,
                            num_workers=workers,
                            pin_memory=False)

    model = multi_lstm()
    model = DataParallel(model)
    model.load_state_dict(
        torch.load(
            'cnn_lstm_epoch_25_length_4_opt_1_mulopt_1_flip_0_crop_1_batch_200_train1_9998_train2_9987_val1_9731_val2_8752.pth'
        ))
    kl_fc_p2t = nn.Linear(7, 7)

    kl_fc_t2p = nn.Linear(7, 7)
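    # initialize the transfer matrices from weights presumably saved by the earlier KL-average training stage (kl_fc_p2t.npy / kl_fc_t2p.npy)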

    all_phase_to_tool = np.load('kl_fc_p2t.npy')
    all_tool_to_phase = np.load('kl_fc_t2p.npy')

    kl_fc_p2t.weight.data = torch.from_numpy(
        all_phase_to_tool.astype('float32'))
    kl_fc_t2p.weight.data = torch.from_numpy(
        all_tool_to_phase.astype('float32'))

    for param in kl_fc_p2t.parameters():
        param.requires_grad = True
    for param in kl_fc_t2p.parameters():
        param.requires_grad = True

    if use_gpu:
        model = model.cuda()
        kl_fc_p2t = kl_fc_p2t.cuda()
        kl_fc_t2p = kl_fc_t2p.cuda()

    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)
    criterion_2 = nn.CrossEntropyLoss(size_average=False)
    criterion_3 = nn.KLDivLoss(size_average=False)
    softmax_cuda = nn.Softmax().cuda()
    sigmoid_cuda = nn.Sigmoid().cuda()

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD([{
                'params': model.module.parameters()
            }, {
                'params': kl_fc_p2t.parameters()
            }, {
                'params': kl_fc_t2p.parameters()
            }],
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([{
                'params': model.module.parameters()
            }, {
                'params': kl_fc_p2t.parameters()
            }, {
                'params': kl_fc_t2p.parameters()
            }],
                                   lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': kl_fc_p2t.parameters()
                },
                {
                    'params': kl_fc_t2p.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {
                    'params': model.module.share.parameters()
                },
                {
                    'params': kl_fc_p2t.parameters()
                },
                {
                    'params': kl_fc_t2p.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
            ],
                                   lr=learning_rate / 10)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy_1 = 0.0
    best_val_accuracy_2 = 0.0  # judge by accu2
    correspond_train_acc_1 = 0.0
    correspond_train_acc_2 = 0.0

    # record per epoch: 2 train accuracies, 2 valid accuracies, 4 train losses and 4 valid losses, 12 values in total
    record_np = np.zeros([epochs, 12])

    for epoch in range(epochs):
        # np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)

        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=train_idx,
                                  num_workers=workers,
                                  pin_memory=False)

        model.train()
        train_loss_1 = 0.0
        train_loss_2 = 0.0
        train_loss_3 = 0.0
        train_loss_4 = 0.0
        train_corrects_1 = 0
        train_corrects_2 = 0

        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels_1 = Variable(labels_1.cuda())
                labels_2 = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels_1 = Variable(labels_1)
                labels_2 = Variable(labels_2)

            optimizer.zero_grad()

            outputs_1, outputs_2 = model.forward(inputs)

            _, preds_2 = torch.max(outputs_2.data, 1)

            sig_out = outputs_1.data
            sig_out = sigmoid_cuda(sig_out)
            preds_1 = torch.cuda.ByteTensor(sig_out > 0.5)
            preds_1 = preds_1.long()
            train_corrects_1 += torch.sum(preds_1 == labels_1.data)
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)

            sig_output_1 = sigmoid_cuda(outputs_1)
            soft_output_2 = softmax_cuda(outputs_2)
            sig_output_1 = Variable(sig_output_1.data, requires_grad=False)
            soft_output_2 = Variable(soft_output_2.data, requires_grad=False)

            kl_output_1 = kl_fc_t2p(sig_output_1)
            kl_output_2 = kl_fc_p2t(soft_output_2)
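            # symmetric KL consistency: each mapped distribution should stay close to the other task's predicted distribution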

            loss_3 = torch.abs(criterion_3(kl_output_1, soft_output_2))
            loss_4 = torch.abs(criterion_3(kl_output_2, sig_output_1))
            loss = loss_1 + loss_2 + loss_3 + loss_4
            loss.backward()
            optimizer.step()

            train_loss_1 += loss_1.data[0]
            train_loss_2 += loss_2.data[0]
            train_loss_3 += loss_3.data[0]
            train_loss_4 += loss_4.data[0]
            train_corrects_2 += torch.sum(preds_2 == labels_2.data)

        train_elapsed_time = time.time() - train_start_time
        train_accuracy_1 = train_corrects_1 / num_train_all / 7
        train_accuracy_2 = train_corrects_2 / num_train_all
        train_average_loss_1 = train_loss_1 / num_train_all / 7
        train_average_loss_2 = train_loss_2 / num_train_all
        train_average_loss_3 = train_loss_3 / num_train_all
        train_average_loss_4 = train_loss_4 / num_train_all

        # begin eval

        model.eval()
        val_loss_1 = 0.0
        val_loss_2 = 0.0
        val_loss_3 = 0.0
        val_loss_4 = 0.0
        val_corrects_1 = 0
        val_corrects_2 = 0

        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
            labels_2 = labels_2[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda(), volatile=True)
                labels_1 = Variable(labels_1.cuda(), volatile=True)
                labels_2 = Variable(labels_2.cuda(), volatile=True)
            else:
                inputs = Variable(inputs, volatile=True)
                labels_1 = Variable(labels_1, volatile=True)
                labels_2 = Variable(labels_2, volatile=True)
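            # multi-crop evaluation: run every crop through the model and average the logits over the crops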

            if crop_type == 0 or crop_type == 1:
                outputs_1, outputs_2 = model.forward(inputs)
            elif crop_type == 5:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(5, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(5, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(10, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(10, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)

            sig_output_1 = sigmoid_cuda(outputs_1)
            soft_output_2 = softmax_cuda(outputs_2)
            kl_output_1 = (kl_fc_t2p(sig_output_1))
            kl_output_2 = (kl_fc_p2t(soft_output_2))
            sig_output_1 = Variable(sig_output_1.data, requires_grad=False)
            soft_output_2 = Variable(soft_output_2.data, requires_grad=False)

            outputs_1 = outputs_1 + kl_output_2
            outputs_2 = outputs_2 + kl_output_1

            outputs_2 = outputs_2[sequence_length - 1::sequence_length]
            _, preds_2 = torch.max(outputs_2.data, 1)

            sig_out = outputs_1.data
            sig_out = sigmoid_cuda(sig_out)
            preds_1 = torch.cuda.ByteTensor(sig_out > 0.5)
            preds_1 = preds_1.long()
            val_corrects_1 += torch.sum(preds_1 == labels_1.data)
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            val_loss_1 += loss_1.data[0]
            loss_2 = criterion_2(outputs_2, labels_2)
            val_corrects_2 += torch.sum(preds_2 == labels_2.data)
            val_loss_2 += loss_2.data[0]

            loss_3 = torch.abs(criterion_3(kl_output_1, soft_output_2))
            loss_4 = torch.abs(criterion_3(kl_output_2, sig_output_1))
            val_loss_3 += loss_3.data[0]
            val_loss_4 += loss_4.data[0]

        val_elapsed_time = time.time() - val_start_time
        val_accuracy_1 = val_corrects_1 / (num_val_all * 7)
        val_accuracy_2 = val_corrects_2 / num_val_we_use
        val_average_loss_1 = val_loss_1 / (num_val_all * 7)
        val_average_loss_2 = val_loss_2 / num_val_we_use
        val_average_loss_3 = val_loss_3 / num_val_all
        val_average_loss_4 = val_loss_4 / num_val_all

        print('epoch: {:3d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train accu_1: {:.4f}'
              ' train accu_2: {:.4f}'
              ' train loss_1: {:4.4f}'
              ' train loss_2: {:4.4f}'
              ' train loss_3: {:4.4f}'
              ' train loss_4: {:4.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_accuracy_1, train_accuracy_2, train_average_loss_1,
                  train_average_loss_2, train_average_loss_3,
                  train_average_loss_4))
        print('epoch: {:3d}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid accu_1: {:.4f}'
              ' valid accu_2: {:.4f}'
              ' valid loss_1: {:4.4f}'
              ' valid loss_2: {:4.4f}'
              ' valid loss_3: {:4.4f}'
              ' valid loss_4: {:4.4f}'.format(
                  epoch, val_elapsed_time // 60, val_elapsed_time % 60,
                  val_accuracy_1, val_accuracy_2, val_average_loss_1,
                  val_average_loss_2, val_average_loss_3, val_average_loss_4))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2)

        if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95:
            best_val_accuracy_2 = val_accuracy_2
            best_val_accuracy_1 = val_accuracy_1
            correspond_train_acc_1 = train_accuracy_1
            correspond_train_acc_2 = train_accuracy_2
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95:
            if val_accuracy_1 > best_val_accuracy_1:
                correspond_train_acc_1 = train_accuracy_1
                correspond_train_acc_2 = train_accuracy_2
                best_model_wts = copy.deepcopy(model.state_dict())
            elif val_accuracy_1 == best_val_accuracy_1:
                if train_accuracy_2 > correspond_train_acc_2:
                    correspond_train_acc_2 = train_accuracy_2
                    correspond_train_acc_1 = train_accuracy_1
                    best_model_wts = copy.deepcopy(model.state_dict())
                elif train_accuracy_2 == correspond_train_acc_2:
                    if train_accuracy_1 > correspond_train_acc_1:
                        correspond_train_acc_1 = train_accuracy_1
                        best_model_wts = copy.deepcopy(model.state_dict())

        record_np[epoch, 0] = train_accuracy_1
        record_np[epoch, 1] = train_accuracy_2
        record_np[epoch, 2] = train_average_loss_1
        record_np[epoch, 3] = train_average_loss_2
        record_np[epoch, 4] = train_average_loss_3
        record_np[epoch, 5] = train_average_loss_4

        record_np[epoch, 6] = val_accuracy_1
        record_np[epoch, 7] = val_accuracy_2
        record_np[epoch, 8] = val_average_loss_1
        record_np[epoch, 9] = val_average_loss_2
        record_np[epoch, 10] = val_average_loss_3
        record_np[epoch, 11] = val_average_loss_4

    print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format(
        best_val_accuracy_1, correspond_train_acc_1))
    print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format(
        best_val_accuracy_2, correspond_train_acc_2))

    save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000))
    save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000))
    save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000))
    save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000))
    public_name = "cnn_lstm_klave" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train1_" + str(save_train_1) \
                  + "_train2_" + str(save_train_2) \
                  + "_val1_" + str(save_val_1) \
                  + "_val2_" + str(save_val_2)
    model_name = public_name + ".pth"
    torch.save(best_model_wts, model_name)

    record_name = public_name + ".npy"
    np.save(record_name, record_np)

    kl_fc_p2t_name = public_name + "p2t.npy"
    kl_fc_t2p_name = public_name + "t2p.npy"
    kl_fc_p2t_np = kl_fc_p2t.cpu().weight.data.numpy()
    np.save(kl_fc_p2t_name, kl_fc_p2t_np)
    kl_fc_t2p_np = kl_fc_t2p.cpu().weight.data.numpy()
    np.save(kl_fc_t2p_name, kl_fc_t2p_np)
Example #23
class TrainLoop_GPT2():
    def __init__(self, args, logger):
        self.args = args
        self.logger = logger

        self.args.device = 'cuda:{}'.format(
            self.args.gpu) if self.args.use_cuda else 'cpu'
        self.logger.info('using device:{}'.format(self.args.device))

        self.opt = vars(self.args)

        self.batch_size = self.opt['batch_size']
        self.use_cuda = self.opt['use_cuda']
        self.device = self.args.device
        self.multi_gpu = self.args.use_multi_gpu

        # self.movie_ids = pickle.load(open("data/movie_ids.pickle", "rb"))

        self.build_data()
        self.build_model()

    def build_data(self):
        self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path)
        self.vocab_size = len(self.tokenizer)
        self.pad_id = self.tokenizer.convert_tokens_to_ids('[PAD]')

        # preprocess the raw corpus and convert it into token ids
        if self.args.raw:
            for subset in ['train', 'valid', 'test']:
                self.preprocess_raw_data(subset)
        # load the tokenized data
        self.subset2data = {}
        with open(self.args.test_tokenized_path, "r", encoding="utf8") as f:
            self.subset2data['test'] = f.read()
        if not self.args.do_eval:
            with open(self.args.train_tokenized_path, "r",
                      encoding="utf8") as f:
                self.subset2data['train'] = f.read()
            with open(self.args.valid_tokenized_path, "r",
                      encoding="utf8") as f:
                self.subset2data['valid'] = f.read()
        # split each subset's raw text into one conversation per line
        for subset in self.subset2data:
            self.subset2data[subset] = self.subset2data[subset].split("\n")

        self.logger.info("Train/Valid/Test set has {} convs".format(
            [len(self.subset2data[subset]) for subset in self.subset2data]))

    def build_model(self):
        """

        :param args:
        :param vocab_size: vocabulary size
        :return:
        """
        if self.args.pretrained_model:
            # if a pretrained GPT-2 model is specified, load it
            self.model = GPT2LMHeadModel.from_pretrained(
                self.args.pretrained_model)
        else:
            # otherwise initialize the model from the config file
            model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
                self.args.model_config)
            self.model = GPT2LMHeadModel(config=model_config)

        # resize the GPT-2 token embeddings to match the tokenizer vocabulary
        self.model.resize_token_embeddings(self.vocab_size)

        if self.use_cuda:
            self.model.to(self.device)

        self.logger.info('model config:\n{}'.format(
            self.model.config.to_json_string()))

        self.n_ctx = self.model.config.to_dict().get("n_ctx")

        # create the model output directory
        if self.args.is_model_output and not os.path.exists(
                self.args.dialogue_model_output_path):
            os.mkdir(self.args.dialogue_model_output_path)

        # count the number of model parameters
        num_parameters = 0
        parameters = self.model.parameters()
        for parameter in parameters:
            num_parameters += parameter.numel()
        self.logger.info(
            'number of model parameters: {}'.format(num_parameters))

        # whether to run in parallel on multiple GPUs
        if self.args.use_multi_gpu:
            if self.args.use_cuda and torch.cuda.device_count() > 1:
                self.logger.info("Let's use GPUs to train")
                self.model = DataParallel(
                    self.model,
                    device_ids=[int(i) for i in self.args.device.split(',')])
            else:
                self.args.use_multi_gpu = False

    def train(self):
        train_dataset = GPT2Dataset(self.subset2data['train'])
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=self.args.batch_size,
                                      shuffle=True,
                                      num_workers=self.args.num_workers,
                                      collate_fn=self.collate_fn)

        # total number of optimizer steps over all epochs
        self.total_steps = int(train_dataset.__len__() * self.args.epochs /
                               self.args.batch_size /
                               self.args.gradient_accumulation)
        self.logger.info('total training steps = {}'.format(self.total_steps))

        self.init_optim()

        self.logger.info('starting training')
        # accumulates loss across gradient-accumulation steps
        running_loss = 0
        # total number of optimizer steps taken
        overall_step = 0
        # tensorboardX writer
        # tb_writer = SummaryWriter(log_dir=self.args.writer_dir)
        # number of CUDA out-of-memory events
        oom_time = 0
        # early-stopping patience
        patience = 0
        max_patience = 2
        best_test_loss = 10000
        # start training
        for epoch in range(self.args.epochs):
            epoch_start_time = datetime.now()
            train_loss = []  # train losses within this epoch
            for batch_idx, (input_ids, mask_r) in enumerate(train_dataloader):
                # note: for a given context GPT-2's forward() predicts the next token, not a whole sequence
                # given n input token ids GPT2Model returns n hidden states; the n-th hidden state predicts token n+1
                # self.logger.info(input_ids == mask_r)
                # self.logger.info(input_ids)
                # self.logger.info(mask_r)
                # for context in input_ids:
                #     print(tokenizer.convert_ids_to_tokens(int(id) for id in context))
                # ipdb.set_trace()
                self.model.train()
                input_ids = input_ids.to(self.device)
                # guard against CUDA out-of-memory errors during training
                try:
                    outputs = self.model.forward(input_ids=input_ids)
                    loss, accuracy = self.calculate_loss_and_accuracy(
                        outputs, input_ids, mask_r, device=self.device)
                    train_loss.append(loss.item())

                    if self.multi_gpu:
                        loss = loss.mean()
                        accuracy = accuracy.mean()
                    if self.args.gradient_accumulation > 1:
                        loss = loss / self.args.gradient_accumulation
                        accuracy = accuracy / self.args.gradient_accumulation
                    loss.backward()
                    # clip gradients to a threshold to guard against exploding gradients
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.args.max_grad_norm)
                    # update parameters after the configured number of gradient-accumulation steps
                    if (batch_idx + 1) % self.args.gradient_accumulation == 0:
                        running_loss += loss.item()
                        self.optimizer.step()
                        self.optimizer.zero_grad()
                        self.scheduler.step()

                        overall_step += 1
                        # update logging (and optionally tensorboardX)
                        if (overall_step + 1) % self.args.log_step == 0:
                            self.logger.info(
                                "batch {} of epoch {}, loss {:.4f}, ppl {:.5f}"
                                .format(batch_idx + 1, epoch + 1, loss,
                                        exp(loss)))
                            # tb_writer.add_scalar('loss', loss.item(), overall_step)
                except RuntimeError as exception:
                    if "out of memory" in str(exception):
                        oom_time += 1
                        self.logger.info(
                            "WARNING: ran out of memory,times: {}".format(
                                oom_time))
                        if hasattr(torch.cuda, 'empty_cache'):
                            torch.cuda.empty_cache()
                    else:
                        self.logger.info(str(exception))
                        raise exception
            train_loss = sum(train_loss) / len(train_loss)
            epoch_finish_time = datetime.now()
            self.logger.info(
                'epoch {}, train loss is {:.4f}, ppl is {:.5f}, spend {} time'.
                format(epoch + 1, train_loss, exp(train_loss),
                       epoch_finish_time - epoch_start_time))
            # val
            # test_loss = val(model, device, test_list, multi_gpu, self.args)
            test_loss = self.val('valid')
            if test_loss <= best_test_loss:
                patience = 0
                best_test_loss = test_loss

                self.logger.info('saving model for epoch {}'.format(epoch + 1))
                model_path = join(self.args.dialogue_model_output_path,
                                  'model')
                if not os.path.exists(model_path):
                    os.mkdir(model_path)
                # unwrap DataParallel (if used) so the saved checkpoint uses plain parameter names
                model_to_save = self.model.module if hasattr(
                    self.model, 'module') else self.model
                model_to_save.save_pretrained(model_path)
                self.logger.info("save model to " + str(model_path))
            else:
                patience += 1
                self.logger.info('Patience = ' + str(patience))
                if patience >= max_patience:
                    break
            test_loss = self.val('test')

        # self.logger.info('training finished')

    def val(self, subset):
        # self.logger.info("start evaluating model")
        self.model.eval()
        # self.logger.info('starting evaluating')
        # 记录tensorboardX
        # tb_writer = SummaryWriter(log_dir=self.args.writer_dir)
        test_dataset = GPT2Dataset(self.subset2data[subset])
        test_dataloader = DataLoader(test_dataset,
                                     batch_size=self.args.batch_size,
                                     shuffle=True,
                                     num_workers=self.args.num_workers,
                                     collate_fn=self.collate_fn)
        test_loss = []
        # test_accuracy = []
        with torch.no_grad():
            for batch_idx, (input_ids, mask_r) in enumerate(test_dataloader):
                input_ids = input_ids.to(self.device)
                outputs = self.model.forward(input_ids=input_ids)
                loss, accuracy = self.calculate_loss_and_accuracy(
                    outputs, input_ids, mask_r, device=self.device)
                test_loss.append(loss.item())
                # test_accuracy.append(accuracy)
                if self.multi_gpu:
                    loss = loss.mean()
                    accuracy = accuracy.mean()
                if self.args.gradient_accumulation > 1:
                    loss = loss / self.args.gradient_accumulation
                    accuracy = accuracy / self.args.gradient_accumulation
                # self.logger.info("val batch {} ,loss {} ,accuracy {}".format(batch_idx, loss, accuracy))
                # tb_writer.add_scalar('loss', loss.item(), overall_step)
        test_loss = sum(test_loss) / len(test_loss)
        self.logger.info("val {} loss {:.4f} , ppl {:.5f}".format(
            subset, test_loss, exp(test_loss)))

        return test_loss

    def generate(self):
        samples_file = open(self.args.save_samples_path, 'w', encoding='utf8')
        convs = pickle.load(open(self.args.test_path, 'rb'))

        for conv in tqdm(convs[:]):
            conv_id = conv['conv_id']
            history = []  # list of id, to model

            for message in conv['messages']:
                message_id, role, content = int(
                    message['local_id']), message['role'], message['content']
                if role == 'Recommender' and message_id != 1:
                    try:
                        if self.args.save_samples_path:
                            samples_file.write(f"[GroundTruth]: {content}\n")
                        input_ids = [
                            self.tokenizer.cls_token_id
                        ] + history[-self.args.max_context_len +
                                    1:]  # each input starts with [CLS] and ends with [SEP]
                        # tensor of [input_token_num]
                        curr_input_tensor = torch.tensor(input_ids).long().to(
                            self.device)
                        generated = []
                        # generate at most max_len tokens
                        for _ in range(self.args.max_len):
                            # (tensor of [input_token_nums, 13317], tuple of 10 tensor)
                            outputs = self.model(
                                input_ids=curr_input_tensor)  #?shape?
                            # tensor of [13317]
                            next_token_logits = outputs[0][-1, :]
                            # apply a repetition penalty to every token already generated, lowering its probability
                            for id in set(generated):
                                next_token_logits[
                                    id] /= self.args.repetition_penalty
                            next_token_logits = next_token_logits / self.args.temperature
                            # set the [UNK] logit to -inf so the model can never predict [UNK]
                            next_token_logits[
                                self.tokenizer.convert_tokens_to_ids(
                                    '[UNK]')] = -float('Inf')
                            # keep only the top-k logits, then mask tokens whose cumulative probability exceeds top-p
                            filtered_logits = top_k_top_p_filtering(
                                next_token_logits,
                                top_k=self.args.topk,
                                top_p=self.args.topp)
                            # torch.multinomial draws num_samples indices without replacement, weighted by the given probabilities
                            next_token = torch.multinomial(F.softmax(
                                filtered_logits, dim=-1),
                                                           num_samples=1)
                            if next_token == self.tokenizer.sep_token_id:  # a [SEP] token marks the end of the response
                                break
                            generated.append(next_token.item())
                            curr_input_tensor = torch.cat(
                                (curr_input_tensor, next_token),
                                dim=0)[-self.n_ctx:]
                        generated_text = self.tokenizer.convert_ids_to_tokens(
                            generated)
                        if self.args.save_samples_path:
                            samples_file.write("[Generated]: {}\n\n".format(
                                "".join(generated_text)))

                    except Exception as e:
                        print(e)
                        print(conv_id, message_id)
                        print(max(input_ids))
                        print('\n')
                history.extend(
                    self.tokenizer.encode(content) +
                    [self.tokenizer.sep_token_id])  # TODO: check what tokenizer.encode produces here

        samples_file.close()

    def calculate_loss_and_accuracy(self, outputs, labels, mask_r, device):
        """
        Compute the mean loss and accuracy over tokens that are not self.pad_id.
        :param outputs:
        :param labels:
        :param mask_r:
        :param device:
        :return:
        """
        logits = outputs[
            0]  # prediction scores each token uses to predict the next token, shape: [batch_size, token_len, vocab_size]
        # the first n-1 tokens are used to predict the n-th token
        # i.e. the prediction score of token i is used to predict token i+1
        # for an input of n tokens, shift_logits holds the prediction scores of tokens [0, n-2] and shift_labels holds the labels of tokens [1, n-1]
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous().to(device)
        # mask out shift_labels with the (shifted) response mask
        mask_shift_labels = mask_r[..., 1:].contiguous().to(device)
        shift_labels = shift_labels * mask_shift_labels
        #######################################

        loss_fct = CrossEntropyLoss(
            ignore_index=self.pad_id,
            reduction='sum')  # ignore the loss at self.pad_id and sum the loss over all non-pad tokens
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                        shift_labels.view(-1))

        _, preds = shift_logits.max(
            dim=-1
        )  # preds holds the vocab id predicted by each prediction score, shape: [batch_size, token_len]

        # average the loss over non-pad tokens and compute the prediction accuracy
        not_ignore = shift_labels.ne(
            self.pad_id
        )  # element-wise "not equal": 0 where shift_labels is self.pad_id, 1 otherwise
        num_targets = not_ignore.long().sum().item(
        )  # number of non-pad tokens in the targets

        correct = (shift_labels
                   == preds) & not_ignore  # count tokens predicted correctly, excluding pad tokens
        correct = correct.float().sum()

        accuracy = correct / num_targets
        loss = loss / num_targets
        return loss, accuracy

    def preprocess_raw_data(self, subset):
        """
        Process the raw corpus into token ids for training. Each dialogue becomes
        "[CLS]utterance1[SEP]utterance2[SEP]utterance3[SEP]", truncated to the GPT-2
        context window n_ctx (n_ctx includes the special tokens).
        :param subset: which split to process ('train', 'valid' or 'test')
        :return:
        """
        if subset == 'train':
            raw_path = self.args.train_raw_path
        elif subset == 'valid':
            raw_path = self.args.valid_raw_path
        elif subset == 'test':
            raw_path = self.args.test_raw_path
        self.logger.info(
            "tokenizing raw data, raw data path: {}".format(raw_path))

        with open(raw_path, 'rb') as f:
            data = f.read().decode("utf-8")
        if "\r\n" in data:
            train_data = data.split("\r\n\r\n")
        else:
            train_data = data.split("\n\n")
        self.logger.info("there are {} dialogue in raw dataset".format(
            len(train_data)))
        if subset == 'train':
            path = self.args.train_tokenized_path
        elif subset == 'valid':
            path = self.args.valid_tokenized_path
        elif subset == 'test':
            path = self.args.test_tokenized_path
        with open(path, "w", encoding="utf-8") as f:
            for dialogue_index, dialogue in enumerate(tqdm(train_data)):
                if "\r\n" in data:
                    utterances = dialogue.split("\r\n")
                else:
                    utterances = dialogue.split("\n")
                # dialogue_ids = [tokenizer.cls_token_id]  # each dialogue starts with [CLS]
                dialogue_ids = []  # [CLS] is prepended below, after truncation
                for utterance in utterances:
                    dialogue_ids.extend([
                        self.tokenizer.convert_tokens_to_ids(word)
                        for word in utterance
                    ])
                    dialogue_ids.append(self.tokenizer.sep_token_id
                                        )  # append [SEP] after each utterance to mark its end
                # truncate anything longer than n_ctx, otherwise the GPT-2 model raises an error
                dialogue_ids = [self.tokenizer.cls_token_id
                                ] + dialogue_ids[-self.n_ctx + 1:]
                # dialogue_ids = dialogue_ids[:n_ctx]
                for dialogue_id in dialogue_ids:
                    f.write(str(dialogue_id) + ' ')
                # do not add a newline after the last record
                if dialogue_index < len(train_data) - 1:
                    f.write("\n")
        self.logger.info(
            "finish preprocessing raw data, the result is stored in {}".format(
                path))

    def collate_fn(self, batch):
        """
        Find the longest input in this batch and pad every other input to that length.
        :param batch:
        :return:
        """
        input_ids = []
        mask_rs = []
        btc_size = len(batch)
        max_input_len = 0  # longest input in this batch, used to align (pad) the rest
        # find the longest input in this batch
        # for btc_idx in range(btc_size):
        #     if max_input_len < len(batch[btc_idx]):
        #         max_input_len = len(batch[btc_idx])
        # pad inputs shorter than max_input_len with pad_id
        # for btc_idx in range(btc_size):
        #     input_len = len(batch[btc_idx])
        #     input_ids.append(batch[btc_idx])
        #     input_ids[btc_idx].extend([pad_id] * (max_input_len - input_len))

        # find the longest input in this batch
        for btc_idx, (inputs, mask_r) in enumerate(batch):
            if max_input_len < len(inputs):
                max_input_len = len(inputs)
        # pad inputs shorter than max_input_len with self.pad_id
        for btc_idx, (inputs, mask_r) in enumerate(batch):
            assert len(inputs) == len(mask_r), f"{len(inputs)}, {len(mask_r)}"
            input_len = len(inputs)
            input_ids.append(inputs)
            input_ids[btc_idx].extend([self.pad_id] *
                                      (max_input_len - input_len))
            mask_rs.append(mask_r)
            mask_rs[btc_idx].extend([self.pad_id] *
                                    (max_input_len - input_len))
        # self.logger.info(torch.tensor(input_ids, dtype=torch.long).shape)
        # self.logger.info(torch.tensor(mask_rs, dtype=torch.long).shape)
        return (torch.tensor(input_ids, dtype=torch.long),
                torch.tensor(mask_rs, dtype=torch.long))

    def vector2sentence(self, batch_sen):
        # convert a batch of sentence ids back into tokens
        sentences = []
        for sen in batch_sen.numpy().tolist():
            sentence = []
            for word in sen:
                if word > 3:
                    sentence.append(self.index2word[word])
                elif word == 3:
                    sentence.append('_UNK_')
            sentences.append(sentence)
        return sentences

    @classmethod
    def optim_opts(self):
        """
        Fetch optimizer selection.

        By default, collects everything in torch.optim, as well as importing:
        - qhm / qhmadam if installed from github.com/facebookresearch/qhoptim

        Override this (and probably call super()) to add your own optimizers.
        """
        # first pull torch.optim in
        optims = {
            k.lower(): v
            for k, v in optim.__dict__.items()
            if not k.startswith('__') and k[0].isupper()
        }
        try:
            import apex.optimizers.fused_adam as fused_adam
            optims['fused_adam'] = fused_adam.FusedAdam
        except ImportError:
            pass

        try:
            # https://openreview.net/pdf?id=S1fUpoR5FQ
            from qhoptim.pyt import QHM, QHAdam
            optims['qhm'] = QHM
            optims['qhadam'] = QHAdam
        except ImportError:
            # no QHM installed
            pass
        self.logger.info(optims)
        return optims

    def init_optim(self):
        """
        Initialize optimizer with model parameters.

        :param params:
            parameters from the model

        :param optim_states:
            optional argument providing states of optimizer to load

        :param saved_optim_type:
            type of optimizer being loaded, if changed will skip loading
            optimizer states
        """
        # set up the optimizer; a linear warmup schedule is used at the start of training
        self.optimizer = transformers.AdamW(self.model.parameters(),
                                            lr=self.args.lr,
                                            correct_bias=True)
        self.scheduler = transformers.WarmupLinearSchedule(
            self.optimizer,
            warmup_steps=self.args.warmup_steps,
            t_total=self.total_steps)

    def backward(self, loss):
        """
        Perform a backward pass. It is recommended you use this instead of
        loss.backward(), for integration with distributed training and FP16
        training.
        """
        loss.backward()

    def update_params(self):
        """
        Perform step of optimization, clipping gradients and adjusting LR
        schedule if needed. Gradient accumulation is also performed if agent
        is called with --update-freq.

        It is recommended (but not forced) that you call this in train_step.
        """
        update_freq = 1
        if update_freq > 1:
            # we're doing gradient accumulation, so we only want to step
            # every N updates
            self._number_grad_accum = (self._number_grad_accum +
                                       1) % update_freq
            if self._number_grad_accum != 0:
                return
        # is 0.1 too small? the original implementation used the same value
        if self.opt['gradient_clip'] > 0:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           self.opt['gradient_clip'])

        self.optimizer.step()

    def zero_grad(self):
        """
        Zero out optimizer.

        It is recommended you call this in train_step. It automatically handles
        gradient accumulation if agent is called with --update-freq.
        """
        self.optimizer.zero_grad()
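# The generate() method above relies on a top_k_top_p_filtering helper that is defined
# elsewhere in this project. As a hedged illustration only (not the project's actual
# implementation), a minimal sketch for a single [vocab_size] logits tensor could look
# like this; the name top_k_top_p_filtering_sketch is hypothetical.
import torch
import torch.nn.functional as F

def top_k_top_p_filtering_sketch(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Mask logits outside the top-k set and outside the top-p (nucleus) set."""
    assert logits.dim() == 1  # sketch assumes a 1-D [vocab_size] tensor
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # drop every token whose logit is below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # drop tokens once the cumulative probability exceeds top_p
        sorted_indices_to_remove = cumulative_probs > top_p
        # always keep the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

filtered = top_k_top_p_filtering_sketch(torch.randn(10), top_k=3, top_p=0.9)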
Example #24
class DataParalleledLoss(Loss):
    r"""
    Loss class wrapper of torch.nn.DataParallel. It can be used as the original loss class.
    `eval` & `forward` methods support data-parallel running.

    Examples
    --------
    >>> import torch
    >>> from torch import optim
    >>> from torch.nn import functional as F
    >>> from pixyz.distributions import Bernoulli, Normal
    >>> from pixyz.losses import KullbackLeibler, DataParalleledLoss
    >>> from pixyz.models import Model
    >>> used_gpu_i = set()
    >>> used_gpu_g = set()
    >>> # Set distributions (Distribution API)
    >>> class Inference(Normal):
    ...     def __init__(self):
    ...         super().__init__(var=["z"],cond_var=["x"],name="q")
    ...         self.model_loc = torch.nn.Linear(128, 64)
    ...         self.model_scale = torch.nn.Linear(128, 64)
    ...     def forward(self, x):
    ...         used_gpu_i.add(x.device.index)
    ...         return {"loc": self.model_loc(x), "scale": F.softplus(self.model_scale(x))}
    >>> class Generator(Bernoulli):
    ...     def __init__(self):
    ...         super().__init__(var=["x"],cond_var=["z"],name="p")
    ...         self.model = torch.nn.Linear(64, 128)
    ...     def forward(self, z):
    ...         used_gpu_g.add(z.device.index)
    ...         return {"probs": torch.sigmoid(self.model(z))}
    >>> p = Generator()
    >>> q = Inference()
    >>> prior = Normal(loc=torch.tensor(0.), scale=torch.tensor(1.),
    ...                var=["z"], features_shape=[64], name="p_{prior}")
    >>> # Define a loss function (Loss API)
    >>> reconst = -p.log_prob().expectation(q)
    >>> kl = KullbackLeibler(q,prior)
    >>> batch_loss_cls = (reconst - kl)
    >>> # device settings
    >>> device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    >>> device_count = torch.cuda.device_count()
    >>> if device_count > 1:
    ...     loss_cls = DataParalleledLoss(batch_loss_cls).mean().to(device)
    ... else:
    ...     loss_cls = batch_loss_cls.mean().to(device)
    >>> # Set a model (Model API)
    >>> model = Model(loss=loss_cls, distributions=[p, q],
    ...               optimizer=optim.Adam, optimizer_params={"lr": 1e-3})
    >>> # Train and test the model
    >>> data = torch.randn(2, 128).to(device)  # Pseudo data
    >>> train_loss = model.train({"x": data})
    >>> expected = set(range(device_count)) if torch.cuda.is_available() else {None}
    >>> assert used_gpu_i==expected
    >>> assert used_gpu_g==expected
    """
    def __init__(self, loss, distributed=False, **kwargs):
        super().__init__(loss.input_var)
        if distributed:
            self.paralleled = DistributedDataParallel(loss, **kwargs)
        else:
            self.paralleled = DataParallel(loss, **kwargs)

    def forward(self, x_dict, **kwargs):
        return self.paralleled.forward(x_dict, **kwargs)

    @property
    def _symbol(self):
        return self.paralleled.module._symbol

    def __getattr__(self, name):
        try:
            return super().__getattr__(name)
        except AttributeError:
            return getattr(self.paralleled.module, name)
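# The __getattr__ fallback above forwards attribute lookups to the wrapped loss, because
# torch.nn.DataParallel keeps the original object under `.module`. A hedged, generic
# sketch of the same delegation pattern (independent of pixyz; WrappedSketch is a
# hypothetical name) follows:
import torch
from torch.nn import DataParallel

class WrappedSketch(torch.nn.Module):
    def __init__(self, module):
        super().__init__()
        self.paralleled = DataParallel(module)

    def __getattr__(self, name):
        try:
            # nn.Module resolves registered submodules/parameters here first
            return super().__getattr__(name)
        except AttributeError:
            # anything the wrapper does not define is taken from the wrapped module
            return getattr(self.paralleled.module, name)

inner = torch.nn.Linear(4, 2)
wrapped = WrappedSketch(inner)
assert wrapped.out_features == 2  # delegated to the inner Linear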
Example #25
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length,
                                                  train_num_each)

    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 800
    # num_val_we_use = 80

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)

    num_train_all = len(train_idx)
    num_val_all = len(val_idx)

    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))

    train_loader = DataLoader(train_dataset,
                              batch_size=train_batch_size,
                              sampler=train_idx,
                              num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=val_idx,
                            num_workers=workers,
                            pin_memory=False)

    model_old = multi_lstm()
    model_old = DataParallel(model_old)
    model_old.load_state_dict(
        torch.load(
            "cnn_lstm_epoch_25_length_10_opt_1_mulopt_1_flip_0_crop_1_batch_400_train1_9997_train2_9982_val1_9744_val2_8876.pth"
        ))

    model = multi_lstm_p2t()
    model.share = model_old.module.share
    model.lstm = model_old.module.lstm
    model.fc = model_old.module.fc
    model.fc2 = model_old.module.fc2

    model = DataParallel(model)
    for param in model.module.fc_p2t.parameters():
        param.requires_grad = False
    model.module.fc_p2t.load_state_dict(
        torch.load(
            "fc_epoch_25_length_4_opt_1_mulopt_1_flip_0_crop_1_batch_800_train1_9951_train2_9713_val1_9686_val2_7867_p2t.pth"
        ))

    if use_gpu:
        model = model.cuda()
        model.module.fc_p2t = model.module.fc_p2t.cuda()

    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)
    criterion_2 = nn.CrossEntropyLoss(size_average=False)
    criterion_3 = nn.KLDivLoss(size_average=False)
    sigmoid_cuda = nn.Sigmoid().cuda()

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD([{
                'params': model.module.share.parameters()
            }, {
                'params': model.module.lstm.parameters(),
            }, {
                'params': model.module.fc.parameters()
            }, {
                'params': model.module.fc2.parameters()
            }],
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([{
                'params': model.module.share.parameters()
            }, {
                'params': model.module.lstm.parameters(),
            }, {
                'params': model.module.fc.parameters()
            }, {
                'params': model.module.fc2.parameters()
            }],
                                   lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([{
                'params': model.module.share.parameters()
            }, {
                'params': model.module.lstm.parameters(),
                'lr': learning_rate
            }, {
                'params': model.module.fc.parameters(),
                'lr': learning_rate
            }],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([{
                'params': model.module.share.parameters()
            }, {
                'params': model.module.lstm.parameters(),
                'lr': learning_rate
            }, {
                'params': model.module.fc.parameters(),
                'lr': learning_rate
            }, {
                'params': model.module.fc2.parameters(),
                'lr': learning_rate
            }],
                                   lr=learning_rate / 10)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy_1 = 0.0
    best_val_accuracy_2 = 0.0
    correspond_train_acc_1 = 0.0
    correspond_train_acc_2 = 0.0

    # record per epoch: 3 train accuracies, 3 train losses, 3 valid accuracies and 3 valid losses (12 values)
    record_np = np.zeros([epochs, 12])

    for epoch in range(epochs):
        # np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)

        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=train_idx,
                                  num_workers=workers,
                                  pin_memory=False)

        model.train()
        train_loss_1 = 0.0
        train_loss_2 = 0.0
        train_loss_3 = 0.0
        train_corrects_1 = 0
        train_corrects_2 = 0
        train_corrects_3 = 0

        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels_1 = Variable(labels_1.cuda())
                labels_2 = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels_1 = Variable(labels_1)
                labels_2 = Variable(labels_2)

            optimizer.zero_grad()

            outputs_1, outputs_2, outputs_3 = model.forward(inputs)

            _, preds_2 = torch.max(outputs_2.data, 1)
            train_corrects_2 += torch.sum(preds_2 == labels_2.data)

            sig_output_1 = sigmoid_cuda(outputs_1)
            sig_output_3 = sigmoid_cuda(outputs_3)

            sig_average = (sig_output_1.data + sig_output_3.data) / 2

            preds_1 = torch.cuda.ByteTensor(sig_output_1.data > 0.5)
            preds_1 = preds_1.long()
            train_corrects_1 += torch.sum(preds_1 == labels_1.data)

            preds_3 = torch.cuda.ByteTensor(sig_average > 0.5)
            preds_3 = preds_3.long()
            train_corrects_3 += torch.sum(preds_3 == labels_1.data)

            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)

            sig_output_3 = Variable(sig_output_3.data, requires_grad=False)
            loss_3 = torch.abs(criterion_3(sig_output_1, sig_output_3))
            loss = loss_1 + loss_2 + loss_3 * alpha
            loss.backward()
            optimizer.step()

            train_loss_1 += loss_1.data[0]
            train_loss_2 += loss_2.data[0]
            train_loss_3 += loss_3.data[0]

        train_elapsed_time = time.time() - train_start_time
        train_accuracy_1 = train_corrects_1 / num_train_all / 7
        train_accuracy_2 = train_corrects_2 / num_train_all
        train_accuracy_3 = train_corrects_3 / num_train_all / 7
        train_average_loss_1 = train_loss_1 / num_train_all / 7
        train_average_loss_2 = train_loss_2 / num_train_all
        train_average_loss_3 = train_loss_3 / num_train_all

        # begin eval

        model.eval()
        val_loss_1 = 0.0
        val_loss_2 = 0.0
        val_loss_3 = 0.0
        val_corrects_1 = 0
        val_corrects_2 = 0
        val_corrects_3 = 0

        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
            labels_2 = labels_2[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda(), volatile=True)
                labels_1 = Variable(labels_1.cuda(), volatile=True)
                labels_2 = Variable(labels_2.cuda(), volatile=True)
            else:
                inputs = Variable(inputs, volatile=True)
                labels_1 = Variable(labels_1, volatile=True)
                labels_2 = Variable(labels_2, volatile=True)

            outputs_1, outputs_2, outputs_3 = model.forward(inputs)
            outputs_2 = outputs_2[(sequence_length - 1)::sequence_length]
            _, preds_2 = torch.max(outputs_2.data, 1)
            val_corrects_2 += torch.sum(preds_2 == labels_2.data)

            sig_output_1 = sigmoid_cuda(outputs_1)
            sig_output_3 = sigmoid_cuda(outputs_3)

            sig_average = (sig_output_1.data + sig_output_3.data) / 2

            preds_1 = torch.cuda.ByteTensor(sig_output_1.data > 0.5)
            preds_1 = preds_1.long()
            val_corrects_1 += torch.sum(preds_1 == labels_1.data)

            preds_3 = torch.cuda.ByteTensor(sig_average > 0.5)
            preds_3 = preds_3.long()
            val_corrects_3 += torch.sum(preds_3 == labels_1.data)

            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)

            sig_output_3 = Variable(sig_output_3.data, requires_grad=False)
            loss_3 = torch.abs(criterion_3(sig_output_1, sig_output_3))

            val_loss_1 += loss_1.data[0]
            val_loss_2 += loss_2.data[0]
            val_loss_3 += loss_3.data[0]

        val_elapsed_time = time.time() - val_start_time
        val_accuracy_1 = val_corrects_1 / (num_val_all * 7)
        val_accuracy_2 = val_corrects_2 / num_val_we_use
        val_accuracy_3 = val_corrects_3 / (num_val_all * 7)
        val_average_loss_1 = val_loss_1 / (num_val_all * 7)
        val_average_loss_2 = val_loss_2 / num_val_we_use
        val_average_loss_3 = val_loss_3 / num_val_all

        print('epoch: {:3d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train accu_1: {:.4f}'
              ' train accu_3: {:.4f}'
              ' train accu_2: {:.4f}'
              ' train loss_1: {:4.4f}'
              ' train loss_2: {:4.4f}'
              ' train loss_3: {:4.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_accuracy_1, train_accuracy_3, train_accuracy_2,
                  train_average_loss_1, train_average_loss_2,
                  train_average_loss_3))
        print('epoch: {:3d}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid accu_1: {:.4f}'
              ' valid accu_3: {:.4f}'
              ' valid accu_2: {:.4f}'
              ' valid loss_1: {:4.4f}'
              ' valid loss_2: {:4.4f}'
              ' valid loss_3: {:4.4f}'.format(
                  epoch, val_elapsed_time // 60, val_elapsed_time % 60,
                  val_accuracy_1, val_accuracy_3, val_accuracy_2,
                  val_average_loss_1, val_average_loss_2, val_average_loss_3))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2 +
                                      alpha * val_average_loss_3)

        if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95:
            best_val_accuracy_2 = val_accuracy_2
            best_val_accuracy_1 = val_accuracy_1
            correspond_train_acc_1 = train_accuracy_1
            correspond_train_acc_2 = train_accuracy_2
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95:
            if val_accuracy_1 > best_val_accuracy_1:
                correspond_train_acc_1 = train_accuracy_1
                correspond_train_acc_2 = train_accuracy_2
                best_model_wts = copy.deepcopy(model.state_dict())
            elif val_accuracy_1 == best_val_accuracy_1:
                if train_accuracy_2 > correspond_train_acc_2:
                    correspond_train_acc_2 = train_accuracy_2
                    correspond_train_acc_1 = train_accuracy_1
                    best_model_wts = copy.deepcopy(model.state_dict())
                elif train_accuracy_2 == correspond_train_acc_2:
                    if train_accuracy_1 > correspond_train_acc_1:
                        correspond_train_acc_1 = train_accuracy_1
                        best_model_wts = copy.deepcopy(model.state_dict())

        if val_accuracy_2 > 0.885:
            save_val_1 = int("{:4.0f}".format(val_accuracy_1 * 10000))
            save_val_2 = int("{:4.0f}".format(val_accuracy_2 * 10000))
            save_train_1 = int("{:4.0f}".format(train_accuracy_1 * 10000))
            save_train_2 = int("{:4.0f}".format(train_accuracy_2 * 10000))
            public_name = "cnn_lstm_p2t" \
                          + "_epoch_" + str(epochs) \
                          + "_length_" + str(sequence_length) \
                          + "_opt_" + str(optimizer_choice) \
                          + "_mulopt_" + str(multi_optim) \
                          + "_flip_" + str(use_flip) \
                          + "_crop_" + str(crop_type) \
                          + "_batch_" + str(train_batch_size) \
                          + "_train1_" + str(save_train_1) \
                          + "_train2_" + str(save_train_2) \
                          + "_val1_" + str(save_val_1) \
                          + "_val2_" + str(save_val_2)
            model_name = public_name + ".pth"
            torch.save(best_model_wts, model_name)

        record_np[epoch, 0] = train_accuracy_1
        record_np[epoch, 1] = train_accuracy_3
        record_np[epoch, 2] = train_accuracy_2
        record_np[epoch, 3] = train_average_loss_1
        record_np[epoch, 4] = train_average_loss_2
        record_np[epoch, 5] = train_average_loss_3

        record_np[epoch, 6] = val_accuracy_1
        record_np[epoch, 7] = val_accuracy_3
        record_np[epoch, 8] = val_accuracy_2
        record_np[epoch, 9] = val_average_loss_1
        record_np[epoch, 10] = val_average_loss_2
        record_np[epoch, 11] = val_average_loss_3

    print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format(
        best_val_accuracy_1, correspond_train_acc_1))
    print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format(
        best_val_accuracy_2, correspond_train_acc_2))

    # save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000))
    # save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000))
    # save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000))
    # save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000))
    # public_name = "cnn_lstm_p2t" \
    #               + "_epoch_" + str(epochs) \
    #               + "_length_" + str(sequence_length) \
    #               + "_opt_" + str(optimizer_choice) \
    #               + "_mulopt_" + str(multi_optim) \
    #               + "_flip_" + str(use_flip) \
    #               + "_crop_" + str(crop_type) \
    #               + "_batch_" + str(train_batch_size) \
    #               + "_train1_" + str(save_train_1) \
    #               + "_train2_" + str(save_train_2) \
    #               + "_val1_" + str(save_val_1) \
    #               + "_val2_" + str(save_val_2)
    # model_name = public_name + ".pth"
    # torch.save(best_model_wts, model_name)

    record_name = public_name + ".npy"
    np.save(record_name, record_np)
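# The tool-presence branch above (labels_1 / outputs_1) is a 7-way multi-label problem:
# predictions come from thresholding the sigmoid outputs at 0.5 and accuracy is the
# fraction of correct elements over num_samples * 7. A hedged, self-contained sketch of
# that accuracy computation (names are illustrative, not from the code above):
import torch

def multilabel_accuracy_sketch(logits, labels, threshold=0.5):
    # logits: [N, 7] raw outputs, labels: [N, 7] of 0/1
    probs = torch.sigmoid(logits)
    preds = (probs > threshold).long()
    correct = (preds == labels.long()).sum().item()
    return correct / labels.numel()  # divides by N * 7, as in train_model above

toy_logits = torch.tensor([[2.0, -1.0, 0.5, -3.0, 1.2, -0.7, 0.1]])
toy_labels = torch.tensor([[1, 0, 1, 0, 1, 0, 1]])
print(multilabel_accuracy_sketch(toy_logits, toy_labels))  # 1.0 for this toy example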
Example #26
def test_model(test_dataset, test_num_each):
    num_test = len(test_dataset)
    test_idx = [i for i in range(num_test)]
    print('num of test dataset: {:6d}'.format(num_test))
    test_loader = DataLoader(test_dataset,
                             batch_size=test_batch_size,
                             sampler=test_idx,
                             num_workers=workers,
                             pin_memory=False)
    model = multi_resnet()
    model = DataParallel(model)
    model.load_state_dict(torch.load(model_name))
    if use_gpu:
        model = model.cuda()
    criterion = nn.BCEWithLogitsLoss(size_average=False)
    sig_f = nn.Sigmoid()
    model.eval()
    test_loss = 0.0
    test_corrects = 0
    all_preds = []
    test_start_time = time.time()
    for data in test_loader:
        inputs, labels_1, labels_2 = data
        if use_gpu:
            inputs = Variable(inputs.cuda(), volatile=True)
            labels = Variable(labels_1.cuda(), volatile=True)
        else:
            inputs = Variable(inputs, volatile=True)
            labels = Variable(labels_1, volatile=True)

        if crop_type == 0 or crop_type == 1:
            outputs = model.forward(inputs)
        elif crop_type == 5:
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs = model.forward(inputs)
            outputs = outputs.view(5, -1, 7)
            outputs = torch.mean(outputs, 0)
        elif crop_type == 10:
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs = model.forward(inputs)
            outputs = outputs.view(10, -1, 7)
            outputs = torch.mean(outputs, 0)

        for i in range(len(outputs)):
            all_preds.append(outputs[i].data.cpu().numpy().tolist())
        sig_out = outputs.data.cpu()
        sig_out = sig_f(sig_out)
        predict = torch.ByteTensor(sig_out > 0.5)
        predict = predict.long()
        test_corrects += torch.sum(predict == labels.data.cpu())
        labels = Variable(labels.data.float())
        loss = criterion(outputs, labels)
        test_loss += loss.data[0]
        # print(test_corrects)
    test_elapsed_time = time.time() - test_start_time
    test_accuracy = test_corrects / num_test / 7
    test_average_loss = test_loss / num_test / 7

    save_test = int("{:4.0f}".format(test_accuracy * 10000))
    pred_name = model_pure_name + '_test_' + str(save_test) + '_crop_' + str(
        crop_type) + '.pkl'
    with open(pred_name, 'wb') as f:
        pickle.dump(all_preds, f)
    print('test elapsed: {:2.0f}m{:2.0f}s'
          ' test loss: {:4.4f}'
          ' test accu: {:.4f}'.format(test_elapsed_time // 60,
                                      test_elapsed_time % 60,
                                      test_average_loss, test_accuracy))
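# For crop_type == 10 above, each sample arrives as 10 crops; the code folds the crop
# dimension into the batch, runs the model once, then reshapes to (10, batch, 7) and
# averages over the crops. A hedged sketch of that reshaping with a stand-in model
# (tencrop_average_sketch and dummy_model are illustrative names):
import torch

def tencrop_average_sketch(inputs, model):
    # inputs: [batch, 10, 3, 224, 224] -> [10 * batch, 3, 224, 224]
    inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
    inputs = inputs.view(-1, 3, 224, 224)
    outputs = model(inputs)              # [10 * batch, 7]
    outputs = outputs.view(10, -1, 7)    # [10, batch, 7]
    return torch.mean(outputs, 0)        # average the 10 crop predictions per sample

dummy_model = lambda x: torch.zeros(x.size(0), 7)  # stand-in for multi_resnet
out = tencrop_average_sketch(torch.randn(2, 10, 3, 224, 224), dummy_model)
print(out.shape)  # torch.Size([2, 7])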
Example #27
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/eval.json',
                        type=str,
                        required=False,
                        help='raw corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized_eval/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='whether to tokenize the raw data first')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='batch size')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report every N steps')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='window stride when sampling data')
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the corpus into')
    parser.add_argument('--min_length',
                        default=128,
                        type=int,
                        required=False,
                        help='minimum article length to include')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='path to the starting model')
    parser.add_argument('--no_wordpiece',
                        action='store_true',
                        help='do not use WordPiece tokenization')
    parser.add_argument('--output_dir',
                        default='eval_result/',
                        type=str,
                        required=False,
                        help='output directory for results')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.no_wordpiece:
        import tokenization_bert_without_wordpiece as tokenization_bert
    else:
        import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses

    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to rebuild the tokenized dataset from scratch
    batch_size = args.batch_size
    log_step = args.log_step
    stride = args.stride
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    if not args.pretrained_model:
        print('you need to specify a trained model.')
        exit(1)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.eval()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    overall_step = 0

    total_loss = 0
    total_steps = 0
    #  eval
    now = datetime.now()
    print('time: {}'.format(now))
    piece_num = 0
    for i in range(num_pieces):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            line = f.read().strip()
        tokens = line.split()
        tokens = [int(token) for token in tokens]
        start_point = 0
        samples = []
        while start_point < len(tokens) - n_ctx:
            samples.append(tokens[start_point:start_point + n_ctx])
            start_point += stride
        start_point -= stride
        last = tokens[start_point + n_ctx:]
        last.extend(
            full_tokenizer.convert_tokens_to_ids(['[PAD]']) *
            (n_ctx - len(last)))
        random.shuffle(samples)
        for step in range(len(samples) // batch_size):  # drop last

            #  prepare data
            batch = samples[step * batch_size:(step + 1) * batch_size]
            batch_labels = []
            batch_inputs = []
            for ids in batch:
                int_ids_for_labels = [int(x) for x in ids]
                int_ids_for_inputs = [int(x) for x in ids]
                batch_labels.append(int_ids_for_labels)
                batch_inputs.append(int_ids_for_inputs)
            batch_labels = torch.tensor(batch_labels).long().to(device)
            batch_inputs = torch.tensor(batch_inputs).long().to(device)

            #  forward pass
            outputs = model.forward(input_ids=batch_inputs,
                                    labels=batch_labels)
            loss, logits = outputs[:2]

            #  get loss
            if multi_gpu:
                loss = loss.mean()
            total_loss += loss
            total_steps += 1

            if (overall_step + 1) % log_step == 0:
                print('now time: {}:{}. Step {} of piece {}, ppl {}'.format(
                    datetime.now().hour,
                    datetime.now().minute, (step + 1), piece_num,
                    torch.exp(loss)))
        piece_num += 1

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    with open(args.output_dir + 'result.txt', 'w') as f:
        f.write(str(np.exp(float(total_loss / total_steps))))
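# The evaluation loop above accumulates the per-batch language-model loss and reports
# perplexity as exp(mean loss). A hedged toy example of that relationship (the loss
# values are made up for illustration):
import math

losses = [3.2, 2.9, 3.0]            # per-batch cross-entropy losses
avg_loss = sum(losses) / len(losses)
perplexity = math.exp(avg_loss)     # ppl = exp(average token-level loss)
print(round(avg_loss, 4), round(perplexity, 2))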
Example #28
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--device", default="0,1,2,3", type=str, required=False, help="设置使用哪些显卡"
    )
    parser.add_argument(
        "--model_config",
        default="config/model_config_small.json",
        type=str,
        required=False,
        help="选择模型参数",
    )
    parser.add_argument(
        "--tokenizer_path",
        default="cache/vocab_small.txt",
        type=str,
        required=False,
        help="选择词库",
    )
    parser.add_argument(
        "--raw_data_path",
        default="data/train.json",
        type=str,
        required=False,
        help="原始训练语料",
    )
    parser.add_argument(
        "--tokenized_data_path",
        default="data/tokenized/",
        type=str,
        required=False,
        help="tokenized语料存放位置",
    )
    parser.add_argument("--raw", action="store_true", help="是否先做tokenize")
    parser.add_argument("--epochs", default=5, type=int, required=False, help="训练循环")
    parser.add_argument(
        "--batch_size", default=8, type=int, required=False, help="训练batch size"
    )
    parser.add_argument("--lr", default=1.5e-4, type=float, required=False, help="学习率")
    parser.add_argument(
        "--warmup_steps", default=2000, type=int, required=False, help="warm up步数"
    )
    parser.add_argument(
        "--log_step",
        default=1,
        type=int,
        required=False,
        help="多少步汇报一次loss,设置为gradient accumulation的整数倍",
    )
    parser.add_argument(
        "--stride", default=768, type=int, required=False, help="训练时取训练数据的窗口步长"
    )
    parser.add_argument(
        "--gradient_accumulation", default=1, type=int, required=False, help="梯度积累"
    )
    parser.add_argument("--fp16", action="store_true", help="混合精度")
    parser.add_argument("--fp16_opt_level", default="O1", type=str, required=False)
    parser.add_argument("--max_grad_norm", default=1.0, type=float, required=False)
    parser.add_argument(
        "--num_pieces", default=100, type=int, required=False, help="将训练语料分成多少份"
    )
    parser.add_argument(
        "--min_length", default=128, type=int, required=False, help="最短收录文章长度"
    )
    parser.add_argument(
        "--output_dir", default="model/", type=str, required=False, help="模型输出路径"
    )
    parser.add_argument(
        "--pretrained_model", default="", type=str, required=False, help="模型训练起点路径"
    )
    parser.add_argument(
        "--writer_dir",
        default="tensorboard_summary/",
        type=str,
        required=False,
        help="Tensorboard路径",
    )
    parser.add_argument("--segment", action="store_true", help="中文以词为单位")
    parser.add_argument("--bpe_token", action="store_true", help="subword")
    parser.add_argument(
        "--encoder_json",
        default="tokenizations/encoder.json",
        type=str,
        help="encoder.json",
    )
    parser.add_argument(
        "--vocab_bpe", default="tokenizations/vocab.bpe", type=str, help="vocab.bpe"
    )

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config
    )
    print("config:\n" + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("using device:", device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to rebuild the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print("building files")
        build_files(
            data_path=raw_data_path,
            tokenized_data_path=tokenized_data_path,
            num_pieces=num_pieces,
            full_tokenizer=full_tokenizer,
            min_length=min_length,
        )
        print("files built")

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model
        )
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print("number of parameters: {}".format(num_parameters))

    multi_gpu = False
    full_len = 0
    print("calculating total steps")
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + "tokenized_train_{}.txt".format(i), "r") as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print("total steps = {}".format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps
    )
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model, device_ids=[int(i) for i in args.device.split(",")])
        multi_gpu = True
    print("starting training")
    overall_step = 0
    running_loss = 0
    saving_time = datetime.now()
    for epoch in range(epochs):
        print("epoch {}".format(epoch + 1))
        now = datetime.now()
        print("time: {}".format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(
                tokenized_data_path + "tokenized_train_{}.txt".format(i), "r"
            ) as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point : start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx :])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                #  prepare data
                batch = samples[step * batch_size : (step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm
                        )
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                #  optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar(
                        "loss", loss.item() * gradient_accumulation, overall_step
                    )
                    print(
                        "now time: {}:{}. Step {} of piece {} of epoch {}, loss {}".format(
                            datetime.now().hour,
                            datetime.now().minute,
                            step + 1,
                            piece_num,
                            epoch + 1,
                            running_loss
                            * gradient_accumulation
                            / (log_step / gradient_accumulation),
                        )
                    )
                    running_loss = 0
                delta_time = datetime.now() - saving_time
                if delta_time.seconds > 1800:
                    print("saving model for epoch {}".format(epoch + 1))
                    if not os.path.exists(
                        output_dir + "model_epoch{}".format(epoch + 1)
                    ):
                        os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(
                        output_dir + "model_epoch{}".format(epoch + 1)
                    )
                    saving_time = datetime.now()
                overall_step += 1
            piece_num += 1

        print("saving model for epoch {}".format(epoch + 1))
        if not os.path.exists(output_dir + "model_epoch{}".format(epoch + 1)):
            os.mkdir(output_dir + "model_epoch{}".format(epoch + 1))
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir + "model_epoch{}".format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print("epoch {} finished".format(epoch + 1))

        then = datetime.now()
        print("time: {}".format(then))
        print("time for one epoch: {}".format(then - now))

    print("training finished")
    if not os.path.exists(output_dir + "final_model"):
        os.mkdir(output_dir + "final_model")
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir + "final_model")
Example #29
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json',
                        type=str,
                        required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path',
                        default='cache/vocab_small.txt',
                        type=str,
                        required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path',
                        default='data/train.json',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path',
                        default='data/tokenized/',
                        type=str,
                        required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='whether to tokenize the raw data first')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps',
                        default=2000,
                        type=int,
                        required=False,
                        help='number of warmup steps')
    parser.add_argument('--log_step',
                        default=1,
                        type=int,
                        required=False,
                        help='report loss every N steps')
    parser.add_argument('--stride',
                        default=768,
                        type=int,
                        required=False,
                        help='window stride when sampling training data')
    parser.add_argument('--gradient_accumulation',
                        default=1,
                        type=int,
                        required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level',
                        default='O1',
                        type=str,
                        required=False)
    parser.add_argument('--max_grad_norm',
                        default=1.0,
                        type=float,
                        required=False)
    parser.add_argument('--num_pieces',
                        default=100,
                        type=int,
                        required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output directory')
    parser.add_argument('--pretrained_model',
                        default='',
                        type=str,
                        required=False,
                        help='path to the pretrained starting model')
    parser.add_argument('--no_wordpiece',
                        action='store_true',
                        help='do not use WordPiece tokenization')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.no_wordpiece:
        import tokenization_bert_without_wordpiece as tokenization_bert
    else:
        import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # restrict which GPUs the program may use
    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    output_dir = args.output_dir

    if raw:
        print('building files')
        build_files(raw_data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    num_pieces=num_pieces)
        print('files built')

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
            config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)
    multi_gpu = False
    full_len = 0
    print('calculating total steps')
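    # The loop below counts the tokens in every tokenized piece; each piece
    # yields roughly len(tokens) / stride training windows, so the optimizer
    # takes about full_len / stride * epochs / batch_size / gradient_accumulation
    # steps in total.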
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(),
                                           lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
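    # WarmupLinearSchedule raises the learning rate linearly from 0 to lr over
    # warmup_steps, then decays it linearly back towards 0 over the remaining
    # total_steps.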
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True
    print('starting training')
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            running_loss = 0
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
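            # Sliding-window sampling: each sample is n_ctx tokens long and,
            # when stride < n_ctx, consecutive windows overlap by n_ctx - stride
            # tokens; tokens after the last full window are discarded.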
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):

                #  prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                #  forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                #  get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation
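                #  dividing by gradient_accumulation keeps the accumulated
                #  gradient equal to the average over the accumulated batches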

                #  loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                #  optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                if (step + 1) % log_step == 0:
                    print(
                        'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'
                        .format(
                            datetime.now().hour,
                            datetime.now().minute,
                            (step + 1) // gradient_accumulation, piece_num,
                            epoch + 1,
                            running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
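
# A typical invocation might look like the following (the script filename is
# hypothetical; the flags match the argparse definitions above):
#   python train.py --raw --epochs 5 --batch_size 8 --lr 1.5e-4 \
#       --model_config config/model_config_small.json \
#       --tokenizer_path cache/vocab_small.txt \
#       --raw_data_path data/train.json --output_dir model/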
Example #30
0
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length,
                                                  train_num_each)

    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 4
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)

    num_train_all = len(train_idx)
    num_val_all = len(val_idx)

    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))

    train_loader = DataLoader(train_dataset,
                              batch_size=train_batch_size,
                              sampler=train_idx,
                              num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=val_idx,
                            num_workers=workers,
                            pin_memory=False)
    model = dense_lstm()
    sig_f = nn.Sigmoid()

    if use_gpu:
        model = model.cuda()
        sig_f = sig_f.cuda()
    model = DataParallel(model)
    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)
    criterion_2 = nn.CrossEntropyLoss(size_average=False)

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(),
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {
                    'params': model.module.features.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc2.parameters(),
                    'lr': learning_rate
                },
            ],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
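            # param groups without an explicit 'lr' fall back to the base
            # lr = learning_rate / 10, so the pretrained feature extractor is
            # updated ten times more slowly than the LSTM / fc heads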
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                                       step_size=sgd_step,
                                                       gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {
                    'params': model.module.features.parameters()
                },
                {
                    'params': model.module.lstm.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc.parameters(),
                    'lr': learning_rate
                },
                {
                    'params': model.module.fc2.parameters(),
                    'lr': learning_rate
                },
            ],
                                   lr=learning_rate / 10)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy_1 = 0.0
    best_val_accuracy_2 = 0.0  # judge by accu2
    correspond_train_acc_1 = 0.0
    correspond_train_acc_2 = 0.0

    record_np = np.zeros([epochs, 8])

    for epoch in range(epochs):
        # np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)

        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=train_idx,
                                  num_workers=workers,
                                  pin_memory=False)

        model.train()
        train_loss_1 = 0.0
        train_loss_2 = 0.0
        train_corrects_1 = 0
        train_corrects_2 = 0

        train_start_time = time.time()
        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels_1 = Variable(labels_1.cuda())
                labels_2 = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels_1 = Variable(labels_1)
                labels_2 = Variable(labels_2)

            optimizer.zero_grad()

            outputs_1, outputs_2 = model.forward(inputs)

            _, preds_2 = torch.max(outputs_2.data, 1)

            sig_out = sig_f(outputs_1.data)
            preds_1 = torch.ByteTensor(sig_out.cpu() > 0.5)
            preds_1 = preds_1.long()
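            # (each sigmoid output above is thresholded at 0.5 independently,
            # giving one binary prediction per label)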
            train_corrects_1 += torch.sum(preds_1 == labels_1.data.cpu())
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)

            loss_2 = criterion_2(outputs_2, labels_2)
            loss = loss_1 + loss_2
            loss.backward()
            optimizer.step()

            train_loss_1 += loss_1.data[0]
            train_loss_2 += loss_2.data[0]
            train_corrects_2 += torch.sum(preds_2 == labels_2.data)

        train_elapsed_time = time.time() - train_start_time
        train_accuracy_1 = train_corrects_1 / num_train_all / 7
        train_accuracy_2 = train_corrects_2 / num_train_all
        train_average_loss_1 = train_loss_1 / num_train_all / 7
        train_average_loss_2 = train_loss_2 / num_train_all
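        # the divisions by 7 above assume labels_1 holds 7 binary labels per
        # frame (e.g. tool-presence classes), so accuracy_1 and loss_1 are
        # averaged per label rather than per frame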

        # begin eval

        model.eval()
        val_loss_1 = 0.0
        val_loss_2 = 0.0
        val_corrects_1 = 0
        val_corrects_2 = 0

        val_start_time = time.time()
        for data in val_loader:
            inputs, labels_1, labels_2 = data
            labels_2 = labels_2[(sequence_length - 1)::sequence_length]
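            # keep only the label of the last frame in each sequence; the
            # corresponding outputs_2 are subsampled the same way further down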
            if use_gpu:
                inputs = Variable(inputs.cuda(), volatile=True)
                labels_1 = Variable(labels_1.cuda(), volatile=True)
                labels_2 = Variable(labels_2.cuda(), volatile=True)
            else:
                inputs = Variable(inputs, volatile=True)
                labels_1 = Variable(labels_1, volatile=True)
                labels_2 = Variable(labels_2, volatile=True)

            if crop_type == 0 or crop_type == 1:
                outputs_1, outputs_2 = model.forward(inputs)
            elif crop_type == 5:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(5, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(5, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(10, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(10, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)

            outputs_2 = outputs_2[sequence_length - 1::sequence_length]
            _, preds_2 = torch.max(outputs_2.data, 1)

            sig_out = sig_f(outputs_1.data)
            preds_1 = torch.ByteTensor(sig_out.cpu() > 0.5)
            preds_1 = preds_1.long()
            val_corrects_1 += torch.sum(preds_1 == labels_1.data.cpu())
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            val_loss_1 += loss_1.data[0]

            loss_2 = criterion_2(outputs_2, labels_2)
            val_loss_2 += loss_2.data[0]
            val_corrects_2 += torch.sum(preds_2 == labels_2.data)

        val_elapsed_time = time.time() - val_start_time
        val_accuracy_1 = val_corrects_1 / (num_val_all * 7)
        val_accuracy_2 = val_corrects_2 / num_val_we_use
        val_average_loss_1 = val_loss_1 / (num_val_all * 7)
        val_average_loss_2 = val_loss_2 / num_val_we_use

        print('epoch: {:4d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train loss_1: {:4.4f}'
              ' train accu_1: {:.4f}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid loss_1: {:4.4f}'
              ' valid accu_1: {:.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_average_loss_1, train_accuracy_1,
                  val_elapsed_time // 60, val_elapsed_time % 60,
                  val_average_loss_1, val_accuracy_1))
        print('epoch: {:4d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train loss_2: {:4.4f}'
              ' train accu_2: {:.4f}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid loss_2: {:4.4f}'
              ' valid accu_2: {:.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_average_loss_2, train_accuracy_2,
                  val_elapsed_time // 60, val_elapsed_time % 60,
                  val_average_loss_2, val_accuracy_2))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2)

        if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95:
            best_val_accuracy_2 = val_accuracy_2
            best_val_accuracy_1 = val_accuracy_1
            correspond_train_acc_1 = train_accuracy_1
            correspond_train_acc_2 = train_accuracy_2
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95:
            if val_accuracy_1 > best_val_accuracy_1:
                best_val_accuracy_1 = val_accuracy_1
                correspond_train_acc_1 = train_accuracy_1
                correspond_train_acc_2 = train_accuracy_2
                best_model_wts = copy.deepcopy(model.state_dict())
            elif val_accuracy_1 == best_val_accuracy_1:
                if train_accuracy_2 > correspond_train_acc_2:
                    correspond_train_acc_2 = train_accuracy_2
                    correspond_train_acc_1 = train_accuracy_1
                    best_model_wts = copy.deepcopy(model.state_dict())
                elif train_accuracy_2 == correspond_train_acc_2:
                    if train_accuracy_1 > correspond_train_acc_1:
                        correspond_train_acc_1 = train_accuracy_1
                        best_model_wts = copy.deepcopy(model.state_dict())

        record_np[epoch, 0] = train_accuracy_1
        record_np[epoch, 1] = train_accuracy_2
        record_np[epoch, 2] = train_average_loss_1
        record_np[epoch, 3] = train_average_loss_2
        record_np[epoch, 4] = val_accuracy_1
        record_np[epoch, 5] = val_accuracy_2
        record_np[epoch, 6] = val_average_loss_1
        record_np[epoch, 7] = val_average_loss_2

    print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format(
        best_val_accuracy_1, correspond_train_acc_1))
    print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format(
        best_val_accuracy_2, correspond_train_acc_2))
    save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000))
    save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000))
    save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000))
    save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000))
    public_name = "dense_lstm" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train1_" + str(save_train_1) \
                  + "_train2_" + str(save_train_2) \
                  + "_val1_" + str(save_val_1) \
                  + "_val2_" + str(save_val_2)
    model_name = public_name + ".pth"
    torch.save(best_model_wts, model_name)

    record_name = public_name + ".npy"
    np.save(record_name, record_np)