def test_model(test_dataset):
    num_test = len(test_dataset)
    test_useful_end_idx = get_useful_end_idx(sequence_length, num_test)

    test_idx = []
    for i in test_useful_end_idx:
        for j in range(sequence_length):
            test_idx.append(i - j * srate)
    test_idx.reverse()

    test_loader = DataLoader(
        test_dataset,
        batch_size=test_batch_size,
        sampler=SeqSampler(test_dataset, test_idx),
        # sampler=test_idx,
        num_workers=0,
        pin_memory=False)

    model = res34_tcn()
    model = DataParallel(model)
    model.load_state_dict(torch.load(model_name))
    # model = model.module
    # model = DataParallel(model)

    if use_gpu:
        model = model.cuda()
        # model = DataParallel(model)
        # model = model.module

    model.eval()

    all_preds_s = []
    num = 0
    with torch.no_grad():
        for data in test_loader:
            num = num + 1
            inputs, _, kdatas = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                kdatas = Variable(kdatas.cuda())
            else:
                inputs = Variable(inputs)
                kdatas = Variable(kdatas)

            outputs_s = model.forward(inputs, kdatas)
            # outputs_s = outputs_s[-1, (sequence_length - 1):: sequence_length]
            outputs_s = outputs_s[-1]
            outputs_s = F.softmax(outputs_s, dim=-1)
            _, preds_s = torch.max(outputs_s.data, -1)

            for j in range(preds_s.shape[0]):
                all_preds_s.append(preds_s[j].data.item())

    return all_preds_s
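# `SeqSampler` and `get_useful_end_idx` are project helpers that are not shown in this
# section. A minimal sketch of a sampler like the one used above, assuming it simply
# replays a precomputed index list in order (illustrative, not the project's class):
from torch.utils.data import Sampler

class SeqSampler(Sampler):
    """Yield a precomputed list of dataset indices in the given order."""

    def __init__(self, data_source, idx):
        self.data_source = data_source
        self.idx = idx

    def __iter__(self):
        return iter(self.idx)

    def __len__(self):
        return len(self.idx)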
def test_model(test_dataset, test_num_each):
    num_test = len(test_dataset)
    test_useful_start_idx = get_useful_start_idx(sequence_length, test_num_each)

    num_test_we_use = len(test_useful_start_idx)
    # num_test_we_use = 804
    # num_test_we_use = len(test_useful_start_idx) // (test_batch_size // sequence_length) * (
    #     test_batch_size // sequence_length)

    test_we_use_start_idx = test_useful_start_idx[0:num_test_we_use]

    test_idx = []
    for i in range(num_test_we_use):
        for j in range(sequence_length):
            test_idx.append(test_we_use_start_idx[i] + j)
    num_test_all = len(test_idx)

    print('num test start idx : {:6d}'.format(len(test_useful_start_idx)))
    print('last idx test start: {:6d}'.format(test_useful_start_idx[-1]))
    print('num of test dataset: {:6d}'.format(num_test))
    print('num of test we use : {:6d}'.format(num_test_we_use))
    print('num of all test use: {:6d}'.format(num_test_all))

    test_loader = DataLoader(test_dataset,
                             batch_size=test_batch_size,
                             sampler=test_idx,
                             num_workers=workers,
                             pin_memory=False)

    model = resnet_lstm()
    model = DataParallel(model)
    model.load_state_dict(torch.load(model_name))

    if use_gpu:
        model = model.cuda()  # multi-GPU inference should work directly here
        # model = model.module  # needs testing

    criterion = nn.CrossEntropyLoss(size_average=False)

    model.eval()

    test_loss = 0.0
    test_corrects = 0
    test_start_time = time.time()

    all_preds = []

    for data in test_loader:
        inputs, labels_1, labels_2 = data
        labels_2 = labels_2[(sequence_length - 1)::sequence_length]
        if use_gpu:
            inputs = Variable(inputs.cuda(), volatile=True)
            labels = Variable(labels_2.cuda(), volatile=True)
        else:
            inputs = Variable(inputs, volatile=True)
            labels = Variable(labels_2, volatile=True)

        if crop_type == 0 or crop_type == 1:
            outputs = model.forward(inputs)
        elif crop_type == 5:
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs = model.forward(inputs)
            outputs = outputs.view(5, -1, 7)
            outputs = torch.mean(outputs, 0)
        elif crop_type == 10:
            inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
            inputs = inputs.view(-1, 3, 224, 224)
            outputs = model.forward(inputs)
            outputs = outputs.view(10, -1, 7)
            outputs = torch.mean(outputs, 0)

        outputs = outputs[sequence_length - 1::sequence_length]
        _, preds = torch.max(outputs.data, 1)
        for i in range(len(preds)):
            all_preds.append(preds[i])
        print(len(all_preds))

        loss = criterion(outputs, labels)
        test_loss += loss.data[0]
        test_corrects += torch.sum(preds == labels.data)

    test_elapsed_time = time.time() - test_start_time
    test_accuracy = test_corrects / num_test_we_use
    test_average_loss = test_loss / num_test_we_use

    print('type of all_preds:', type(all_preds))
    print('len of all preds :', len(all_preds))

    save_test = int("{:4.0f}".format(test_accuracy * 10000))
    pred_name = model_pure_name + '_test_' + str(save_test) + '_crop_' + str(
        crop_type) + '.pkl'
    with open(pred_name, 'wb') as f:
        pickle.dump(all_preds, f)

    print('test elapsed: {:2.0f}m{:2.0f}s'
          ' test loss: {:4.4f}'
          ' test accu: {:.4f}'.format(test_elapsed_time // 60,
                                      test_elapsed_time % 60,
                                      test_average_loss, test_accuracy))
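# `get_useful_start_idx` is defined elsewhere in the project. Judging from how it is
# used above (sequence length plus per-video frame counts in, global start indices out),
# a plausible sketch is the following; the real implementation may differ:
def get_useful_start_idx(sequence_length, list_each_length):
    count = 0
    idx = []
    for length in list_each_length:
        # a window starting after (length - sequence_length) would cross into the next video
        for i in range(count, count + (length + 1 - sequence_length)):
            idx.append(i)
        count += length
    return idx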
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) #print('train_useful_start_idx ',train_useful_start_idx ) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) #print('test_useful_start_idx ', val_useful_start_idx) num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu # print('num_train_we_use',num_train_we_use) #92166 num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu # print('num_val_we_use', num_val_we_use) # num_train_we_use = 8000 # num_val_we_use = 800 train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use] # 训练数据开始位置 val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use] np.random.seed(0) np.random.shuffle(train_we_use_start_idx) # 将序列的所有元素随机排序 train_idx = [] for i in range(num_train_we_use): # 训练集帧数 for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j * srate) # 训练数据位置,每一张图是一个数据 # print('train_idx',train_idx) val_idx = [] for i in range(num_val_we_use): for j in range(sequence_length): val_idx.append(val_we_use_start_idx[i] + j * srate) # print('val_idx',val_idx) num_train_all = float(len(train_idx)) num_val_all = float(len(val_idx)) print('num of train dataset: {:6d}'.format(num_train)) print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) print('num of train we use : {:6d}'.format(num_train_we_use)) print('num of all train use: {:6d}'.format(int(num_train_all))) print('num of valid dataset: {:6d}'.format(num_val)) print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) print('num of valid we use : {:6d}'.format(num_val_we_use)) print('num of all valid use: {:6d}'.format(int(num_val_all))) val_loader = DataLoader( val_dataset, batch_size=val_batch_size, # sampler=val_idx, sampler=SeqSampler(val_dataset, val_idx), num_workers=workers, pin_memory=False ) model = res34_tcn() if use_gpu: model = model.cuda() model = DataParallel(model) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) # model.parameters()与model.state_dict()是Pytorch中用于查看网络参数的方法。前者多见于优化器的初始化,后者多见于模型的保存 best_model_wts = copy.deepcopy(model.state_dict()) best_val_accuracy = 0.0 correspond_train_acc = 0.0 record_np = np.zeros([epochs, 4]) for epoch in range(epochs): np.random.seed(epoch) np.random.shuffle(train_we_use_start_idx) # 将序列的所有元素随机排序 train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j * srate) train_loader = DataLoader( train_dataset, batch_size=train_batch_size, sampler=SeqSampler(train_dataset, train_idx), num_workers=workers, pin_memory=False ) model.train() train_loss = 0.0 train_corrects = 0 train_start_time = time.time() num = 0 train_num = 0 for data in train_loader: num = num + 1 # inputs, labels_phase = data inputs, labels_phase, kdata = data if use_gpu: inputs = Variable(inputs.cuda()) # Variable就是一个存放会变化值的地理位置,里面的值会不停发生变化 labels = Variable(labels_phase.cuda()) kdatas = Variable(kdata.cuda()) else: inputs = Variable(inputs) labels = Variable(labels_phase) kdatas = Variable(kdata) optimizer.zero_grad() # 梯度初始化为零,也就是把loss关于weight的导数变成0. 
# outputs = model.forward(inputs) # 前向传播 outputs = model.forward(inputs, kdatas) #outputs = F.softmax(outputs, dim=-1) _, preds = torch.max(outputs.data, -1) # .data 获取Variable的内部Tensor;torch.max(a,1)返回每一行中最大值的那个元素,且返回其索引 #_, yp = torch.max(y.data, 1) #print(yp) # print(yp.shape) print(num) print(preds) print(labels) loss = criterion(outputs, labels) loss.backward() optimizer.step() train_loss += loss.data train_corrects += torch.sum(preds == labels.data) train_num += labels.shape[0] print(train_corrects.cpu().numpy() / train_num) if train_corrects.cpu().numpy() / train_num > 0.75: torch.save(copy.deepcopy(model.state_dict()), 'test.pth') # .state_dict()只保存网络中的参数(速度快,占内存少) train_elapsed_time = time.time() - train_start_time #train_accuracy1 = train_corrects1.cpu().numpy() / train_num train_accuracy = train_corrects.cpu().numpy() / train_num train_average_loss = train_loss / train_num # begin eval model.eval() val_loss = 0.0 val_corrects = 0 val_num = 0 val_start_time = time.time() for data in val_loader: inputs, labels_phase, kdata = data #inputs, labels_phase = data #labels_phase = labels_phase[(sequence_length - 1)::sequence_length] #kdata = kdata[(sequence_length - 1)::sequence_length] if use_gpu: inputs = Variable(inputs.cuda()) labels = Variable(labels_phase.cuda()) kdatas = Variable(kdata.cuda()) else: inputs = Variable(inputs) labels = Variable(labels_phase) kdatas = Variable(kdata) if crop_type == 0 or crop_type == 1: #outputs = model.forward(inputs) outputs = model.forward(inputs, kdatas) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs, kdatas) # outputs = model.forward(inputs) outputs = outputs.view(5, -1, 3) outputs = torch.mean(outputs, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs, kdatas) #outputs = model.forward(inputs) outputs = outputs.view(10, -1, 3) outputs = torch.mean(outputs, 0) #outputs = outputs[sequence_length - 1::sequence_length] _, preds = torch.max(outputs.data, -1) #_, yp = torch.max(y.data, 1) print(num) print(preds) print(labels) loss = criterion(outputs, labels) #loss = 0.05 * loss1 + 0.15 * loss2 + 0.3 * loss3 + 0.5 * loss4 #loss = 0.05 * loss1 + 0.1 * loss2 + 0.25 * loss3 + 0.6 * loss4 val_loss += loss.data val_corrects += torch.sum(preds == labels.data) val_num += labels.shape[0] val_elapsed_time = time.time() - val_start_time val_accuracy = val_corrects.cpu().numpy() / val_num val_average_loss = val_loss / val_num print('epoch: {:4d}' ' train in: {:2.0f}m{:2.0f}s' ' train loss: {:4.4f}' ' train accu: {:.4f}' ' valid in: {:2.0f}m{:2.0f}s' ' valid loss: {:4.4f}' ' valid accu: {:.4f}' .format(epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_average_loss, train_accuracy, val_elapsed_time // 60, val_elapsed_time % 60, val_average_loss, val_accuracy)) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss) if val_accuracy > best_val_accuracy: best_val_accuracy = val_accuracy correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.state_dict()) if val_accuracy == best_val_accuracy: if train_accuracy > correspond_train_acc: correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.state_dict()) record_np[epoch, 0] = train_accuracy record_np[epoch, 1] = train_average_loss record_np[epoch, 2] = val_accuracy record_np[epoch, 3] = 
val_average_loss
        np.save(str(epoch) + '.npy', record_np)

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(
        best_val_accuracy, correspond_train_acc))

    save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
    save_train = int("{:4.0f}".format(correspond_train_acc * 10000))

    model_name = "tcn" \
        + "_epoch_" + str(epochs) \
        + "_length_" + str(sequence_length) \
        + "_opt_" + str(optimizer_choice) \
        + "_mulopt_" + str(multi_optim) \
        + "_flip_" + str(use_flip) \
        + "_crop_" + str(crop_type) \
        + "_batch_" + str(train_batch_size) \
        + "_train_" + str(save_train) \
        + "_val_" + str(save_val) \
        + ".pth"

    torch.save(best_model_wts, model_name)

    record_name = "tcn" \
        + "_epoch_" + str(epochs) \
        + "_length_" + str(sequence_length) \
        + "_opt_" + str(optimizer_choice) \
        + "_mulopt_" + str(multi_optim) \
        + "_flip_" + str(use_flip) \
        + "_crop_" + str(crop_type) \
        + "_batch_" + str(train_batch_size) \
        + "_train_" + str(save_train) \
        + "_val_" + str(save_val) \
        + ".npy"

    np.save(record_name, record_np)
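# The weights saved above come from a DataParallel-wrapped model, so every key in the
# state dict carries a "module." prefix. Loading them back into a DataParallel wrapper
# (as the matching test code does) works as-is; loading into a bare model needs the
# prefix stripped. Illustrative helper, assuming `res34_tcn` is importable here:
import torch

def load_checkpoint_into_bare_model(checkpoint_path):
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    # drop the "module." prefix added by DataParallel
    state_dict = {k.replace('module.', '', 1): v for k, v in state_dict.items()}
    bare_model = res34_tcn()
    bare_model.load_state_dict(state_dict)
    return bare_model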
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu # num_train_we_use = 4 # num_val_we_use = 800 train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use] val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use] train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) val_idx = [] for i in range(num_val_we_use): for j in range(sequence_length): val_idx.append(val_we_use_start_idx[i] + j) num_train_all = len(train_idx) num_val_all = len(val_idx) print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) print('num of train dataset: {:6d}'.format(num_train)) print('num of train we use : {:6d}'.format(num_train_we_use)) print('num of all train use: {:6d}'.format(num_train_all)) print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) print('num of valid dataset: {:6d}'.format(num_val)) print('num of valid we use : {:6d}'.format(num_val_we_use)) print('num of all valid use: {:6d}'.format(num_val_all)) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) val_loader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=val_idx, num_workers=workers, pin_memory=False) model = multi_lstm_4loss() sig_f = nn.Sigmoid() if use_gpu: model = model.cuda() sig_f = sig_f.cuda() model = DataParallel(model) criterion_1 = nn.BCEWithLogitsLoss(size_average=False) criterion_2 = nn.CrossEntropyLoss(size_average=False) if multi_optim == 0: if optimizer_choice == 0: optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam(model.parameters(), lr=learning_rate) elif multi_optim == 1: if optimizer_choice == 0: optimizer = optim.SGD([ { 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc_tool.parameters(), 'lr': learning_rate }, { 'params': model.module.fc_tool1.parameters(), 'lr': learning_rate }, { 'params': model.module.fc_tool2.parameters(), 'lr': learning_rate }, { 'params': model.module.fc_phase1.parameters(), 'lr': learning_rate }, { 'params': model.module.fc_phase2.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([ { 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, 
{ 'params': model.module.fc_tool.parameters(), 'lr': learning_rate }, { 'params': model.module.fc_tool1.parameters(), 'lr': learning_rate }, { 'params': model.module.fc_tool2.parameters(), 'lr': learning_rate }, { 'params': model.module.fc_phase1.parameters(), 'lr': learning_rate }, { 'params': model.module.fc_phase2.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10) best_model_wts = copy.deepcopy(model.state_dict()) best_val_accuracy_1 = 0.0 best_val_accuracy_2 = 0.0 # judge by accu2 correspond_train_acc_1 = 0.0 correspond_train_acc_2 = 0.0 record_np = np.zeros([epochs, 8]) for epoch in range(epochs): # np.random.seed(epoch) np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) model.train() train_loss_11 = 0.0 train_loss_12 = 0.0 train_loss_21 = 0.0 train_loss_22 = 0.0 train_corrects_11 = 0 train_corrects_12 = 0 train_corrects_21 = 0 train_corrects_22 = 0 train_start_time = time.time() for data in train_loader: inputs, labels_1, labels_2 = data if use_gpu: inputs = Variable(inputs.cuda()) labels_1 = Variable(labels_1.cuda()) labels_2 = Variable(labels_2.cuda()) else: inputs = Variable(inputs) labels_1 = Variable(labels_1) labels_2 = Variable(labels_2) optimizer.zero_grad() outputs_11, outputs_12, outputs_21, outputs_22 = model.forward( inputs) _, preds_12 = torch.max(outputs_12.data, 1) _, preds_22 = torch.max(outputs_22.data, 1) sig_out_11 = sig_f(outputs_11.data) sig_out_21 = sig_f(outputs_21.data) preds_11 = torch.ByteTensor(sig_out_11.cpu() > 0.5) preds_11 = preds_11.long() train_corrects_11 += torch.sum(preds_11 == labels_1.data.cpu()) preds_21 = torch.ByteTensor(sig_out_21.cpu() > 0.5) preds_21 = preds_21.long() train_corrects_21 += torch.sum(preds_21 == labels_1.data.cpu()) labels_1 = Variable(labels_1.data.float()) loss_11 = criterion_1(outputs_11, labels_1) loss_21 = criterion_1(outputs_21, labels_1) loss_12 = criterion_2(outputs_12, labels_2) loss_22 = criterion_2(outputs_22, labels_2) loss = loss_11 + loss_12 + loss_21 + loss_22 loss.backward() optimizer.step() train_loss_11 += loss_11.data[0] train_loss_12 += loss_12.data[0] train_loss_21 += loss_21.data[0] train_loss_22 += loss_22.data[0] train_corrects_12 += torch.sum(preds_12 == labels_2.data) train_corrects_22 += torch.sum(preds_22 == labels_2.data) train_elapsed_time = time.time() - train_start_time train_accuracy_11 = train_corrects_11 / num_train_all / 7 train_accuracy_21 = train_corrects_21 / num_train_all / 7 train_accuracy_12 = train_corrects_12 / num_train_all train_accuracy_22 = train_corrects_22 / num_train_all train_average_loss_11 = train_loss_11 / num_train_all / 7 train_average_loss_21 = train_loss_21 / num_train_all / 7 train_average_loss_12 = train_loss_12 / num_train_all train_average_loss_22 = train_loss_22 / num_train_all # begin eval model.eval() val_loss_11 = 0.0 val_loss_12 = 0.0 val_loss_21 = 0.0 val_loss_22 = 0.0 val_corrects_11 = 0 val_corrects_12 = 0 val_corrects_21 = 0 val_corrects_22 = 0 val_start_time = time.time() for data in val_loader: inputs, labels_1, labels_2 = data labels_2 = labels_2[(sequence_length - 1)::sequence_length] if use_gpu: inputs = Variable(inputs.cuda(), volatile=True) labels_1 = Variable(labels_1.cuda(), volatile=True) labels_2 = Variable(labels_2.cuda(), volatile=True) else: inputs = Variable(inputs, volatile=True) 
labels_1 = Variable(labels_1, volatile=True) labels_2 = Variable(labels_2, volatile=True) # if crop_type == 0 or crop_type == 1: # outputs_1, outputs_2 = model.forward(inputs) # elif crop_type == 5: # inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() # inputs = inputs.view(-1, 3, 224, 224) # outputs_1, outputs_2 = model.forward(inputs) # outputs_1 = outputs_1.view(5, -1, 7) # outputs_1 = torch.mean(outputs_1, 0) # outputs_2 = outputs_2.view(5, -1, 7) # outputs_2 = torch.mean(outputs_2, 0) # elif crop_type == 10: # inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() # inputs = inputs.view(-1, 3, 224, 224) # outputs_1, outputs_2 = model.forward(inputs) # outputs_1 = outputs_1.view(10, -1, 7) # outputs_1 = torch.mean(outputs_1, 0) # outputs_2 = outputs_2.view(10, -1, 7) # outputs_2 = torch.mean(outputs_2, 0) outputs_11, outputs_12, outputs_21, outputs_22 = model.forward( inputs) outputs_12 = outputs_12[sequence_length - 1::sequence_length] outputs_22 = outputs_22[sequence_length - 1::sequence_length] _, preds_12 = torch.max(outputs_12.data, 1) _, preds_22 = torch.max(outputs_22.data, 1) sig_out_11 = sig_f(outputs_11.data) sig_out_21 = sig_f(outputs_21.data) preds_11 = torch.ByteTensor(sig_out_11.cpu() > 0.5) preds_11 = preds_11.long() train_corrects_11 += torch.sum(preds_11 == labels_1.data.cpu()) preds_21 = torch.ByteTensor(sig_out_21.cpu() > 0.5) preds_21 = preds_21.long() train_corrects_21 += torch.sum(preds_21 == labels_1.data.cpu()) labels_1 = Variable(labels_1.data.float()) loss_11 = criterion_1(outputs_11, labels_1) loss_21 = criterion_1(outputs_21, labels_1) loss_12 = criterion_2(outputs_12, labels_2) loss_22 = criterion_2(outputs_22, labels_2) val_loss_11 += loss_11.data[0] val_loss_12 += loss_12.data[0] val_loss_21 += loss_21.data[0] val_loss_22 += loss_22.data[0] val_corrects_12 += torch.sum(preds_12 == labels_2.data) val_corrects_22 += torch.sum(preds_22 == labels_2.data) val_elapsed_time = time.time() - val_start_time val_accuracy_11 = val_corrects_11 / num_val_all / 7 val_accuracy_21 = val_corrects_21 / num_val_all / 7 val_accuracy_12 = val_corrects_12 / num_val_we_use val_accuracy_22 = val_corrects_22 / num_val_we_use val_average_loss_11 = val_loss_11 / num_val_all / 7 val_average_loss_21 = val_loss_21 / num_val_all / 7 val_average_loss_12 = val_loss_12 / num_val_we_use val_average_loss_22 = val_loss_22 / num_val_we_use print('epoch: {:4d}' ' train time: {:2.0f}m{:2.0f}s' ' train accu_11: {:.4f}' ' train accu_21: {:.4f}' ' valid time: {:2.0f}m{:2.0f}s' ' valid accu_11: {:.4f}' ' valid accu_21: {:.4f}'.format( epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_accuracy_11, train_accuracy_21, val_elapsed_time // 60, val_elapsed_time % 60, val_accuracy_11, val_accuracy_21)) print('epoch: {:4d}' ' train time: {:2.0f}m{:2.0f}s' ' train accu_12: {:.4f}' ' train accu_22: {:.4f}' ' valid time: {:2.0f}m{:2.0f}s' ' valid accu_12: {:.4f}' ' valid accu_22: {:.4f}'.format( epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_accuracy_12, train_accuracy_22, val_elapsed_time // 60, val_elapsed_time % 60, val_accuracy_12, val_accuracy_22)) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss_11 + val_average_loss_12 + val_average_loss_21 + val_average_loss_22)
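# The tool head trained above is multi-label: each of the 7 tool logits goes through a
# sigmoid and is thresholded at 0.5, and correctness is counted element-wise (hence the
# later division by 7). A self-contained restatement of that accuracy computation:
import torch

def multilabel_accuracy(logits, targets, threshold=0.5):
    """logits, targets: tensors of shape [N, num_tools]; targets hold 0/1 labels."""
    preds = (torch.sigmoid(logits) > threshold).long()
    correct = (preds == targets.long()).sum().item()
    return correct / targets.numel()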
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config', default='config/config.json', type=str,
                        required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='vocab/vocab.txt', type=str,
                        required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.txt', type=str,
                        required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str,
                        required=False, help='where the tokenized corpus is stored')
    parser.add_argument('--epochs', default=1, type=int, required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False,
                        help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False,
                        help='number of warm-up steps')
    parser.add_argument('--log_step', default=1, type=int, required=False,
                        help='report loss every this many steps; '
                             'set it to a multiple of gradient_accumulation')
    parser.add_argument('--stride', default=768, type=int, required=False,
                        help='window stride used when slicing the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False,
                        help='minimum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False,
                        help='model output directory')
    parser.add_argument('--pretrained_model', default='model/', type=str, required=False,
                        help='directory of the model to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str,
                        required=False, help='TensorBoard log directory')
    parser.add_argument('--segment', action='store_true',
                        help='tokenize Chinese at word level')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json",
                        type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str,
                        help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point:start_point + n_ctx])
                start_point += stride
            if start_point < len(tokens):
                samples.append(tokens[len(tokens) - n_ctx:])
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                # prepare data
                batch = samples[step * batch_size:(step + 1) * batch_size]
                batch_inputs = []
                for ids in batch:
                    int_ids = [int(x) for x in ids]
                    batch_inputs.append(int_ids)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass (must run with gradients enabled; wrapping it in
                # torch.no_grad() and calling loss.requires_grad_(True) afterwards
                # would leave the model parameters without gradients)
                outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (overall_step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss',
                                         loss.item() * gradient_accumulation,
                                         overall_step)
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        step + 1,
                        piece_num,
                        epoch + 1,
                        running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                    running_loss = 0
                overall_step += 1
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
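# The loop above expects `tokenized_train_{i}.txt` pieces that contain nothing but
# space-separated token ids. The project builds them with its own build_files(); purely
# to illustrate the expected file format, one piece could be written like this
# (the function name and arguments are hypothetical):
def write_tokenized_piece(lines, full_tokenizer, out_path, min_length=128):
    ids = []
    for line in lines:
        if len(line) < min_length:
            continue
        tokens = full_tokenizer.tokenize(line)
        ids.extend(full_tokenizer.convert_tokens_to_ids(tokens))
    with open(out_path, 'w') as f:
        f.write(' '.join(str(i) for i in ids))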
def test_model(test_dataset, test_num_each): num_test = len(test_dataset) test_count = 0 for i in range(len(test_num_each)): test_count += test_num_each[i] test_useful_start_idx = get_useful_start_idx(sequence_length, test_num_each) num_test_we_use = len(test_useful_start_idx) # 其实需要除以gpu个数再乘以gpu个数,但是为了保证所有都测试到,尽量保证test个数完整 # num_test_we_use = 804 test_we_use_start_idx = test_useful_start_idx[0:num_test_we_use] test_idx = [] for i in range(num_test_we_use): for j in range(sequence_length): test_idx.append(test_we_use_start_idx[i] + j) num_test_all = len(test_idx) print('num test start idx : {:6d}'.format(len(test_useful_start_idx))) print('last idx test start: {:6d}'.format(test_useful_start_idx[-1])) print('num of test dataset: {:6d}'.format(num_test)) print('num of test we use : {:6d}'.format(num_test_we_use)) print('num of all test use: {:6d}'.format(num_test_all)) test_loader = DataLoader( test_dataset, batch_size=test_batch_size, sampler=test_idx, num_workers=1, pin_memory=False ) model = multi_lstm() model = DataParallel(model) model.load_state_dict(torch.load(model_name)) # model = model.module # model = DataParallel(model) if use_gpu: model = model.cuda() # model = DataParallel(model) # model = model.module criterion_1 = nn.BCEWithLogitsLoss(size_average=False) criterion_2 = nn.CrossEntropyLoss(size_average=False) sig_f = nn.Sigmoid() model.eval() test_loss_1 = 0.0 test_loss_2 = 0.0 test_corrects_2 = 0 test_start_time = time.time() all_preds_1 = [] all_labels_1 = [] all_preds_2 = [] for data in test_loader: inputs, labels_1, labels_2 = data # labels_1 = labels_1[(sequence_length - 1)::sequence_length] #labels_2 = labels_2[(sequence_length - 1)::sequence_length] if use_gpu: inputs = Variable(inputs.cuda(), volatile=True) labels_1 = Variable(labels_1.cuda(), volatile=True) #labels_2 = Variable(labels_2.cuda(), volatile=True) else: inputs = Variable(inputs, volatile=True) labels_1 = Variable(labels_1, volatile=True) #labels_2 = Variable(labels_2, voatile=True) if crop_type == 0 or crop_type == 1: outputs_1 = model.forward(inputs) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs_1 = model.forward(inputs) outputs_1 = outputs_1.view(5, -1, 7) outputs_1 = torch.mean(outputs_1, 0) #outputs_2 = outputs_2.view(5, -1, 7) #outputs_2 = torch.mean(outputs_2, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs_1 = model.forward(inputs) outputs_1 = outputs_1.view(10, -1, 7) outputs_1 = torch.mean(outputs_1, 0) #outputs_2 = outputs_2.view(10, -1, 7) #outputs_2 = torch.mean(outputs_2, 0) # outputs_1 = outputs_1[sequence_length-1::sequence_length] #outputs_2 = outputs_2[sequence_length-1::sequence_length] #_, preds_2 = torch.max(outputs_2.data, 1) for i in range(len(outputs_1)): all_preds_1.append(outputs_1[i].data.cpu().numpy().tolist()) all_labels_1.append(labels_1[i].data.cpu().numpy().tolist()) # for i in range(len(preds_2)): # all_preds_2.append(preds_2[i]) print('preds_1: {:6d}'.format(len(all_preds_1))) # labels_1 = Variable(labels_1.data.float()) # loss_1 = criterion_1(outputs_1, labels_1) # test_loss_1 += loss_1.data[0] #loss_2 = criterion_2(outputs_2, labels_2) #test_loss_2 += loss_2.data[0] #test_corrects_2 += torch.sum(preds_2 == labels_2.data) all_preds_1_cor = [] all_labels_1_cor = [] cor_count = 0 for i in range(len(test_num_each)): for j in range(cor_count, cor_count + test_num_each[i] - (sequence_length - 1)): if j==cor_count: for k in 
range(sequence_length - 1):
                    all_preds_1_cor.append(all_preds_1[sequence_length * j + k])
                    all_labels_1_cor.append(all_labels_1[sequence_length * j + k])
            all_preds_1_cor.append(all_preds_1[sequence_length * j + sequence_length - 1])
            all_labels_1_cor.append(all_labels_1[sequence_length * j + sequence_length - 1])
        cor_count += test_num_each[i] + 1 - sequence_length

    print('all_preds_1 : {:6d}'.format(len(all_preds_1)))
    print('all_labels_1: {:6d}'.format(len(all_labels_1)))
    print('cor_preds_1 : {:6d}'.format(len(all_preds_1_cor)))
    print('cor_labels_1: {:6d}'.format(len(all_labels_1_cor)))

    pt_preds_1 = torch.from_numpy(np.asarray(all_preds_1_cor, dtype=np.float32))
    pt_labels_1 = torch.from_numpy(np.asarray(all_labels_1_cor, dtype=np.float32))
    pt_labels_1 = Variable(pt_labels_1, requires_grad=False)
    pt_preds_1 = Variable(pt_preds_1, requires_grad=False)

    loss_1 = criterion_1(pt_preds_1, pt_labels_1)
    test_loss_1 += loss_1.data[0]

    pt_labels_1 = pt_labels_1.data
    pt_preds_1 = pt_preds_1.data
    sig_out = sig_f(pt_preds_1)
    preds_cor = torch.ByteTensor(sig_out > 0.5)
    preds_cor = preds_cor.long()
    pt_labels_1 = pt_labels_1.long()
    test_corrects_1 = torch.sum(preds_cor == pt_labels_1)

    test_elapsed_time = time.time() - test_start_time
    test_accuracy_1 = test_corrects_1 / num_test / 7
    # test_accuracy_2 = test_corrects_2 / num_test_we_use
    test_average_loss_1 = test_loss_1 / num_test / 7
    # test_average_loss_2 = test_loss_2 / num_test_we_use

    print('preds_1 num: {:6d}'.format(len(all_preds_1_cor)))

    save_test1 = int("{:4.0f}".format(test_accuracy_1 * 10000))
    # save_test2 = int("{:4.0f}".format(test_accuracy_2 * 10000))

    pred_1_name = model_pure_name + '_test1_' + str(save_test1) + '_crop_' + str(crop_type) + '.pkl'
    # pred_2_name = model_pure_name + '_test2_' + str(save_test2) + '_crop_' + str(crop_type) + '.pkl'

    with open(pred_1_name, 'wb') as f:
        pickle.dump(all_preds_1_cor, f)
    # with open(pred_2_name, 'wb') as f:
    #     pickle.dump(all_preds_2, f)

    print('test completed in:'
          ' {:2.0f}m{:2.0f}s'
          ' test loss_1: {:4.4f}'
          ' test accu_1: {:.4f}'
          .format(test_elapsed_time // 60,
                  test_elapsed_time % 60,
                  test_average_loss_1,
                  test_accuracy_1))
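# The reconstruction loop above can be read as: for each video, keep all
# `sequence_length` outputs of its first window, then only the last output of every
# later window, so each frame ends up with exactly one prediction. A compact
# restatement of that idea (illustrative, not the project's code):
def flatten_window_outputs(window_outputs, num_windows_per_video, sequence_length):
    """window_outputs is laid out window-major: total_windows * sequence_length entries."""
    out, pos = [], 0
    for n_windows in num_windows_per_video:
        for w in range(n_windows):
            base = (pos + w) * sequence_length
            if w == 0:
                out.extend(window_outputs[base:base + sequence_length])
            else:
                out.append(window_outputs[base + sequence_length - 1])
        pos += n_windows
    return out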
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False,
                        help='cuda visible devices')
    parser.add_argument('--model_config', default='config/model_config.json', type=str,
                        required=False, help='path of the model configuration file')
    parser.add_argument('--tokenizer_path', default='data/vocabs.txt', type=str,
                        required=False, help='path of the vocabulary file')
    parser.add_argument('--raw_data_path', default='data/samples.json', type=str,
                        required=False, help='path of the samples file')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str,
                        required=False, help='save the tokenized samples file to this dir')
    parser.add_argument('--raw', action='store_true',
                        help='tokenize before training; not needed if already tokenized '
                             'with the same configuration')
    parser.add_argument('--epochs', default=24, type=int, required=False)
    parser.add_argument('--batch_size', default=16, type=int, required=False)
    parser.add_argument('--lr', default=2e-4, type=float, required=False)
    parser.add_argument('--warmup_steps', default=4000, type=int, required=False)
    parser.add_argument('--log_step', default=4000, type=int, required=False,
                        help='period of reporting loss')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--output_dir', default='model/', type=str, required=False,
                        help='save the model to this dir')
    parser.add_argument('--pretrained_model', default='', type=str, required=False,
                        help='pre-trained model dir')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(
        vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999

    if torch.cuda.is_available():
        device = 'cuda'
        print(torch.cuda.get_device_name(0))
    else:
        device = 'cpu'
    print(device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    full_tokenizer=full_tokenizer,
                    n_ctx=n_ctx)
        print('files built')

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / n_ctx * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = transformers.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=total_steps)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        device_ids = []
        for i in args.device.split(','):
            try:
                print(torch.cuda.get_device_name(int(i)))
                device_ids.append(int(i))
            except:
                pass
        model = DataParallel(model, device_ids=device_ids)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0

    # the original snippet wrote to `tb_writer` without ever creating it, which would
    # raise a NameError at the first logging step; create a writer with a default log dir
    from torch.utils.tensorboard import SummaryWriter
    tb_writer = SummaryWriter()

    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(0), 'r') as f:
        line = f.read().strip()
    tokens = line.split()
    tokens = [int(token) for token in tokens]
    start_point = 0
    samples = []
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += n_ctx
    if start_point < len(tokens):
        samples.append(tokens[len(tokens) - n_ctx:])

    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        samples2 = copy.deepcopy(samples)
        random.shuffle(samples2)
        for step in range(len(samples2) // batch_size):  # drop last

            # prepare data
            batch = samples2[step * batch_size:(step + 1) * batch_size]
            batch_inputs = torch.tensor(batch).long().to(device)

            # forward pass
            outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
            loss, logits = outputs[:2]

            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            # loss backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # optimizer step
            if (overall_step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            if (overall_step + 1) % log_step == 0:
                tb_writer.add_scalar('loss',
                                     loss.item() * gradient_accumulation,
                                     overall_step)
                print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute,
                    step + 1,
                    epoch + 1,
                    running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                running_loss = 0
            overall_step += 1

        print('saving model for epoch {}'.format(epoch + 1))
        temp_epoch = (epoch + 1) % 2  # save disk space
        if not os.path.exists(output_dir + 'model_epoch{}'.format(temp_epoch)):
            os.mkdir(output_dir + 'model_epoch{}'.format(temp_epoch))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(temp_epoch))
        # torch.save(scheduler, output_dir + 'model_epoch{}/scheduler.pt'.format(temp_epoch))
        # torch.save(optimizer, output_dir + 'model_epoch{}/optimizer.pt'.format(temp_epoch))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
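# Both GPT-2 training loops above slice the token stream into fixed-length windows; the
# first script uses an overlapping stride, this one a stride equal to n_ctx. The same
# logic as a small helper (illustrative only):
def chunk_tokens(tokens, n_ctx, stride):
    samples = []
    start = 0
    while start < len(tokens) - n_ctx:
        samples.append(tokens[start:start + n_ctx])
        start += stride
    if start < len(tokens):
        samples.append(tokens[-n_ctx:])
    return samples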
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu # num_train_we_use = 8000 # num_val_we_use = 800 train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use] val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use] # np.random.seed(0) # np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) val_idx = [] for i in range(num_val_we_use): for j in range(sequence_length): val_idx.append(val_we_use_start_idx[i] + j) num_train_all = len(train_idx) num_val_all = len(val_idx) print('num of train dataset: {:6d}'.format(num_train)) print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) print('num of train we use : {:6d}'.format(num_train_we_use)) print('num of all train use: {:6d}'.format(num_train_all)) print('num of valid dataset: {:6d}'.format(num_val)) print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) print('num of valid we use : {:6d}'.format(num_val_we_use)) print('num of all valid use: {:6d}'.format(num_val_all)) val_loader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=SeqSampler(val_dataset, val_idx), num_workers=workers, pin_memory=False) model = resnet_lstm() if use_gpu: model = DataParallel(model) model.to(device) criterion = nn.CrossEntropyLoss(size_average=False) optimizer = None exp_lr_scheduler = None if multi_optim == 0: if optimizer_choice == 0: optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam(model.parameters(), lr=learning_rate) elif multi_optim == 1: if optimizer_choice == 0: optimizer = optim.SGD([ { 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([ { 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10) best_model_wts = copy.deepcopy(model.module.state_dict()) best_val_accuracy = 0.0 correspond_train_acc = 0.0 best_epoch = 0 record_np = np.zeros([epochs, 4]) for epoch in range(epochs): # np.random.seed(epoch) np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): 
for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=SeqSampler(train_dataset, train_idx), num_workers=workers, pin_memory=False) # Sets the module in training mode. model.train() train_loss = 0.0 train_corrects = 0 batch_progress = 0.0 train_start_time = time.time() for data in train_loader: optimizer.zero_grad() # 释放显存 torch.cuda.empty_cache() if use_gpu: inputs, labels = data[0].to(device), data[1].to(device) labels = labels[(sequence_length - 1)::sequence_length] else: inputs, labels = data[0], data[1] labels = labels[(sequence_length - 1)::sequence_length] inputs = inputs.view(-1, sequence_length, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs[sequence_length - 1::sequence_length] _, preds = torch.max(outputs.data, 1) loss = criterion(outputs, labels) loss.backward() optimizer.step() train_loss += loss.data.item() batch_corrects = torch.sum(preds == labels.data) train_corrects += batch_corrects batch_acc = float( batch_corrects) / train_batch_size * sequence_length batch_progress += 1 if batch_progress * train_batch_size >= num_train_all: percent = 100.0 print('Batch progress: %s [%d/%d] Batch acc:%.2f' % (str(percent) + '%', num_train_all, num_train_all, batch_acc), end='\n') else: percent = round( batch_progress * train_batch_size / num_train_all * 100, 2) print('Batch progress: %s [%d/%d] Batch acc:%.2f' % (str(percent) + '%', batch_progress * train_batch_size, num_train_all, batch_acc), end='\r') train_elapsed_time = time.time() - train_start_time train_accuracy = float(train_corrects) / float( num_train_all) * sequence_length train_average_loss = train_loss / num_train_all * sequence_length # Sets the module in evaluation mode. 
model.eval() val_loss = 0.0 val_corrects = 0 val_start_time = time.time() val_progress = 0 with torch.no_grad(): for data in val_loader: # 释放显存 torch.cuda.empty_cache() if use_gpu: inputs, labels = data[0].to(device), data[1].to(device) labels = labels[(sequence_length - 1)::sequence_length] else: inputs, labels = data[0], data[1] labels = labels[(sequence_length - 1)::sequence_length] if crop_type == 0 or crop_type == 1: inputs = inputs.view(-1, sequence_length, 3, 224, 224) outputs = model.forward(inputs) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(5, -1, 7) outputs = torch.mean(outputs, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(10, -1, 7) outputs = torch.mean(outputs, 0) outputs = outputs[sequence_length - 1::sequence_length] _, preds = torch.max(outputs.data, 1) loss = criterion(outputs, labels) val_loss += loss.data.item() val_corrects += torch.sum(preds == labels.data) val_progress += 1 if val_progress * val_batch_size >= num_val_all: percent = 100.0 print('Val progress: %s [%d/%d]' % (str(percent) + '%', num_val_all, num_val_all), end='\n') else: percent = round( val_progress * val_batch_size / num_val_all * 100, 2) print('Val progress: %s [%d/%d]' % (str(percent) + '%', val_progress * val_batch_size, num_val_all), end='\r') val_elapsed_time = time.time() - val_start_time val_accuracy = float(val_corrects) / float(num_val_we_use) val_average_loss = val_loss / num_val_we_use print('epoch: {:4d}' ' train in: {:2.0f}m{:2.0f}s' ' train loss: {:4.4f}' ' train accu: {:.4f}' ' valid in: {:2.0f}m{:2.0f}s' ' valid loss: {:4.4f}' ' valid accu: {:.4f}'.format(epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_average_loss, train_accuracy, val_elapsed_time // 60, val_elapsed_time % 60, val_average_loss, val_accuracy)) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss) if val_accuracy > best_val_accuracy: best_val_accuracy = val_accuracy correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.module.state_dict()) best_epoch = epoch if val_accuracy == best_val_accuracy: if train_accuracy > correspond_train_acc: correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.module.state_dict()) best_epoch = epoch record_np[epoch, 0] = train_accuracy record_np[epoch, 1] = train_average_loss record_np[epoch, 2] = val_accuracy record_np[epoch, 3] = val_average_loss save_val = int("{:4.0f}".format(best_val_accuracy * 10000)) save_train = int("{:4.0f}".format(correspond_train_acc * 10000)) model_name = "lstm" \ + "_epoch_" + str(best_epoch) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train_" + str(save_train) \ + "_val_" + str(save_val) \ + ".pth" torch.save(best_model_wts, model_name) print("best_epoch", str(best_epoch)) record_name = "lstm" \ + "_epoch_" + str(best_epoch) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train_" + str(save_train) \ + "_val_" + str(save_val) \ + ".npy" np.save(record_name, 
record_np)

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(
        best_val_accuracy, correspond_train_acc))
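# In the loops above, `labels[(sequence_length - 1)::sequence_length]` keeps one label
# per window: the label of the window's last frame. Tiny illustration (assumes
# sequence_length == 4 and a batch of three windows):
import torch

def _label_striding_example(sequence_length=4):
    labels = torch.arange(12)                                # three windows of four frames
    return labels[(sequence_length - 1)::sequence_length]    # tensor([ 3,  7, 11])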
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_idx = [i for i in range(num_train)] np.random.seed(0) np.random.shuffle(train_idx) val_idx = [i for i in range(num_val)] print('num of train dataset: {:6d}'.format(num_train)) print('num of valid dataset: {:6d}'.format(num_val)) train_loader = DataLoader( train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False ) val_loader = DataLoader( val_dataset, batch_size=val_batch_size, sampler=val_idx, num_workers=workers, pin_memory=False ) # model = models.resnet50(pretrained=True) # num_ftrs = model.fc.in_features # model.fc = nn.Linear(num_ftrs, 7) model = pure_resnet() if use_gpu: model = model.cuda() model = DataParallel(model) criterion = nn.CrossEntropyLoss(size_average=False) if multi_optim == 0: if optimizer_choice == 0: optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9) exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam(model.parameters()) elif multi_optim == 1: if optimizer_choice == 0: optimizer = optim.SGD([ {'params': model.module.share.parameters()}, {'params': model.module.fc1.parameters(), 'lr': 1e-3}, ], lr=1e-4, momentum=0.9) exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([ {'params': model.module.share.parameters()}, {'params': model.module.fc1.parameters(), 'lr': 1e-3}, ], lr=1e-4) best_model_wts = copy.deepcopy(model.state_dict()) best_val_accuracy = 0.0 correspond_train_acc = 0.0 all_info = [] all_train_accuracy = [] all_train_loss = [] all_val_accuracy = [] all_val_loss = [] for epoch in range(epochs): train_idx = [i for i in range(num_train)] np.random.seed(0) np.random.shuffle(train_idx) train_loader = DataLoader( train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False ) model.train() train_loss = 0.0 train_corrects = 0 train_start_time = time.time() for data in train_loader: inputs, labels_1, labels_2 = data if use_gpu: inputs = Variable(inputs.cuda()) labels = Variable(labels_2.cuda()) else: inputs = Variable(inputs) labels = Variable(labels_2) optimizer.zero_grad() # 如果optimizer(net.parameters()), 那么效果和net.zero_grad()一样 outputs = model.forward(inputs) _, preds = torch.max(outputs.data, 1) loss = criterion(outputs, labels) # print(loss) loss.backward() # count +=1 optimizer.step() train_loss += loss.data[0] train_corrects += torch.sum(preds == labels.data) # print(train_corrects) train_elapsed_time = time.time() - train_start_time train_accuracy = train_corrects / num_train train_average_loss = train_loss / num_train model.eval() val_loss = 0.0 val_corrects = 0 val_start_time = time.time() for data in val_loader: inputs, labels_1, labels_2 = data if use_gpu: inputs = Variable(inputs.cuda()) labels = Variable(labels_2.cuda()) else: inputs = Variable(inputs) labels = Variable(labels_2) outputs = model.forward(inputs) _, preds = torch.max(outputs.data, 1) loss = criterion(outputs, labels) val_loss += loss.data[0] val_corrects += torch.sum(preds == labels.data) # print(val_corrects) val_elapsed_time = time.time() - val_start_time val_accuracy = val_corrects / num_val val_average_loss = val_loss / num_val print('epoch: {:4d}' ' train in: {:2.0f}m{:2.0f}s' ' train loss: {:4.4f}' ' train accu: {:.4f}' ' valid in: {:2.0f}m{:2.0f}s' ' valid loss: {:4.4f}' ' valid accu: {:.4f}' .format(epoch, 
train_elapsed_time // 60, train_elapsed_time % 60, train_average_loss, train_accuracy, val_elapsed_time // 60, val_elapsed_time % 60, val_average_loss, val_accuracy)) if optimizer_choice == 0: exp_lr_scheduler.step(val_average_loss) if val_accuracy > best_val_accuracy: best_val_accuracy = val_accuracy correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.state_dict()) elif val_accuracy == best_val_accuracy: if train_accuracy > correspond_train_acc: correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.state_dict()) all_train_loss.append(train_average_loss) all_train_accuracy.append(train_accuracy) all_val_loss.append(val_average_loss) all_val_accuracy.append(val_accuracy) print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(best_val_accuracy, correspond_train_acc)) save_val = int("{:4.0f}".format(best_val_accuracy * 10000)) save_train = int("{:4.0f}".format(correspond_train_acc * 10000)) model_name = "phase" \ + "_epoch_" + str(epochs) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train_" + str(save_train) \ + "_val_" + str(save_val) \ + ".pth" torch.save(best_model_wts, model_name) all_info.append(all_train_accuracy) all_info.append(all_train_loss) all_info.append(all_val_accuracy) all_info.append(all_val_loss) record_name = "phase" \ + "_epoch_" + str(epochs) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train_" + str(save_train) \ + "_val_" + str(save_val) \ + ".pkl" with open(record_name, 'wb') as f: pickle.dump(all_info, f) print()
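# `pure_resnet` is not defined in this section. Judging from the commented-out lines
# above (resnet50 with its classifier replaced) and the optimizer groups that reference
# `.share` and `.fc1`, a minimal guess could look like the sketch below; the real class
# may well differ:
import torch.nn as nn
import torchvision.models as models

class pure_resnet(nn.Module):
    def __init__(self, num_classes=7):
        super().__init__()
        resnet = models.resnet50(pretrained=True)
        # everything up to and including the global average pool
        self.share = nn.Sequential(*list(resnet.children())[:-1])
        self.fc1 = nn.Linear(resnet.fc.in_features, num_classes)

    def forward(self, x):
        x = self.share(x)
        x = x.view(x.size(0), -1)
        return self.fc1(x)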
def test_model(test_dataset, test_num_each): num_test = len(test_dataset) test_useful_start_idx = get_useful_start_idx(sequence_length, test_num_each) num_test_we_use = len(test_useful_start_idx) test_we_use_start_idx = test_useful_start_idx[0:num_test_we_use] test_idx = [] for i in range(num_test_we_use): for j in range(sequence_length): test_idx.append(test_we_use_start_idx[i] + j) num_test_all = len(test_idx) print('num test start idx : {:6d}'.format(len(test_useful_start_idx))) print('last idx test start: {:6d}'.format(test_useful_start_idx[-1])) print('num of test dataset: {:6d}'.format(num_test)) print('num of test we use : {:6d}'.format(num_test_we_use)) print('num of all test use: {:6d}'.format(num_test_all)) # TODO sampler test_loader = DataLoader(test_dataset, batch_size=test_batch_size, sampler=SeqSampler(test_dataset, test_idx), num_workers=workers) model = resnet_lstm() print(model) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model.load_state_dict(torch.load(model_name)) model = DataParallel(model) if use_gpu: model.to(device) # 应该可以直接多gpu计算 # model = model.module #要测试一下 criterion = nn.CrossEntropyLoss(size_average=False) model.eval() test_loss = 0.0 test_corrects = 0 test_start_time = time.time() all_preds = [] all_preds_score = [] with torch.no_grad(): for data in test_loader: # 释放显存 torch.cuda.empty_cache() if use_gpu: inputs, labels = data[0].to(device), data[1].to(device) labels = labels[(sequence_length - 1)::sequence_length] else: inputs, labels = data labels = labels[(sequence_length - 1)::sequence_length] inputs = inputs.view(-1, sequence_length, 3, 224, 224) if crop_type == 0 or crop_type == 1 or crop_type == 2 or crop_type == 3: outputs = model.forward(inputs) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(5, -1, 7) outputs = torch.mean(outputs, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(10, -1, 7) outputs = torch.mean(outputs, 0) outputs = outputs[sequence_length - 1::sequence_length] Sm = nn.Softmax() outputs = Sm(outputs) possibility, preds = torch.max(outputs.data, 1) print("possibility:", possibility) for i in range(len(preds)): all_preds.append(preds[i]) for i in range(len(possibility)): all_preds_score.append(possibility[i]) print("all_preds length:", len(all_preds)) print("all_preds_score length:", len(all_preds_score)) loss = criterion(outputs, labels) # TODO 和batchsize相关 # test_loss += loss.data[0]/test_loss += loss.data.item() print("preds:", preds.data.cpu()) print("labels:", labels.data.cpu()) test_loss += loss.data.item() test_corrects += torch.sum(preds == labels.data) print("test_corrects:", test_corrects) test_elapsed_time = time.time() - test_start_time test_accuracy = float(test_corrects) / float(num_test_we_use) test_average_loss = test_loss / num_test_we_use print('type of all_preds:', type(all_preds)) print('leng of all preds:', len(all_preds)) save_test = int("{:4.0f}".format(test_accuracy * 10000)) pred_name = model_pure_name + '_test_' + str(save_test) + '_crop_' + str( crop_type) + '.pkl' pred_score_name = model_pure_name + '_test_' + str( save_test) + '_crop_' + str(crop_type) + '_score' + '.pkl' with open(pred_name, 'wb') as f: pickle.dump(all_preds, f) with open(pred_score_name, 'wb') as f: pickle.dump(all_preds_score, f) print('test elapsed: 
{:2.0f}m{:2.0f}s' ' test loss: {:4.4f}' ' test accu: {:.4f}'.format(test_elapsed_time // 60, test_elapsed_time % 60, test_average_loss, test_accuracy))
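# The test routine above keeps only the last frame of every length-`sequence_length`
# clip by slicing predictions and labels with `[(sequence_length - 1)::sequence_length]`.
# A tiny self-contained illustration of that stride slice:
import torch

sequence_length = 4
# 3 clips of 4 frames each, flattened into one batch of 12 frame labels
labels = torch.arange(12)
last_frame_labels = labels[(sequence_length - 1)::sequence_length]
print(last_frame_labels)  # tensor([ 3,  7, 11]) -> one label per clip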
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): if if_load_old == True: pdb.set_trace() print("please choose the previous one") time_cur = '1586310709.4848218' else: time_cur = time.time() writer = SummaryWriter(summary_dir + str(time_cur)) logger = utils.get_log('log/' + str(time_cur) + '.txt') # num_train = len(train_dataset) # num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) # train_idx = [] # for i in range(num_train_we_use): # for j in range(sequence_length): # train_idx.append(train_we_use_start_idx[i] + j) val_idx = [] for i in range(len(val_useful_start_idx)): for j in range(sequence_length): val_idx.append(val_useful_start_idx[i] + j) # num_train_all = len(train_idx) num_val_all = len(val_idx) # print('num of train dataset: {:6d}'.format(num_train)) # print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) # print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) # print('num of train we use : {:6d}'.format(num_train_we_use)) # print('num of all train use: {:6d}'.format(num_train_all)) # print('num of valid dataset: {:6d}'.format(num_val)) # print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) # print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) # print('num of valid we use : {:6d}'.format(num_val_we_use)) # print('num of all valid use: {:6d}'.format(num_val_all)) val_loader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=SeqSampler(val_dataset, val_idx), num_workers=workers, pin_memory=True) #select data to train X = train_useful_start_idx select_num = math.floor(len(X) * quary_portion) #every time choose 10% if is_first_selection is True: pdb.set_trace() print("this is first selectin!!!! 
please check your parameter in .sh") import random mask = [1 for n in range(0, len(X))] selected = random.sample(X, select_num) for i in range(len(X)): if X[i] in selected: mask[i] = 0 unselected = [X[i] for i in range(len(X)) if X[i] not in selected] save_select_data(save_select_txt_path, selected, unselected, mask, time_cur) else: # load_select_data return: data['selected'],data['unselected'],data['mask'] selected, unselected, mask = load_select_data( os.path.join(save_select_txt_path, json_name)) if select_chose == 'non_local': print("this is non_local select") test_idx = [] for i in range(len(unselected)): for j in range(sequence_length): test_idx.append(unselected[i] + j) num_test_all = len(test_idx) subset = Subset(train_dataset, test_idx) selected, unselected, mask = non_local_select( val_model_path, subset, sequence_length, X, select_num, selected, unselected, mask) elif select_chose == 'DBN': print("this is DBN select") test_idx = [] for i in range(len(unselected)): for j in range(sequence_length): test_idx.append(unselected[i] + j) num_test_all = len(test_idx) subset = Subset(train_dataset, test_idx) selected, unselected, mask = DBN_select(val_model_path, subset, sequence_length, X, select_num, selected, unselected, mask) elif select_chose == 'random': print("this is random select") test_idx = [] for i in range(len(unselected)): for j in range(sequence_length): test_idx.append(unselected[i] + j) num_test_all = len(test_idx) selected, unselected, mask = random_select_data( X, select_num, selected, unselected, mask) pdb.set_trace() selected = [ selected[i] for i in range(len(selected)) if selected[i] in test_idx ] else: print( "just using old load select data to train without select new data" ) # pdb.set_trace() if is_save_json is True: save_select_data(save_select_txt_path, selected, unselected, mask, time_cur) pdb.set_trace() # save_dir = save_dir_base + '/' + str(time_cur) + '_' + str(learning_rate) + '_tbs' + str(train_batch_size) \ # + '_seq' + str(sequence_length) + '_opt' + str(optimizer_choice) + '_crop' + str(crop_type) + '_adjlr' \ # + '_adamgamma' + str(adamgamma) + '_adamstep' + str(adam_step) + '_weight_decay' + str(adamweightdecay) + '_block_num' + str(block_num) if train_mode == 'RESLSTM' or train_mode == 'RESLSTM_DBN': save_dir = save_dir_base + '/' + str(train_mode) + '/' + str(time_cur) + 'txtname' + json_name + '_' + str(learning_rate) + '_tbs' + str(train_batch_size) \ + '_seq' + str(sequence_length) + '_opt' + str(optimizer_choice) + '_crop' + str(crop_type) \ + '_sgdstep' + str(sgd_step) + '_sgd_gamma' + str(sgd_gamma) + '_sgd_adjust_lr' + str(sgd_adjust_lr)+ '_weight_decay' + str(weight_decay) elif train_mode == 'RESLSTM_NOLOCAL' or train_mode == 'RESLSTM_NOLOCAL_dropout0.2': save_dir = save_dir_base + '/' + str(train_mode) + '/' + str(time_cur) + 'txtname' + json_name + '_' + str(learning_rate) + '_tbs' + str(train_batch_size) \ + '_seq' + str(sequence_length) + '_opt' + str(optimizer_choice) + '_crop' + str(crop_type) \ + '_adamgamma' + str(adamgamma) + '_adamstep' + str(adam_step) + '_adamweightdecay' + str(adamweightdecay) + '_block_num' + str(block_num) if if_load_old == True: # Check if a checkpoint is in there if len([name for name in os.listdir(save_dir)]) > 0: print("Loading old model") else: print("nothing to load") pdb.set_trace() else: os.makedirs(save_dir) if train_mode == 'RESLSTM': model = resnet_lstm() elif train_mode == 'RESLSTM_NOLOCAL': model = resnet_lstm_nonlocal() elif train_mode == 'RESLSTM_NOLOCAL_dropout0.2': model = 
resnet_lstm_nonlocal() chk = 'results_ResLSTM_Nolocal/RESLSTM_NOLOCAL/1572847215.642195txtname42974_1572767025.1601517.json_0.0005_tbs400_seq10_opt1_crop0_adamgamma0.1_adamstep3_adamweightdecay0.0001_block_num1/checkpoint_best-23.pt' print("Restoring: ", chk) # Load state = torch.load(chk) # newdict = {} # for k,v in state['state_dict'].items(): # if k[0:7] != 'module.': # name = 'module.' + k # newdict[name] = v # else: # newdict[k] = v model.load_state_dict(state['state_dict']) elif train_mode == 'RESLSTM_DBN': model = resnet_lstm_dropout() else: print("not implemented") pdb.set_trace() # print (model) # pdb.set_trace() if use_gpu: model = DataParallel(model) model.to(device) criterion = nn.CrossEntropyLoss(size_average=False) optimizer = None exp_lr_scheduler = None if multi_optim == 0: if optimizer_choice == 0: optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_step, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam(model.parameters(), lr=learning_rate) elif multi_optim == 1: if optimizer_choice == 0: optimizer = optim.SGD([ { 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_step, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: # optimizer = optim.Adam([ # {'params': model.module.share.parameters()}, # {'params': model.module.lstm.parameters(), 'lr': learning_rate}, # {'params': model.module.fc.parameters(), 'lr': learning_rate}, # ], lr=learning_rate / 10) optim_params = list( filter(lambda p: p.requires_grad, model.parameters())) print('Optimizing %d paramters' % len(optim_params)) optimizer = optim.Adam(optim_params, lr=learning_rate, weight_decay=adamweightdecay) exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=adam_step, gamma=adamgamma) #check if need load old weigth, optimizer if if_load_old: # Find last, not last best checkpoint files = glob(save_dir + '/*') global_steps = np.zeros([len(files)]) for i in range(len(files)): # Use meta files to find the highest index if 'best' in files[i]: continue if 'checkpoint-' not in files[i]: continue # Extract global step nums = [int(s) for s in re.findall(r'\d+', files[i])] global_steps[i] = nums[-1] # Create path with maximum global step found chkPath = save_dir + '/checkpoint-' + str(int( np.max(global_steps))) + '.pt' print("Restoring: ", chkPath) # Load state = torch.load(chkPath) # Initialize model and optimizer newdict = {} for k, v in state['state_dict'].items(): if k[0:7] != 'module.': name = 'module.' 
+ k newdict[name] = v else: newdict[k] = v model.load_state_dict(newdict) # model.load_state_dict(state['state_dict']) optimizer.load_state_dict(state['optimizer']) # pdb.set_trace() start_epoch = state['epoch'] best_epoch = int(np.max(global_steps)) best_val_accuracy = state['best_val_accuracy'] correspond_train_acc = state['correspond_train_acc'] else: start_epoch = 1 best_epoch = -1 best_val_accuracy = 0.0 correspond_train_acc = 0.0 if sv_init_model is not None: print("Restoring supervised model: ", sv_init_model) # Load state = torch.load(sv_init_model) # Initialize model and optimizer newdict = {} for k, v in state['state_dict'].items(): if k[0:7] != 'module.': name = 'module.' + k newdict[name] = v else: newdict[k] = v model.load_state_dict(newdict) best_model_wts = copy.deepcopy(model.module.state_dict()) for epoch in range(start_epoch, epochs + 1): np.random.shuffle(selected) train_idx = [] for i in range(len(selected)): for j in range(sequence_length): train_idx.append(selected[i] + j) num_train_all = len(train_idx) # subset = Subset(train_dataset,train_idx) # train_loader = DataLoader( # subset, # batch_size=train_batch_size, # sampler=SeqSampler(subset, train_idx), # num_workers=workers, # pin_memory=True # ) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=SeqSampler(train_dataset, train_idx), num_workers=workers, pin_memory=True) # pdb.set_trace() # Sets the module in training mode. model.train() train_loss = 0.0 train_corrects = 0 batch_progress = 0.0 train_start_time = time.time() for data in train_loader: optimizer.zero_grad() # torch.cuda.empty_cache() with torch.set_grad_enabled(True): if use_gpu: inputs, labels = data[0].to(device), data[1].to(device) labels = labels[(sequence_length - 1)::sequence_length] else: inputs, labels = data[0], data[1] labels = labels[(sequence_length - 1)::sequence_length] # pdb.set_trace() inputs = inputs.view(-1, sequence_length, 3, 224, 224) # pdb.set_trace() outputs = model.forward(inputs) # pdb.set_trace() outputs = outputs[sequence_length - 1::sequence_length] _, preds = torch.max(outputs.data, 1) loss = criterion(outputs, labels) loss.backward() optimizer.step() train_loss += loss.data.item() batch_corrects = torch.sum(preds == labels.data) train_corrects += batch_corrects batch_acc = float( batch_corrects) / train_batch_size * sequence_length batch_progress += 1 if batch_progress * train_batch_size >= num_train_all: percent = 100.0 print('Batch progress: %s [%d/%d] Batch acc:%.2f' % (str(percent) + '%', num_train_all, num_train_all, batch_acc), end='\n') else: percent = round( batch_progress * train_batch_size / num_train_all * 100, 2) print('Batch progress: %s [%d/%d] Batch acc:%.2f' % (str(percent) + '%', batch_progress * train_batch_size, num_train_all, batch_acc), end='\r') train_elapsed_time = time.time() - train_start_time train_accuracy = float(train_corrects) / float( num_train_all) * sequence_length train_average_loss = train_loss / num_train_all * sequence_length # Sets the module in evaluation mode. 
model.eval() val_loss = 0.0 val_corrects = 0 val_start_time = time.time() val_progress = 0 with torch.no_grad(): for data in val_loader: # torch.cuda.empty_cache() if use_gpu: inputs, labels = data[0].to(device), data[1].to(device) labels = labels[(sequence_length - 1)::sequence_length] else: inputs, labels = data[0], data[1] labels = labels[(sequence_length - 1)::sequence_length] if crop_type == 0 or crop_type == 1: inputs = inputs.view(-1, sequence_length, 3, 224, 224) outputs = model.forward(inputs) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(5, -1, 7) outputs = torch.mean(outputs, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(10, -1, 7) outputs = torch.mean(outputs, 0) outputs = outputs[sequence_length - 1::sequence_length] _, preds = torch.max(outputs.data, 1) loss = criterion(outputs, labels) val_loss += loss.data.item() val_corrects += torch.sum(preds == labels.data) val_progress += 1 if val_progress * val_batch_size >= num_val_all: percent = 100.0 print('Val progress: %s [%d/%d]' % (str(percent) + '%', num_val_all, num_val_all), end='\n') else: percent = round( val_progress * val_batch_size / num_val_all * 100, 2) print('Val progress: %s [%d/%d]' % (str(percent) + '%', val_progress * val_batch_size, num_val_all), end='\r') val_elapsed_time = time.time() - val_start_time val_accuracy = float(val_corrects) / float( num_val_all) * sequence_length val_average_loss = val_loss / num_val_all * sequence_length write_dict = { "train_loss": train_average_loss, "val_loss": val_average_loss, "train_accuracy": train_accuracy, "val_accuracy": val_accuracy } writer.add_scalars('scalar', write_dict, epoch) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss) if optimizer_choice == 1: exp_lr_scheduler.step() if val_accuracy >= best_val_accuracy: if val_accuracy > best_val_accuracy: best_val_accuracy = val_accuracy correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.module.state_dict()) oldBestInd = best_epoch best_epoch = epoch if val_accuracy == best_val_accuracy: if train_accuracy > correspond_train_acc: correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.module.state_dict()) oldBestInd = best_epoch best_epoch = epoch # Delte previously best model if os.path.isfile(save_dir + '/checkpoint_best-' + str(oldBestInd) + '.pt'): os.remove(save_dir + '/checkpoint_best-' + str(oldBestInd) + '.pt') # Save currently best model state = { 'epoch': epoch, 'state_dict': best_model_wts, 'optimizer': optimizer.state_dict(), 'best_val_accuracy': best_val_accuracy, 'correspond_train_acc': correspond_train_acc } torch.save(state, save_dir + '/checkpoint_best-' + str(epoch) + '.pt') # If its not better, just save it delete the last checkpoint if it is not current best one # Save current model state = { 'epoch': epoch, 'state_dict': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'best_val_accuracy': best_val_accuracy, 'correspond_train_acc': correspond_train_acc } torch.save(state, save_dir + '/checkpoint-' + str(epoch) + '.pt') # Delete last one if os.path.isfile(save_dir + '/checkpoint-' + str(epoch - 1) + '.pt'): os.remove(save_dir + '/checkpoint-' + str(epoch - 1) + '.pt') logger.info("\n") logger.info('Epoch: %d/%d (%d h %d m 
%d s)' % (epoch, epochs, int(train_elapsed_time / 3600), int(np.mod(train_elapsed_time, 3600) / 60), int(np.mod(np.mod(train_elapsed_time, 3600), 60))) + time.strftime("%d.%m.-%H:%M:%S", time.localtime())) logger.info('validation time: %d h %d m %d s' % (int(val_elapsed_time / 3600), int(np.mod(val_elapsed_time, 3600) / 60), int(np.mod(np.mod(val_elapsed_time, 3600), 60))) + time.strftime("%d.%m.-%H:%M:%S", time.localtime())) logger.info("training loss: %6f" % train_average_loss) logger.info("validation loss: %6f" % val_average_loss) logger.info("train accu: %6f" % train_accuracy) logger.info("validation accu: %6f" % val_accuracy) logger.info("best val accu: %6f at Epoch %d" % (best_val_accuracy, best_epoch)) logger.info("best corresponding train accu: %6f" % correspond_train_acc) writer.close()
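# The resume branch above globs `save_dir`, pulls the global step out of each
# `checkpoint-<step>.pt` filename with a regex, and restores the checkpoint with the
# highest step. A condensed sketch of that lookup, assuming the same naming scheme:
import re
from glob import glob

def find_latest_checkpoint(save_dir):
    """Return the path of the non-best checkpoint with the highest step, or None."""
    best_path, best_step = None, -1
    for path in glob(save_dir + '/checkpoint-*.pt'):
        if 'best' in path:
            continue
        nums = [int(s) for s in re.findall(r'\d+', path)]
        if nums and nums[-1] > best_step:
            best_step, best_path = nums[-1], path
    return best_path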
def train(self): if not self.pretrained_model: model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel( config=self.model_config) else: model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained( self.pretrained_model) model.train() model.to(self.device) # 计算模型参数量 num_parameters = 0 parameters = model.parameters() for parameter in parameters: num_parameters += parameter.numel() self.print_and_log('模型参数量 = {}'.format(num_parameters)) if self.do_tokenize: self.print_and_log("开始加载训练集") self.tokenize_and_save() self.print_and_log("训练集加载完毕") full_len = 0 for i in range(self.split_num): with open( self.tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f: full_len += len( [int(item) for item in f.read().strip().split()]) sample_num = int(full_len / self.stride) epoch_steps = int(full_len / self.stride / self.batch_size / self.gradient_accumulation) total_steps = int(full_len / self.stride * self.epochs / self.batch_size / self.gradient_accumulation) self.print_and_log('样本数 = {}'.format(sample_num)) self.print_and_log('epoch 步数 = {}'.format(epoch_steps)) self.print_and_log('总步数 = {}'.format(total_steps)) optimizer = pytorch_transformers.AdamW(model.parameters(), lr=self.lr, correct_bias=True) scheduler = pytorch_transformers.WarmupLinearSchedule( optimizer, warmup_steps=self.warmup_steps, t_total=total_steps) if self.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16_opt_level) if torch.cuda.device_count() > 1: model = DataParallel(model) multi_gpu = True else: multi_gpu = False overall_step = 0 running_loss = 0 for epoch in range(self.epochs): self.print_and_log('epoch {}'.format(epoch + 1)) now = datetime.now() self.print_and_log('time: {}'.format(now)) optimizer.zero_grad() split_indices = np.linspace(0, self.split_num - 1, self.split_num, dtype=np.int32) random.shuffle(split_indices) for split_index in split_indices: with open( self.tokenized_data_path + 'tokenized_train_{}.txt'.format(split_index), 'r') as f: line = f.read().strip() all_ids = line.split() all_ids = [int(x) for x in all_ids] start_point = 0 samples = [] while start_point < len(all_ids) - self.n_ctx: samples.append(all_ids[start_point:start_point + self.n_ctx]) start_point += self.stride random.shuffle(samples) for i in range(len(samples) // self.batch_size): # drop last batch = samples[i * self.batch_size:(i + 1) * self.batch_size] batch_labels = torch.tensor(batch, dtype=torch.long).to( self.device) batch_inputs = torch.tensor(batch, dtype=torch.long).to( self.device) outputs = model.forward(input_ids=batch_inputs, labels=batch_labels) loss, logits = outputs[:2] if multi_gpu: loss = loss.mean() if self.gradient_accumulation > 1: loss = loss / self.gradient_accumulation # loss backward if self.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), self.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm) if (i + 1) % self.gradient_accumulation == 0: running_loss += loss.item() scheduler.step() optimizer.step() optimizer.zero_grad() overall_step += 1 if (overall_step + 1) % self.log_step == 0 and running_loss != 0: self.print_and_log( 'now time: {}:{}. Step {} of epoch {}, loss {}'. 
format( datetime.now().hour, datetime.now().minute, overall_step + 1, epoch + 1, running_loss * self.gradient_accumulation / self.log_step)) running_loss = 0 if not os.path.exists(self.output_dir + 'model_epoch{}'.format(epoch + 1)): os.makedirs(self.output_dir + 'model_epoch{}'.format(epoch + 1)) gpt2_model = model.module.transformer if hasattr(model, 'module') else model.transformer model_to_save = gpt2_model model_to_save.save_pretrained(self.output_dir + 'model_epoch{}'.format(epoch + 1)) # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1)) # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1)) then = datetime.now() self.print_and_log('time: {}'.format(then)) self.print_and_log('time for one epoch: {}'.format(then - now)) self.print_and_log('training finished') self.f_log.close() if not os.path.exists(self.output_dir + 'final_model'): os.makedirs(self.output_dir + 'final_model') gpt2_model = model.module.transformer if hasattr(model, 'module') else model.transformer model_to_save = gpt2_model model_to_save.save_pretrained(self.output_dir + 'final_model')
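# The inner loop above scales the loss by `gradient_accumulation`, backpropagates every
# mini-batch, and only steps the optimizer every `gradient_accumulation` batches (the
# original code also steps the scheduler before the optimizer, whereas current PyTorch
# recommends optimizer.step() first). A stripped-down sketch of the accumulation pattern;
# `model`, `optimizer` and `loader` are placeholders, and the fp16/apex branch is omitted.
import torch

def accumulate_and_step(model, optimizer, loader, gradient_accumulation=4, max_grad_norm=1.0):
    optimizer.zero_grad()
    for i, (inputs, labels) in enumerate(loader):
        # assumes a language-model head that returns (loss, logits, ...) when labels are given
        loss = model(input_ids=inputs, labels=labels)[0]
        loss = loss / gradient_accumulation      # keep the effective loss scale constant
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        if (i + 1) % gradient_accumulation == 0:
            optimizer.step()
            optimizer.zero_grad()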
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu # num_train_we_use = 8000 # num_val_we_use = 800 train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use] # 训练数据开始位置 val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use] np.random.seed(0) np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j * srate) # 训练数据位置,每一张图是一个数据 val_idx = [] for i in range(num_val_we_use): for j in range(sequence_length): val_idx.append(val_we_use_start_idx[i] + j * srate) num_train_all = float(len(train_idx)) num_val_all = float(len(val_idx)) print('num of train dataset: {:6d}'.format(num_train)) print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) print('num of train we use : {:6d}'.format(num_train_we_use)) print('num of all train use: {:6d}'.format(int(num_train_all))) print('num of valid dataset: {:6d}'.format(num_val)) print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) print('num of valid we use : {:6d}'.format(num_val_we_use)) print('num of all valid use: {:6d}'.format(int(num_val_all))) val_loader = DataLoader( val_dataset, batch_size=val_batch_size, # sampler=val_idx, sampler=SeqSampler(val_dataset, val_idx), num_workers=workers, pin_memory=False ) model = resnet_lstm() if use_gpu: model = model.cuda() model = DataParallel(model) criterion = nn.CrossEntropyLoss() ''' if multi_optim == 0: if optimizer_choice == 0: optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam(model.parameters(), lr=learning_rate) elif multi_optim == 1: if optimizer_choice == 0: optimizer = optim.SGD([ {'params': model.module.share.parameters()}, {'params': model.module.lstm.parameters(), 'lr': learning_rate}, {'params': model.module.fc.parameters(), 'lr': learning_rate}, ], lr=learning_rate / 10, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([ {'params': model.module.share.parameters()}, {'params': model.module.lstm.parameters(), 'lr': learning_rate}, {'params': model.module.fc.parameters(), 'lr': learning_rate}, ], lr=learning_rate / 10) ''' optimizer = optim.Adam(model.parameters(), lr=learning_rate) best_model_wts = copy.deepcopy(model.state_dict()) best_val_accuracy = 0.0 correspond_train_acc = 0.0 record_np = np.zeros([epochs, 4]) for epoch in range(epochs): np.random.seed(epoch) 
np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j * srate) train_loader = DataLoader( train_dataset, batch_size=train_batch_size, sampler=SeqSampler(train_dataset, train_idx), num_workers=workers, pin_memory=False ) model.train() train_loss = 0.0 train_corrects = 0 train_start_time = time.time() num = 0 train_num = 0 for data in train_loader: num = num + 1 #inputs, labels_phase, kdata = data inputs, labels_phase = data if use_gpu: inputs = Variable(inputs.cuda()) labels = Variable(labels_phase.cuda()) #kdatas = Variable(kdata.cuda()) else: inputs = Variable(inputs) labels = Variable(labels_phase) #kdatas = Variable(kdata) optimizer.zero_grad() #outputs = model.forward(inputs, kdatas) outputs = model.forward(inputs) outputs = F.softmax(outputs, dim=1) _, preds = torch.max(outputs.data, 1) print(num) print(preds) print(labels) loss = criterion(outputs, labels) loss.backward() optimizer.step() train_loss += loss.data train_corrects += torch.sum(preds == labels.data) train_num += labels.shape[0] print(train_corrects.cpu().numpy() / train_num) if train_corrects.cpu().numpy() / train_num > 0.75: torch.save(copy.deepcopy(model.state_dict()), 'test.pth') train_elapsed_time = time.time() - train_start_time train_accuracy = train_corrects.cpu().numpy() / train_num train_average_loss = train_loss / train_num # begin eval model.eval() val_loss = 0.0 val_corrects = 0 val_num = 0 val_start_time = time.time() for data in val_loader: #inputs, labels_phase, kdata = data inputs, labels_phase = data #labels_phase = labels_phase[(sequence_length - 1)::sequence_length] #kdata = kdata[(sequence_length - 1)::sequence_length] if use_gpu: inputs = Variable(inputs.cuda()) labels = Variable(labels_phase.cuda()) #kdatas = Variable(kdata.cuda()) else: inputs = Variable(inputs) labels = Variable(labels_phase) #kdatas = Variable(kdata) if crop_type == 0 or crop_type == 1: #outputs = model.forward(inputs, kdatas) outputs = model.forward(inputs) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) #outputs = model.forward(inputs, kdatas) outputs = model.forward(inputs) outputs = outputs.view(5, -1, 3) outputs = torch.mean(outputs, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) #outputs = model.forward(inputs, kdatas) outputs = model.forward(inputs) outputs = outputs.view(10, -1, 3) outputs = torch.mean(outputs, 0) #outputs = outputs[sequence_length - 1::sequence_length] _, preds = torch.max(outputs.data, 1) print(num) print(preds) print(labels) loss = criterion(outputs, labels) val_loss += loss.data val_corrects += torch.sum(preds == labels.data) val_num += labels.shape[0] val_elapsed_time = time.time() - val_start_time val_accuracy = val_corrects.cpu().numpy() / val_num val_average_loss = val_loss / val_num print('epoch: {:4d}' ' train in: {:2.0f}m{:2.0f}s' ' train loss: {:4.4f}' ' train accu: {:.4f}' ' valid in: {:2.0f}m{:2.0f}s' ' valid loss: {:4.4f}' ' valid accu: {:.4f}' .format(epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_average_loss, train_accuracy, val_elapsed_time // 60, val_elapsed_time % 60, val_average_loss, val_accuracy)) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss) if val_accuracy > best_val_accuracy: best_val_accuracy = val_accuracy correspond_train_acc 
= train_accuracy best_model_wts = copy.deepcopy(model.state_dict()) if val_accuracy == best_val_accuracy: if train_accuracy > correspond_train_acc: correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.state_dict()) record_np[epoch, 0] = train_accuracy record_np[epoch, 1] = train_average_loss record_np[epoch, 2] = val_accuracy record_np[epoch, 3] = val_average_loss np.save(str(epoch) + '.npy', record_np) print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(best_val_accuracy, correspond_train_acc)) save_val = int("{:4.0f}".format(best_val_accuracy * 10000)) save_train = int("{:4.0f}".format(correspond_train_acc * 10000)) model_name = "lstm" \ + "_epoch_" + str(epochs) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train_" + str(save_train) \ + "_val_" + str(save_val) \ + ".pth" torch.save(best_model_wts, model_name) record_name = "lstm" \ + "_epoch_" + str(epochs) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train_" + str(save_train) \ + "_val_" + str(save_val) \ + ".npy" np.save(record_name, record_np)
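# The loop above feeds `F.softmax(outputs, dim=1)` into `nn.CrossEntropyLoss()`, which
# already applies log-softmax internally. The usual pattern is to give the criterion raw
# logits and compute softmax only when probabilities are needed for reporting. A minimal
# sketch of that split (standard usage, not the exact code above; tensors are dummies):
import torch
import torch.nn as nn
import torch.nn.functional as F

criterion = nn.CrossEntropyLoss()
logits = torch.randn(8, 3)                # raw model outputs: 8 samples, 3 classes
targets = torch.randint(0, 3, (8,))
loss = criterion(logits, targets)         # logits go straight into the criterion
probs = F.softmax(logits, dim=1)          # probabilities only for logging / inspection
preds = probs.argmax(dim=1)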
def main(): parser = argparse.ArgumentParser() parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡') parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='选择模型参数') parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库') parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='原始训练语料') parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='tokenized语料存放位置') parser.add_argument('--raw', action='store_true', help='是否先做tokenize') parser.add_argument('--epochs', default=5, type=int, required=False, help='训练循环') parser.add_argument('--batch_size', default=64, type=int, required=False, help='训练batch size') parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='学习率') parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数') parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss,设置为gradient accumulation的整数倍') parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长') parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累') parser.add_argument('--fp16', action='store_true', help='混合精度') parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False) parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False) parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份') parser.add_argument('--min_length', default=128, type=int, required=False, help='最短收录文章长度') parser.add_argument('--max_length', default=256, type=int, required=False, help='最短收录文章长度') parser.add_argument('--output_dir', default='model/', type=str, required=False, help='模型输出路径') parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径') parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str, required=False, help='Tensorboard路径') parser.add_argument('--segment', action='store_true', help='中文以词为单位') parser.add_argument('--bpe_token', action='store_true', help='subword') parser.add_argument('--encoder_json', default="tokenizations/encoder.json", type=str, help="encoder.json") parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe", type=str, help="vocab.bpe") parser.add_argument('--max_steps_perEpoch_perPiece', default=1000000, type=int, required=False) parser.add_argument('--steps_savemodel', default=10000, type=int, required=False, help='保存模型步数') parser.add_argument('--padding', action='store_true', help='输入是否定长') args = parser.parse_args() print('args:\n' + args.__repr__()) if args.segment: from tokenizations import tokenization_bert_word_level as tokenization_bert else: from tokenizations import tokenization_bert #os.environ["CUDA_VISIBLE_DEVICES"] = args.device # 此处设置程序使用哪些显卡 model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config) print('config:\n' + model_config.to_json_string()) n_ctx = model_config.n_ctx if args.bpe_token: full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe) else: full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path) full_tokenizer.max_len = 999999 device = 'cuda' if torch.cuda.is_available() else 'cpu' print('using device:', device) raw_data_path = args.raw_data_path tokenized_data_path = 
args.tokenized_data_path raw = args.raw # 选择是否从零开始构建数据集 epochs = args.epochs batch_size = args.batch_size lr = args.lr warmup_steps = args.warmup_steps log_step = args.log_step stride = args.stride gradient_accumulation = args.gradient_accumulation fp16 = args.fp16 # 不支持半精度的显卡请勿打开 fp16_opt_level = args.fp16_opt_level max_grad_norm = args.max_grad_norm num_pieces = args.num_pieces min_length = args.min_length output_dir = args.output_dir padding = args.padding max_length = args.max_length #tb_writer = SummaryWriter(log_dir=args.writer_dir) assert log_step % gradient_accumulation == 0 if not os.path.exists(output_dir): os.mkdir(output_dir) if not args.pretrained_model: model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config) else: model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model) model.train() model.to(device) num_parameters = 0 parameters = model.parameters() for parameter in parameters: num_parameters += parameter.numel() print('number of parameters: {}'.format(num_parameters)) multi_gpu = False optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True) #scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, # t_total=total_steps) if fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = DataParallel(model, device_ids=[int(i) for i in args.device.split(',')]) multi_gpu = True print('starting training') step_loss = 0 running_loss = 10 loss_ = 10 iter = iterData(args.tokenized_data_path, rate=1.0, batch_size=batch_size, epochs=epochs) step = 0 epoch0 = -1 while True: data = next(iter) if data=='__STOP__': break epoch, epochs, idx_file, nb_files, batch_inputs = data random.shuffle(batch_inputs) batch_inputs = torch.tensor(batch_inputs).long().to(device) # forward pass outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs) loss, logits = outputs[:2] # get loss if multi_gpu: loss = loss.mean() if gradient_accumulation > 1: loss = loss / gradient_accumulation # loss backward if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # optimizer step if (step + 1) % gradient_accumulation == 0: running_loss += loss.item() optimizer.step() optimizer.zero_grad() step_loss += 1 #scheduler.step() if (step + 1) % log_step == 0: loss_ = running_loss * gradient_accumulation / (log_step / gradient_accumulation) print('now time: {}:{}. 
step: {}, progress-innerEpoch: {}/{}, progress-outerEpoch: {}/{}, loss {}'.format( datetime.now().hour, datetime.now().minute, step+1, idx_file+1, nb_files, epoch + 1, epochs, loss_)) running_loss = 0 if step%args.steps_savemodel==0: print('saving model for epoch {}'.format(epoch + 1)) output_dir_ = output_dir + 'model_epoch{}_step{}_loss-{}'.format(epoch + 1, step,'%0.2f'%loss_) if not os.path.exists(output_dir_): os.mkdir(output_dir_) model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir_) step += 1 if epoch!=epoch0: if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)): os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1)) model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1)) epoch0 = epoch print('epoch {} finished'.format(epoch + 1)) if not os.path.exists(output_dir + 'final_model'): os.mkdir(output_dir + 'final_model') model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir + 'final_model') print('training finished')
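# `iterData` is defined elsewhere in the project; the loop above only assumes it yields
# (epoch, epochs, idx_file, nb_files, batch_inputs) tuples and finally the sentinel
# '__STOP__'. A rough generator honouring that contract, under the assumption that each
# tokenized_train_<i>.txt holds space-separated token ids; n_ctx and stride are illustrative.
import glob
import random

def iter_tokenized(data_path, batch_size, epochs, n_ctx=1024, stride=768):
    files = sorted(glob.glob(data_path + 'tokenized_train_*.txt'))
    for epoch in range(epochs):
        for idx_file, path in enumerate(files):
            with open(path, 'r') as f:
                ids = [int(t) for t in f.read().strip().split()]
            samples = [ids[s:s + n_ctx] for s in range(0, len(ids) - n_ctx, stride)]
            random.shuffle(samples)
            for b in range(len(samples) // batch_size):   # drop the last partial batch
                batch = samples[b * batch_size:(b + 1) * batch_size]
                yield epoch, epochs, idx_file, len(files), batch
    yield '__STOP__'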
def main(): parser = argparse.ArgumentParser() parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡') parser.add_argument('--model_config', default='config/model_config.json', type=str, required=False, help='选择模型参数') parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库') parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='原始训练语料') parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='tokenized语料存放位置') parser.add_argument('--raw', action='store_true', help='是否先做tokenize') parser.add_argument('--epochs', default=5, type=int, required=False, help='训练循环') parser.add_argument('--batch_size', default=8, type=int, required=False, help='训练batch size') parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='学习率') parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数') parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss') parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长') parser.add_argument('--gradient_accumulation', default=1, type=int, required=False, help='梯度积累') parser.add_argument('--fp16', action='store_true', help='混合精度') parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False) parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False) parser.add_argument('--num_pieces', default=1, type=int, required=False, help='将训练语料分成多少份') parser.add_argument('--output_dir', default='model/', type=str, required=False, help='模型输出路径') parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径') parser.add_argument('--segment', action='store_true', help='中文以词为单位') ''' 配置参数------------------------------------------------------------------- ''' args = parser.parse_args() args.device = '1' args.batch_size = 5 from tokenizations import tokenization proj_root_path = os.path.dirname( os.path.dirname(os.path.realpath(__file__))) vocab_file_path = "tokenizations/clue-vocab.txt" #使用预训练里面的词典进行编码 text = '我是一个人' tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True) line = tokenization.convert_to_unicode(text) bert_tokens = tokenizer.tokenize(line) encoded = tokenizer.convert_tokens_to_ids(bert_tokens) # 下面关注一下数据集的写法. args.raw = True args.raw_data_path = '172166.txt' # -small是小的版本 args.epochs = 200 args.output_dir = 'model/' # 结果存到e盘的final_model args.num_pieces = 10 # 结果存到e盘的final_model from pre_data_byOnlyOneBook import get_data as get_data name2 = args.raw_data_path.split('.')[0] get_data(name2 + '.txt', name2 + '.json') # 下面使用166893.json即可. ''' ------------------------------------------------------------------------------ ''' #---------------配置完毕 print('args:\n' + args.__repr__()) if args.segment: from tokenizations import tokenization_bert_word_level as tokenization_bert else: from tokenizations import tokenization_bert os.environ["CUDA_VISIBLE_DEVICES"] = args.device # 此处设置程序使用哪些显卡 model_config = transformers.modeling_gpt2.GPT2Config.from_json_file( args.model_config) # 这个参数很重要,表示一句话的长度. 
print('config:\n' + model_config.to_json_string()) n_ctx = model_config.n_ctx # full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path) full_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True) ''' full_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True) ''' ''' 直接使用gpt2的tokenizer ''' full_tokenizer.max_len = 999999 device = 'cuda' if torch.cuda.is_available() else 'cpu' print('using device:', device) raw_data_path = args.raw_data_path tokenized_data_path = args.tokenized_data_path raw = args.raw # 选择是否从零开始构建数据集 epochs = args.epochs batch_size = args.batch_size lr = args.lr warmup_steps = args.warmup_steps log_step = args.log_step stride = args.stride gradient_accumulation = args.gradient_accumulation fp16 = args.fp16 # 不支持半精度的显卡请勿打开 fp16_opt_level = args.fp16_opt_level max_grad_norm = args.max_grad_norm num_pieces = args.num_pieces output_dir = args.output_dir # 'data/tokenized/' 编码之后的东西放在这里. if raw: print('building files') build_files(raw_data_path=name2 + '.json', tokenized_data_path=tokenized_data_path, full_tokenizer=full_tokenizer, num_pieces=num_pieces) print('files built') if not args.pretrained_model: model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config) else: model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained( args.pretrained_model) model.train() model.to(device) multi_gpu = False full_len = 0 print('calculating total steps') for i in tqdm(range(num_pieces)): with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f: full_len += len([int(item) for item in f.read().strip().split()]) import math total_steps = math.ceil(full_len / stride * epochs / batch_size / gradient_accumulation) print('total steps = {}'.format(total_steps)) optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True) scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps) if fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = DataParallel(model) multi_gpu = True print('starting training') running_loss = 0 for epoch in range(epochs): print('epoch {}'.format(epoch + 1)) now = datetime.now() print('time: {}'.format(now)) x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32) random.shuffle(x) piece_num = 0 loss_save = [] for i in x: with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f: line = f.read().strip() tokens = line.split() tokens = [int(token) for token in tokens] start_point = 0 samples = [] while start_point < len(tokens) - n_ctx: # n_ctx 表示上下文的长度. samples.append(tokens[start_point:start_point + n_ctx]) start_point += stride if start_point < len(tokens): # 拼接上最后一个例子. 
samples.append(tokens[len(tokens) - n_ctx:]) random.shuffle(samples) for step in range((len(samples) // batch_size) + 1): # 多跑一个 # prepare data #先判断是否超界,如果超界就表示最后一个组不成batch,所以break if step * batch_size > len(samples) - 1: break batch = samples[step * batch_size:(step + 1) * batch_size] batch_labels = [] batch_inputs = [] for ids in batch: int_ids_for_labels = [int(x) for x in ids] int_ids_for_inputs = [int(x) for x in ids] batch_labels.append(int_ids_for_labels) batch_inputs.append(int_ids_for_inputs) batch_labels = torch.tensor(batch_labels).long().to(device) batch_inputs = torch.tensor(batch_inputs).long().to(device) # forward pass 居然输入输出都一样????????很奇怪这个模型. ''' 下面为了对比,把ctrl的模型写这里: flag_input, inputs = numericalize(domain+tokenized_train_text[i:i+seq_length]) # 注意输入要牵头加上domain. flag_output, outputs = numericalize(tokenized_train_text[i:i+seq_length+1]) # ctrl算法输入是 i:j 输出是i:j+1 研究一下这个数据的问题: https://www.cnblogs.com/wwj99/p/12503545.html 好像还真是,样本和标签一样. ''' outputs = model.forward(input_ids=batch_inputs, labels=batch_labels) loss, logits = outputs[:2] # get loss if multi_gpu: loss = loss.mean() if gradient_accumulation > 1: loss = loss / gradient_accumulation # loss backward if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # optimizer step if (step + 1) % gradient_accumulation == 0: running_loss += loss.item() optimizer.step() optimizer.zero_grad() scheduler.step() if (step + 1) % log_step == 0: print( 'now time: {}:{}. Step {} of piece {} of epoch {}, loss {}' .format(datetime.now().hour, datetime.now().minute, (step + 1) // gradient_accumulation, piece_num, epoch + 1, running_loss / log_step)) loss_save.append(running_loss / log_step) running_loss = 0 piece_num += 1 #--------------检测是否提前退出 last = loss_save[:10] avg1 = sum(last) / 10 #如果全在avg1上下百分之5以内就停止: last = np.array(last) avg1 = np.array(avg1) tmp = np.all(last >= avg1 * 0.97) and np.all(last >= avg1 * 1.03) if len(last) >= 10 and tmp and loss_save[-1] < 0.05: break #-------------------- print('training finished') if not os.path.exists(output_dir + 'final_model'): os.makedirs(output_dir + 'final_model') model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir + 'final_model')
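# The piece loop above cuts each tokenized file into overlapping n_ctx-length windows with
# step `stride`, then appends one tail window anchored to the end of the file so the last
# tokens are still covered. The same windowing as a small standalone helper:
def make_windows(tokens, n_ctx, stride):
    samples = []
    start_point = 0
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += stride
    if start_point < len(tokens):              # tail window, anchored to the end
        samples.append(tokens[len(tokens) - n_ctx:])
    return samples

# e.g. make_windows(list(range(10)), n_ctx=4, stride=3)
# -> [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]]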
def test_model(test_dataset, test_num_each): num_test = len(test_dataset) test_useful_start_idx = get_useful_start_idx(sequence_length, test_num_each) num_test_we_use = len(test_useful_start_idx) test_we_use_start_idx = test_useful_start_idx[0:num_test_we_use] test_idx = [] for i in range(num_test_we_use): for j in range(sequence_length): test_idx.append(test_we_use_start_idx[i] + j) num_test_all = len(test_idx) print('num test start idx : {:6d}'.format(len(test_useful_start_idx))) print('last idx test start: {:6d}'.format(test_useful_start_idx[-1])) print('num of test dataset: {:6d}'.format(num_test)) print('num of test we use : {:6d}'.format(num_test_we_use)) print('num of all test use: {:6d}'.format(num_test_all)) test_loader = DataLoader(test_dataset, batch_size=test_batch_size, sampler=SeqSampler(test_dataset, test_idx), num_workers=workers) # model = i3_res50_nl_new_test(400) # model = i3_res50_nl_new_test_1block(400) model = resnet_lstm_nonlocal() # num_ftrs = model.fc.in_features # model.fc = nn.Linear(num_ftrs, class_num) print(model) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") #consider multi gpu formatted at module. state = torch.load(model_name) newdict = {} for k,v in state['state_dict'].items(): if k[0:7] == 'module.': name = k[7:] newdict[name] = v else: newdict[k] = v model.load_state_dict(newdict) model = DataParallel(model) if use_gpu: model.to(device) criterion = nn.CrossEntropyLoss(size_average=False) model.eval() test_loss = 0.0 test_corrects = 0 test_start_time = time.time() all_preds = [] pth_blobs = {} # f = open('./possibility.txt', 'a') with torch.no_grad(): for data in test_loader: # torch.cuda.empty_cache() if use_gpu: inputs, labels = data[0].to(device), data[1].to(device) labels = labels[(sequence_length - 1)::sequence_length] else: inputs, labels = data[0], data[1] labels = labels[(sequence_length - 1)::sequence_length] if crop_type == 0 or crop_type == 1: inputs = inputs.view(-1, sequence_length, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs[sequence_length - 1::sequence_length] _, preds = torch.max(outputs.data, 1) for i in range(len(preds)): all_preds.append(preds[i]) print("all_preds length:",len(all_preds)) loss = criterion(outputs, labels) test_loss += loss.data.item() test_corrects += torch.sum(preds == labels.data) print("preds:",preds) print("labels:",labels.data) # pdb.set_trace() test_loss += loss.data.item() print("test_corrects:",test_corrects) # f.write("preds:"+str(preds.cpu().numpy())) # f.write('\t') # f.write("labels:" + str(labels.data.cpu().numpy())) # f.write('\t') # f.write("possibility:" + str(possibility.cpu().numpy())) # f.write('\n') # f.close() test_elapsed_time = time.time() - test_start_time test_accuracy = float(test_corrects) / float(num_test_we_use) test_average_loss = test_loss / num_test_we_use # print('type of all_preds:', type(all_preds)) # print('leng of all preds:', len(all_preds)) save_test = int("{:4.0f}".format(test_accuracy * 10000)) pred_name = model_pure_name + '_test_' + str(save_test) + '_crop_' + str(crop_type) + '.pkl' with open(pred_name, 'wb') as f: pickle.dump(all_preds, f) print('test elapsed: {:2.0f}m{:2.0f}s' ' test loss: {:4.4f}' ' test accu: {:.4f}' .format(test_elapsed_time // 60, test_elapsed_time % 60, test_average_loss, test_accuracy))
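# The loader above strips a leading 'module.' from checkpoint keys so that weights saved
# from a DataParallel-wrapped model load into a bare module. The same key normalisation
# as a reusable helper:
def strip_module_prefix(state_dict):
    """Remove a leading 'module.' from every key, if present."""
    newdict = {}
    for k, v in state_dict.items():
        newdict[k[7:] if k.startswith('module.') else k] = v
    return newdict

# usage sketch, mirroring the code above:
# state = torch.load(model_name)
# model.load_state_dict(strip_module_prefix(state['state_dict']))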
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu # num_train_we_use = 8000 # num_val_we_use = 800 train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use] val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use] # np.random.seed(0) # np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) val_idx = [] for i in range(num_val_we_use): for j in range(sequence_length): val_idx.append(val_we_use_start_idx[i] + j) num_train_all = len(train_idx) num_val_all = len(val_idx) print('num of train dataset: {:6d}'.format(num_train)) print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) print('num of train we use : {:6d}'.format(num_train_we_use)) print('num of all train use: {:6d}'.format(num_train_all)) print('num of valid dataset: {:6d}'.format(num_val)) print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) print('num of valid we use : {:6d}'.format(num_val_we_use)) print('num of all valid use: {:6d}'.format(num_val_all)) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) val_loader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=val_idx, num_workers=workers, pin_memory=False) model = resnet_lstm_dp() if use_gpu: model = model.cuda() model = DataParallel(model) criterion = nn.CrossEntropyLoss(size_average=False) if multi_optim == 0: if optimizer_choice == 0: optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam(model.parameters(), lr=learning_rate) elif multi_optim == 1: if optimizer_choice == 0: optimizer = optim.SGD([ { 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([ { 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10) best_model_wts = copy.deepcopy(model.state_dict()) best_val_accuracy = 0.0 correspond_train_acc = 0.0 all_info = [] all_train_accuracy = [] all_train_loss = [] all_val_accuracy = [] all_val_loss = [] for epoch in range(epochs): # 
np.random.seed(epoch) np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) model.train() train_loss = 0.0 train_corrects = 0 train_start_time = time.time() for data in train_loader: inputs, labels_1, labels_2 = data if use_gpu: inputs = Variable(inputs.cuda()) labels = Variable(labels_2.cuda()) else: inputs = Variable(inputs) labels = Variable(labels_2) optimizer.zero_grad() outputs = model.forward(inputs) _, preds = torch.max(outputs.data, 1) loss = criterion(outputs, labels) loss.backward() optimizer.step() train_loss += loss.data[0] train_corrects += torch.sum(preds == labels.data) train_elapsed_time = time.time() - train_start_time train_accuracy = train_corrects / num_train_all train_average_loss = train_loss / num_train_all # begin eval model.eval() val_loss = 0.0 val_corrects = 0 val_start_time = time.time() for data in val_loader: inputs, labels_1, labels_2 = data labels_2 = labels_2[(sequence_length - 1)::sequence_length] if use_gpu: inputs = Variable(inputs.cuda()) labels = Variable(labels_2.cuda()) else: inputs = Variable(inputs) labels = Variable(labels_2) if crop_type == 0 or crop_type == 1: outputs = model.forward(inputs) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(5, -1, 7) outputs = torch.mean(outputs, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(10, -1, 7) outputs = torch.mean(outputs, 0) outputs = outputs[sequence_length - 1::sequence_length] _, preds = torch.max(outputs.data, 1) loss = criterion(outputs, labels) val_loss += loss.data[0] val_corrects += torch.sum(preds == labels.data) val_elapsed_time = time.time() - val_start_time val_accuracy = val_corrects / num_val_we_use val_average_loss = val_loss / num_val_we_use print('epoch: {:4d}' ' train in: {:2.0f}m{:2.0f}s' ' train loss: {:4.4f}' ' train accu: {:.4f}' ' valid in: {:2.0f}m{:2.0f}s' ' valid loss: {:4.4f}' ' valid accu: {:.4f}'.format(epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_average_loss, train_accuracy, val_elapsed_time // 60, val_elapsed_time % 60, val_average_loss, val_accuracy)) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss) if val_accuracy > best_val_accuracy: best_val_accuracy = val_accuracy correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.state_dict()) if val_accuracy == best_val_accuracy: if train_accuracy > correspond_train_acc: correspond_train_acc = train_accuracy best_model_wts = copy.deepcopy(model.state_dict()) all_train_loss.append(train_average_loss) all_train_accuracy.append(train_accuracy) all_val_loss.append(val_average_loss) all_val_accuracy.append(val_accuracy) print('best accuracy: {:.4f} cor train accu: {:.4f}'.format( best_val_accuracy, correspond_train_acc)) save_val = int("{:4.0f}".format(best_val_accuracy * 10000)) save_train = int("{:4.0f}".format(correspond_train_acc * 10000)) model_name = "lstm" \ + "_epoch_" + str(epochs) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + 
"_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train_" + str(save_train) \ + "_val_" + str(save_val) \ + ".pth" torch.save(best_model_wts, model_name) all_info.append(all_train_accuracy) all_info.append(all_train_loss) all_info.append(all_val_accuracy) all_info.append(all_val_loss) record_name = "lstm" \ + "_epoch_" + str(epochs) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train_" + str(save_train) \ + "_val_" + str(save_val) \ + ".pkl" with open(record_name, 'wb') as f: pickle.dump(all_info, f) print()
def train_process(): global global_step summary_writer = tensorboardX.SummaryWriter( log_dir=config.result_sub_folder, comment=config.comment) train_tfs = compose( [rotate_y(), rand_scale(), rand_translate(), jitter(), normalize()]) test_tfs = normalize() scene_tfs = compose([normalize(), to_tensor()]) # prepare data print("config.dataset") if config.dataset == "ModelNet40": train_set = ModelNet40(partition='train', transforms=train_tfs) valid_set = ModelNet40(partition='test', transforms=test_tfs) elif config.dataset == "Mnist": train_set = Mnist(partition='train') valid_set = Mnist(partition='test') elif config.dataset == "ScanNet": train_set = ScanNet(partition='train', transforms=train_tfs) valid_set = ScanNet(partition='test', transforms=test_tfs) elif config.dataset == "ModelNet10": train_set = ModelNet10(partition='train') valid_set = ModelNet10(partition='test') elif config.dataset == "S3DIS": train_set = S3DIS(partition='train', transforms=train_tfs) valid_set = S3DIS(partition='test', transforms=test_tfs) scene_set = S3DIS(partition='data/zero_0.h5', transforms=test_tfs) elif config.dataset == "ShapeNetParts": train_set = ShapeNetPart(partition='trainval', transforms=train_tfs) valid_set = ShapeNetPart(partition='test', transforms=test_tfs) elif config.dataset == "Cifar10": train_set = Cifar10(partition='train') valid_set = Cifar10(partition='test') else: raise NotImplementedError train_loader = DataLoader(train_set, batch_size=config.train.batch_size, shuffle=True, num_workers=config.num_workers, drop_last=True) valid_loader = DataLoader(valid_set, batch_size=config.validation.batch_size, shuffle=False, num_workers=config.num_workers, drop_last=False) if config.dataset == "S3DIS": scene_loader = DataLoader(scene_set, batch_size=config.validation.batch_size, shuffle=False, num_workers=config.num_workers, drop_last=False) print('train set size: {}'.format(len(train_set))) print('valid set size: {}'.format(len(valid_set))) if config.dataset == "S3DIS": print('scene set size: {}'.format(len(scene_set))) # prepare model net = create_model(config.base_model).to(config.device) # prepare optimizer if config.train.optimizer == 'SGD': optimizer = optim.SGD(net.parameters(), config.train.learning_rate_base, momentum=config.train.momentum) elif config.train.optimizer == 'ADAM': optimizer = optim.Adam(net.parameters(), lr=config.train.learning_rate_base, eps=1e-08, weight_decay=1e-4) else: raise NotImplementedError net = DataParallel(net) if config.train.resume: model_recorder = ModelRecorder(config.resume_ckpt_file, optimizer, summary_writer=summary_writer) else: model_recorder = ModelRecorder(config.ckpt_file, optimizer, summary_writer=summary_writer) start_epoch = 0 if config.train.resume: if not config.task == "seg": start_epoch = model_recorder.resume(net.module, optimizer, from_measurement='acc') else: start_epoch = model_recorder.resume(net.module, optimizer, from_measurement='iou') if config.train.resume_epoch is not None: start_epoch = config.train.resume_epoch print("Force resume at {}".format(start_epoch)) else: print("Resume at {}".format(start_epoch)) # prepare the criterion criterion = nn.CrossEntropyLoss() # start to train for epoch in range(start_epoch, config.train.num_epochs): lr = config.train.learning_rate_base * (math.pow( config.train.decay_rate, epoch // 10)) if lr < config.train.learning_rate_min: lr = config.train.learning_rate_min for g in optimizer.param_groups: g['lr'] = lr summary_writer.add_scalar('Learning rate', lr, global_step=epoch) if config.task == "seg": 
training_loss, training_acc, avg_per_class_acc, train_ious = train_epoch( train_loader, net, criterion, optimizer, epoch) summary_writer.add_scalar('Training Loss', training_loss, global_step=epoch) summary_writer.add_scalar('Training Accuracy', training_acc, global_step=epoch) summary_writer.add_scalar('Training Average Precision ', avg_per_class_acc, global_step=epoch) summary_writer.add_scalar('Training IOUs ', train_ious, global_step=epoch) else: training_loss, training_acc = train_epoch(train_loader, net, criterion, optimizer, epoch) summary_writer.add_scalar('Training Accuracy', training_acc, global_step=epoch) summary_writer.add_scalar('Training Loss', training_loss, global_step=epoch) if (epoch % config.validation.step_val == 0) or (epoch == config.train.num_epochs - 1): with torch.no_grad(): if config.task == "seg": validation_loss, validation_acc, avg_per_class_acc, val_ious = evaluate( valid_loader, net, html_path="training_output") summary_writer.add_scalar('Validation Loss', validation_loss, global_step=epoch) summary_writer.add_scalar('Validation Accuracy', validation_acc, global_step=epoch) summary_writer.add_scalar('Validation Average Precision ', avg_per_class_acc, global_step=epoch) summary_writer.add_scalar('Validation IOUs ', val_ious, global_step=epoch) if config.dataset == "ScanNet": net.eval() print('Scene Validation') y_true = [] y_pred = [] sample_num = 2048 max_point_num = 8192 batch_size = math.ceil(max_point_num / sample_num) indices_batch_indices = np.tile( np.reshape(np.arange(batch_size), (batch_size, 1, 1)), (1, sample_num, 1)) data_h5 = h5py.File("zero_0.h5", 'r+') data = data_h5['data'][...].astype(np.float32) data_num = data_h5['data_num'][...].astype(np.int32) data_labels_seg = data_h5['label_seg'][...].astype( np.int64) data_h5.close() batch_num = data.shape[0] labels_pred = np.full((batch_num, max_point_num), -1, dtype=np.int32) confidences_pred = np.zeros((batch_num, max_point_num), dtype=np.float32) for batch_idx in range(batch_num): if batch_idx % 10 == 0: print('{}-Processing {} of {} batches.'.format( datetime.now(), batch_idx, batch_num)) points_batch = data[batch_idx] point_num = data_num[batch_idx] seg_np = (data_labels_seg[batch_idx])[:point_num] y_true.append(seg_np.reshape(-1, 1)) tile_num = math.ceil( (sample_num * batch_size) / point_num) indices_shuffle = np.tile(np.arange(point_num), tile_num)[0:sample_num * batch_size] np.random.shuffle(indices_shuffle) input_points = scene_tfs( (points_batch[indices_shuffle]).reshape( (batch_size, sample_num, -1))).to(config.device) seg_probs = net.forward(input_points) probs_2d = np.reshape( seg_probs.detach().cpu().numpy(), (sample_num * batch_size, -1)) predictions = [(-1, 0.0)] * point_num for idx in range(sample_num * batch_size): point_idx = indices_shuffle[idx] probs = probs_2d[idx, :] confidence = np.amax(probs) label = np.argmax(probs) if confidence > predictions[point_idx][1]: predictions[point_idx] = [ label, confidence ] pred_np = np.array(predictions)[:, 0] y_pred.append(pred_np.reshape(-1, 1)) print( metrics.classification_report( np.concatenate(y_true, axis=0), np.concatenate(y_pred, axis=0))) else: validation_loss, acc = evaluate(valid_loader, net) summary_writer.add_scalar('Validation Accuracy', acc, global_step=epoch) summary_writer.add_scalar('Validation Loss', validation_loss, global_step=epoch) if config.task == "seg": model_recorder.add( epoch, net, dict(acc=validation_acc, iou=val_ious, avg_acc=avg_per_class_acc)) else: model_recorder.add(epoch, net, dict(acc=acc)) 
model_recorder.print_curr_stat() print('\nTrain Finished: {}'.format( time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())))
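The epoch loop in train_process implements its own step decay: the base learning rate is multiplied by config.train.decay_rate every 10 epochs, clamped at config.train.learning_rate_min, and then written into every optimizer param group before being logged to TensorBoard. A minimal sketch of that schedule in isolation (the numeric defaults below are placeholders, not the project's config values):

import math
from torch import nn, optim

def step_decay_lr(epoch, base_lr=0.01, decay_rate=0.7, min_lr=1e-5):
    # Multiply the base LR by decay_rate every 10 epochs, never going below min_lr.
    return max(base_lr * math.pow(decay_rate, epoch // 10), min_lr)

model = nn.Linear(8, 2)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
for epoch in range(30):
    lr = step_decay_lr(epoch)
    for g in optimizer.param_groups:  # same pattern as in train_process
        g['lr'] = lr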
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu # num_train_we_use = 800 # num_val_we_use = 800 train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use] val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use] train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) val_idx = [] for i in range(num_val_we_use): for j in range(sequence_length): val_idx.append(val_we_use_start_idx[i] + j) num_train_all = len(train_idx) num_val_all = len(val_idx) print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) print('num of train dataset: {:6d}'.format(num_train)) print('num of train we use : {:6d}'.format(num_train_we_use)) print('num of all train use: {:6d}'.format(num_train_all)) print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) print('num of valid dataset: {:6d}'.format(num_val)) print('num of valid we use : {:6d}'.format(num_val_we_use)) print('num of all valid use: {:6d}'.format(num_val_all)) train_loader = DataLoader( train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False ) val_loader = DataLoader( val_dataset, batch_size=val_batch_size, sampler=val_idx, num_workers=workers, pin_memory=False ) model = multi_lstm() model = DataParallel(model) model.load_state_dict(torch.load( 'cnn_lstm_epoch_25_length_4_opt_1_mulopt_1_flip_0_crop_1_batch_200_train1_9998_train2_9987_val1_9731_val2_8752.pth')) kl_fc_p2t = nn.Linear(7, 7) kl_fc_t2p = nn.Linear(7, 7) # fix 前面网络层,学习两个矩阵 for param in model.module.parameters(): param.requires_grad = False for param in kl_fc_p2t.parameters(): param.requires_grad = True for param in kl_fc_t2p.parameters(): param.requires_grad = True if use_gpu: model = model.cuda() kl_fc_p2t = kl_fc_p2t.cuda() kl_fc_t2p = kl_fc_t2p.cuda() criterion_1 = nn.BCEWithLogitsLoss(size_average=False) criterion_2 = nn.CrossEntropyLoss(size_average=False) sigmoid = nn.Sigmoid() if use_gpu: sigmoid = sigmoid.cuda() if optimizer_choice == 0: optimizer = optim.SGD([{'params': kl_fc_p2t.parameters()}, {'params': kl_fc_t2p.parameters()}], lr=learning_rate, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([{'params': kl_fc_p2t.parameters()}, {'params': kl_fc_t2p.parameters()}], lr=learning_rate) best_val_accuracy_1 = 0.0 best_val_accuracy_2 = 0.0 correspond_train_acc_1 = 0.0 correspond_train_acc_2 = 0.0 kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy()) kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy()) record_np = np.zeros([epochs, 8]) for epoch in range(epochs): np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): 
train_idx.append(train_we_use_start_idx[i] + j) train_loader = DataLoader( train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False ) # train train_loss_1 = 0.0 train_loss_2 = 0.0 train_corrects_1 = 0 train_corrects_2 = 0 train_start_time = time.time() for data in train_loader: inputs, labels_1, labels_2 = data if use_gpu: inputs = Variable(inputs.cuda()) labels_1 = Variable(labels_1.cuda()) labels_2 = Variable(labels_2.cuda()) else: inputs = Variable(inputs) labels_1 = Variable(labels_1) labels_2 = Variable(labels_2) optimizer.zero_grad() outputs_1, outputs_2 = model.forward(inputs) kl_output_1 = kl_fc_t2p(outputs_1) kl_output_2 = kl_fc_p2t(outputs_2) outputs_1 = (kl_output_2 + outputs_1) / 2 outputs_2 = (kl_output_1 + outputs_2) / 2 _, preds_2 = torch.max(outputs_2.data, 1) # 统计tool正确个数 sig_out = sigmoid(outputs_1.data) if use_gpu: preds_1 = torch.cuda.ByteTensor(sig_out > 0.5) else: preds_1 = torch.ByteTensor(sig_out > 0.5) preds_1 = preds_1.long() train_corrects_1 += torch.sum(preds_1 == labels_1.data) labels_1 = Variable(labels_1.data.float()) loss_1 = criterion_1(outputs_1, labels_1) loss_2 = criterion_2(outputs_2, labels_2) loss = loss_1 + loss_2 loss.backward() optimizer.step() train_loss_1 += loss_1.data[0] train_loss_2 += loss_2.data[0] train_corrects_2 += torch.sum(preds_2 == labels_2.data) train_elapsed_time = time.time() - train_start_time train_accuracy_1 = train_corrects_1 / num_train_all / 7 train_accuracy_2 = train_corrects_2 / num_train_all train_average_loss_1 = train_loss_1 / num_train_all / 7 train_average_loss_2 = train_loss_2 / num_train_all # begin eval val_loss_1 = 0.0 val_loss_2 = 0.0 val_corrects_1 = 0 val_corrects_2 = 0 val_start_time = time.time() for data in val_loader: inputs, labels_1, labels_2 = data labels_2 = labels_2[(sequence_length - 1):: sequence_length] if use_gpu: inputs = Variable(inputs.cuda(), volatile=True) labels_1 = Variable(labels_1.cuda(), volatile=True) labels_2 = Variable(labels_2.cuda(), volatile=True) else: inputs = Variable(inputs, volatile=True) labels_1 = Variable(labels_1, volatile=True) labels_2 = Variable(labels_2, volatile=True) outputs_1, outputs_2 = model.forward(inputs) kl_output_1 = kl_fc_t2p(outputs_1) kl_output_2 = kl_fc_p2t(outputs_2) outputs_1 = (kl_output_2 + outputs_1) / 2 outputs_2 = (kl_output_1 + outputs_2) / 2 outputs_2 = outputs_2[sequence_length - 1::sequence_length] _, preds_2 = torch.max(outputs_2.data, 1) sig_out = sigmoid(outputs_1.data) if use_gpu: preds_1 = torch.cuda.ByteTensor(sig_out > 0.5) else: preds_1 = torch.ByteTensor(sig_out > 0.5) preds_1 = preds_1.long() val_corrects_1 += torch.sum(preds_1 == labels_1.data) labels_1 = Variable(labels_1.data.float()) loss_1 = criterion_1(outputs_1, labels_1) loss_2 = criterion_2(outputs_2, labels_2) val_loss_1 += loss_1.data[0] val_loss_2 += loss_2.data[0] val_corrects_2 += torch.sum(preds_2 == labels_2.data) val_elapsed_time = time.time() - val_start_time val_accuracy_1 = val_corrects_1 / (num_val_all * 7) val_accuracy_2 = val_corrects_2 / num_val_we_use val_average_loss_1 = val_loss_1 / (num_val_all * 7) val_average_loss_2 = val_loss_2 / num_val_we_use print('epoch: {:4d}' ' train time: {:2.0f}m{:2.0f}s' ' train accu_1: {:.4f}' ' train accu_2: {:.4f}' ' train loss_1: {:4.4f}' ' train loss_2: {:4.4f}' .format(epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_accuracy_1, train_accuracy_2, train_average_loss_1, train_average_loss_2)) print('epoch: {:4d}' ' valid time: {:2.0f}m{:2.0f}s' ' valid accu_1: {:.4f}' 
' valid accu_2: {:.4f}' ' valid loss_1: {:4.4f}' ' valid loss_2: {:4.4f}' .format(epoch, val_elapsed_time // 60, val_elapsed_time % 60, val_accuracy_1, val_accuracy_2, val_average_loss_1, val_average_loss_2)) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2) if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95: best_val_accuracy_2 = val_accuracy_2 best_val_accuracy_1 = val_accuracy_1 correspond_train_acc_1 = train_accuracy_1 correspond_train_acc_2 = train_accuracy_2 kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy()) kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy()) elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95: if val_accuracy_1 > best_val_accuracy_1: correspond_train_acc_1 = train_accuracy_1 correspond_train_acc_2 = train_accuracy_2 kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy()) kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy()) elif val_accuracy_1 == best_val_accuracy_1: if train_accuracy_2 > correspond_train_acc_2: correspond_train_acc_2 = train_accuracy_2 correspond_train_acc_1 = train_accuracy_1 kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy()) kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy()) elif train_accuracy_2 == correspond_train_acc_2: if train_accuracy_1 > best_val_accuracy_1: correspond_train_acc_1 = train_accuracy_1 kl_fc_t2p_np = copy.deepcopy(kl_fc_t2p.weight.data.cpu().numpy()) kl_fc_p2t_np = copy.deepcopy(kl_fc_p2t.weight.data.cpu().numpy()) record_np[epoch, 0] = train_accuracy_1 record_np[epoch, 1] = train_accuracy_2 record_np[epoch, 2] = train_average_loss_1 record_np[epoch, 3] = train_average_loss_2 record_np[epoch, 4] = val_accuracy_1 record_np[epoch, 5] = val_accuracy_2 record_np[epoch, 6] = val_average_loss_1 record_np[epoch, 7] = val_average_loss_2 print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format(best_val_accuracy_1, correspond_train_acc_1)) print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format(best_val_accuracy_2, correspond_train_acc_2)) save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000)) save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000)) save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000)) save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000)) public_name = "train_klave" \ + "_epoch_" + str(epochs) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train1_" + str(save_train_1) \ + "_train2_" + str(save_train_2) \ + "_val1_" + str(save_val_1) \ + "_val2_" + str(save_val_2) record_name = public_name + ".npy" np.save(record_name, record_np) np.save('fc_p2t', kl_fc_p2t_np) np.save('fc_t2p', kl_fc_t2p_np)
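This train_model builds its mini-batches by handing a flat index list to DataLoader as the sampler: every "useful" start index contributes sequence_length consecutive frame indices, and the start indices are reshuffled each epoch before the list is rebuilt. A small sketch of that expansion step (expand_windows is a hypothetical helper for illustration, not a function from the scripts):

def expand_windows(start_indices, sequence_length):
    # Turn window start positions into consecutive frame indices,
    # sequence_length frames per window, kept in window order.
    idx = []
    for start in start_indices:
        idx.extend(range(start, start + sequence_length))
    return idx

# Two windows of length 4 starting at frames 0 and 10:
print(expand_windows([0, 10], 4))  # [0, 1, 2, 3, 10, 11, 12, 13]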
start_point += args.stride if start_point < len(tokens): samples.append(tokens[len(tokens) - n_ctx:]) random.shuffle(samples) # prepare data for step in range(len(samples) // args.batch_size): batch = samples[step * args.batch_size:(step + 1) * args.batch_size] batch_inputs = [] for ids in batch: int_ids = [int(x) for x in ids] batch_inputs.append(int_ids) batch_inputs = torch.tensor(batch_inputs).long().cuda() # forward pass outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs) loss, logits = outputs[:2] # get loss #if multi_gpu: # loss = loss.mean() if args.gradient_accumulation > 1: loss = loss / args.gradient_accumulation # loss backward loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) # optimizer step if (overall_step + 1) % args.gradient_accumulation == 0:
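The fragment above chunks each tokenized piece into fixed-length training samples: windows of n_ctx tokens advanced by args.stride, plus one final window over the last n_ctx tokens so the tail of the piece is not dropped. A self-contained sketch of that chunking, assuming n_ctx and stride are plain integers:

def chunk_tokens(tokens, n_ctx, stride):
    # Overlapping windows of n_ctx tokens, advanced by stride each iteration.
    samples = []
    start_point = 0
    while start_point < len(tokens) - n_ctx:
        samples.append(tokens[start_point:start_point + n_ctx])
        start_point += stride
    # Keep a final window over the trailing tokens, as the loop above does.
    if start_point < len(tokens) and len(tokens) >= n_ctx:
        samples.append(tokens[-n_ctx:])
    return samples

print(len(chunk_tokens(list(range(1000)), n_ctx=128, stride=100)))  # 10 windows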
def main(): if raw: print('building files') build_files(data_path=raw_data_path) print('files built') model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel( config=model_config) model.to(device) num_parameters = 0 parameters = model.parameters() for parameter in parameters: num_parameters += parameter.numel() print('number of parameters: {}'.format(num_parameters)) multi_gpu = False full_len = 0 print('calculating total steps') for i in tqdm(range(num_pieces)): with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f: full_len += len([int(item) for item in f.read().strip().split()]) total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation) print('total steps = {}'.format(total_steps)) optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True) scheduler = pytorch_transformers.WarmupLinearSchedule( optimizer, warmup_steps=warmup_steps, t_total=total_steps) if fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = DataParallel(model) multi_gpu = True print('starting training') for epoch in range(epochs): print('epoch {}'.format(epoch + 1)) now = datetime.now() print('time: {}'.format(now)) x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32) random.shuffle(x) piece_num = 0 for i in x: running_loss = 0 with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f: line = f.read().strip() tokens = line.split() tokens = [int(token) for token in tokens] start_point = 0 samples = [] while start_point < len(tokens) - n_ctx: samples.append(tokens[start_point:start_point + n_ctx]) start_point += stride random.shuffle(samples) for step in range(len(samples) // batch_size): # prepare data batch = samples[step * batch_size:(step + 1) * batch_size] batch_labels = [] batch_inputs = [] for ids in batch: int_ids_for_labels = [int(x) for x in ids] int_ids_for_inputs = [int(x) for x in ids] batch_labels.append(int_ids_for_labels) batch_inputs.append(int_ids_for_inputs) batch_labels = torch.tensor(batch_labels).long().to(device) batch_inputs = torch.tensor(batch_inputs).long().to(device) # forward pass outputs = model.forward(input_ids=batch_inputs, labels=batch_labels) loss, logits = outputs[:2] # get loss if multi_gpu: loss = loss.mean() if gradient_accumulation > 1: loss = loss / gradient_accumulation # loss backward if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # optimizer step if (step + 1) % gradient_accumulation == 0: running_loss += loss.item() scheduler.step() optimizer.step() optimizer.zero_grad() if (step + 1) % log_step == 0: print('step {} of piece {} of epoch {}, loss {}'.format( (step + 1) // gradient_accumulation, piece_num, epoch + 1, running_loss * gradient_accumulation / log_step)) running_loss = 0 piece_num += 1 print('saving model for epoch {}'.format(epoch + 1)) if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)): os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1)) model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir + 
'model_epoch{}'.format(epoch + 1)) # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1)) # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1)) print('epoch {} finished'.format(epoch + 1)) then = datetime.now() print('time: {}'.format(then)) print('time for one epoch: {}'.format(then - now)) print('training finished') if not os.path.exists(output_dir + 'final_model'): os.mkdir(output_dir + 'final_model') model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir + 'final_model')
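Both GPT-2 training loops use the same gradient-accumulation pattern: the per-batch loss is divided by gradient_accumulation, gradients are clipped every batch, and optimizer.step() / zero_grad() run only every gradient_accumulation batches, which emulates a larger effective batch size. A minimal standalone sketch with a toy model (the model and data here are placeholders, not the GPT-2 setup):

import torch
from torch import nn, optim

model = nn.Linear(16, 4)
optimizer = optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
gradient_accumulation, max_grad_norm = 4, 1.0

optimizer.zero_grad()
for step in range(32):
    inputs = torch.randn(8, 16)
    targets = torch.randint(0, 4, (8,))
    loss = criterion(model(inputs), targets)
    # Scale so the accumulated gradient matches one large batch.
    (loss / gradient_accumulation).backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    if (step + 1) % gradient_accumulation == 0:
        optimizer.step()
        optimizer.zero_grad()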
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu # num_train_we_use = 800 # num_val_we_use = 800 train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use] val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use] train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) val_idx = [] for i in range(num_val_we_use): for j in range(sequence_length): val_idx.append(val_we_use_start_idx[i] + j) num_train_all = len(train_idx) num_val_all = len(val_idx) print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) print('num of train dataset: {:6d}'.format(num_train)) print('num of train we use : {:6d}'.format(num_train_we_use)) print('num of all train use: {:6d}'.format(num_train_all)) print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) print('num of valid dataset: {:6d}'.format(num_val)) print('num of valid we use : {:6d}'.format(num_val_we_use)) print('num of all valid use: {:6d}'.format(num_val_all)) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) val_loader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=val_idx, num_workers=workers, pin_memory=False) model = multi_lstm() model = DataParallel(model) model.load_state_dict( torch.load( 'cnn_lstm_epoch_25_length_4_opt_1_mulopt_1_flip_0_crop_1_batch_200_train1_9998_train2_9987_val1_9731_val2_8752.pth' )) kl_fc_p2t = nn.Linear(7, 7) kl_fc_t2p = nn.Linear(7, 7) all_phase_to_tool = np.load('kl_fc_p2t.npy') all_tool_to_phase = np.load('kl_fc_t2p.npy') kl_fc_p2t.weight.data = torch.from_numpy( all_phase_to_tool.astype('float32')) kl_fc_t2p.weight.data = torch.from_numpy( all_tool_to_phase.astype('float32')) for param in kl_fc_p2t.parameters(): param.requires_grad = True for param in kl_fc_t2p.parameters(): param.requires_grad = True if use_gpu: model = model.cuda() kl_fc_p2t = kl_fc_p2t.cuda() kl_fc_t2p = kl_fc_t2p.cuda() criterion_1 = nn.BCEWithLogitsLoss(size_average=False) criterion_2 = nn.CrossEntropyLoss(size_average=False) criterion_3 = nn.KLDivLoss(size_average=False) softmax_cuda = nn.Softmax().cuda() sigmoid_cuda = nn.Sigmoid().cuda() if multi_optim == 0: if optimizer_choice == 0: optimizer = optim.SGD([{ 'params': model.module.parameters() }, { 'params': kl_fc_p2t.parameters() }, { 'params': kl_fc_t2p.parameters() }], lr=learning_rate, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_step, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([[{ 'params': model.module.parameters() }, { 'params': kl_fc_p2t.parameters() }, { 'params': kl_fc_t2p.parameters() }]], lr=learning_rate) elif multi_optim == 1: if optimizer_choice == 0: optimizer = optim.SGD([ { 'params': model.module.share.parameters() }, { 
'params': kl_fc_p2t.parameters() }, { 'params': kl_fc_t2p.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_step, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([ { 'params': model.module.share.parameters() }, { 'params': kl_fc_p2t.parameters() }, { 'params': kl_fc_t2p.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10) best_model_wts = copy.deepcopy(model.state_dict()) best_val_accuracy_1 = 0.0 best_val_accuracy_2 = 0.0 # judge by accu2 correspond_train_acc_1 = 0.0 correspond_train_acc_2 = 0.0 # 要存储2个train的准确率 2个valid的准确率 4个train 4个loss的loss, 一共12个数据要记录 record_np = np.zeros([epochs, 12]) for epoch in range(epochs): # np.random.seed(epoch) np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) model.train() train_loss_1 = 0.0 train_loss_2 = 0.0 train_loss_3 = 0.0 train_loss_4 = 0.0 train_corrects_1 = 0 train_corrects_2 = 0 train_start_time = time.time() for data in train_loader: inputs, labels_1, labels_2 = data if use_gpu: inputs = Variable(inputs.cuda()) labels_1 = Variable(labels_1.cuda()) labels_2 = Variable(labels_2.cuda()) else: inputs = Variable(inputs) labels_1 = Variable(labels_1) labels_2 = Variable(labels_2) optimizer.zero_grad() outputs_1, outputs_2 = model.forward(inputs) _, preds_2 = torch.max(outputs_2.data, 1) sig_out = outputs_1.data sig_out = sigmoid_cuda(sig_out) preds_1 = torch.cuda.ByteTensor(sig_out > 0.5) preds_1 = preds_1.long() train_corrects_1 += torch.sum(preds_1 == labels_1.data) labels_1 = Variable(labels_1.data.float()) loss_1 = criterion_1(outputs_1, labels_1) loss_2 = criterion_2(outputs_2, labels_2) sig_output_1 = sigmoid_cuda(outputs_1) soft_output_2 = softmax_cuda(outputs_2) sig_output_1 = Variable(sig_output_1.data, requires_grad=False) soft_output_2 = Variable(soft_output_2.data, requires_grad=False) kl_output_1 = kl_fc_t2p(sig_output_1) kl_output_2 = kl_fc_p2t(soft_output_2) loss_3 = torch.abs(criterion_3(kl_output_1, soft_output_2)) loss_4 = torch.abs(criterion_3(kl_output_2, sig_output_1)) loss = loss_1 + loss_2 + loss_3 + loss_4 loss.backward() optimizer.step() train_loss_1 += loss_1.data[0] train_loss_2 += loss_2.data[0] train_loss_3 += loss_3.data[0] train_loss_4 += loss_4.data[0] train_corrects_2 += torch.sum(preds_2 == labels_2.data) train_elapsed_time = time.time() - train_start_time train_accuracy_1 = train_corrects_1 / num_train_all / 7 train_accuracy_2 = train_corrects_2 / num_train_all train_average_loss_1 = train_loss_1 / num_train_all / 7 train_average_loss_2 = train_loss_2 / num_train_all train_average_loss_3 = train_loss_3 / num_train_all train_average_loss_4 = train_loss_4 / num_train_all # begin eval model.eval() val_loss_1 = 0.0 val_loss_2 = 0.0 val_loss_3 = 0.0 val_loss_4 = 0.0 val_corrects_1 = 0 val_corrects_2 = 0 val_start_time = time.time() for data in 
val_loader: inputs, labels_1, labels_2 = data labels_2 = labels_2[(sequence_length - 1)::sequence_length] if use_gpu: inputs = Variable(inputs.cuda(), volatile=True) labels_1 = Variable(labels_1.cuda(), volatile=True) labels_2 = Variable(labels_2.cuda(), volatile=True) else: inputs = Variable(inputs, volatile=True) labels_1 = Variable(labels_1, volatile=True) labels_2 = Variable(labels_2, volatile=True) if crop_type == 0 or crop_type == 1: outputs_1, outputs_2 = model.forward(inputs) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs_1, outputs_2 = model.forward(inputs) outputs_1 = outputs_1.view(5, -1, 7) outputs_1 = torch.mean(outputs_1, 0) outputs_2 = outputs_2.view(5, -1, 7) outputs_2 = torch.mean(outputs_2, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs_1, outputs_2 = model.forward(inputs) outputs_1 = outputs_1.view(10, -1, 7) outputs_1 = torch.mean(outputs_1, 0) outputs_2 = outputs_2.view(10, -1, 7) outputs_2 = torch.mean(outputs_2, 0) sig_output_1 = sigmoid_cuda(outputs_1) soft_output_2 = softmax_cuda(outputs_2) kl_output_1 = (kl_fc_t2p(sig_output_1)) kl_output_2 = (kl_fc_p2t(soft_output_2)) sig_output_1 = Variable(sig_output_1.data, requires_grad=False) soft_output_2 = Variable(soft_output_2.data, requires_grad=False) outputs_1 = outputs_1 + kl_output_2 outputs_2 = outputs_2 + kl_output_1 outputs_2 = outputs_2[sequence_length - 1::sequence_length] _, preds_2 = torch.max(outputs_2.data, 1) sig_out = outputs_1.data sig_out = sigmoid_cuda(sig_out) preds_1 = torch.cuda.ByteTensor(sig_out > 0.5) preds_1 = preds_1.long() val_corrects_1 += torch.sum(preds_1 == labels_1.data) labels_1 = Variable(labels_1.data.float()) loss_1 = criterion_1(outputs_1, labels_1) val_loss_1 += loss_1.data[0] loss_2 = criterion_2(outputs_2, labels_2) val_corrects_2 += torch.sum(preds_2 == labels_2.data) val_loss_2 += loss_2.data[0] loss_3 = torch.abs(criterion_3(kl_output_1, soft_output_2)) loss_4 = torch.abs(criterion_3(kl_output_2, sig_output_1)) val_loss_3 += loss_3.data[0] val_loss_4 += loss_4.data[0] val_elapsed_time = time.time() - val_start_time val_accuracy_1 = val_corrects_1 / (num_val_all * 7) val_accuracy_2 = val_corrects_2 / num_val_we_use val_average_loss_1 = val_loss_1 / (num_val_all * 7) val_average_loss_2 = val_loss_2 / num_val_we_use val_average_loss_3 = val_loss_3 / num_val_all val_average_loss_4 = val_loss_4 / num_val_all print('epoch: {:3d}' ' train time: {:2.0f}m{:2.0f}s' ' train accu_1: {:.4f}' ' train accu_2: {:.4f}' ' train loss_1: {:4.4f}' ' train loss_2: {:4.4f}' ' train loss_3: {:4.4f}' ' train loss_3: {:4.4f}'.format( epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_accuracy_1, train_accuracy_2, train_average_loss_1, train_average_loss_2, train_average_loss_3, train_average_loss_4)) print('epoch: {:3d}' ' valid time: {:2.0f}m{:2.0f}s' ' valid accu_1: {:.4f}' ' valid accu_2: {:.4f}' ' valid loss_1: {:4.4f}' ' valid loss_2: {:4.4f}' ' valid loss_3: {:4.4f}' ' valid loss_4: {:4.4f}'.format( epoch, val_elapsed_time // 60, val_elapsed_time % 60, val_accuracy_1, val_accuracy_2, val_average_loss_1, val_average_loss_2, val_average_loss_3, val_average_loss_4)) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2) if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95: best_val_accuracy_2 = val_accuracy_2 
best_val_accuracy_1 = val_accuracy_1 correspond_train_acc_1 = train_accuracy_1 correspond_train_acc_2 = train_accuracy_2 best_model_wts = copy.deepcopy(model.state_dict()) elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95: if val_accuracy_1 > best_val_accuracy_1: correspond_train_acc_1 = train_accuracy_1 correspond_train_acc_2 = train_accuracy_2 best_model_wts = copy.deepcopy(model.state_dict()) elif val_accuracy_1 == best_val_accuracy_1: if train_accuracy_2 > correspond_train_acc_2: correspond_train_acc_2 = train_accuracy_2 correspond_train_acc_1 = train_accuracy_1 best_model_wts = copy.deepcopy(model.state_dict()) elif train_accuracy_2 == correspond_train_acc_2: if train_accuracy_1 > best_val_accuracy_1: correspond_train_acc_1 = train_accuracy_1 best_model_wts = copy.deepcopy(model.state_dict()) record_np[epoch, 0] = train_accuracy_1 record_np[epoch, 1] = train_accuracy_2 record_np[epoch, 2] = train_average_loss_1 record_np[epoch, 3] = train_average_loss_2 record_np[epoch, 4] = train_average_loss_3 record_np[epoch, 5] = train_average_loss_4 record_np[epoch, 6] = val_accuracy_1 record_np[epoch, 7] = val_accuracy_2 record_np[epoch, 8] = val_average_loss_1 record_np[epoch, 9] = val_average_loss_2 record_np[epoch, 10] = val_average_loss_3 record_np[epoch, 11] = val_average_loss_4 print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format( best_val_accuracy_1, correspond_train_acc_1)) print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format( best_val_accuracy_2, correspond_train_acc_2)) save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000)) save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000)) save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000)) save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000)) public_name = "cnn_lstm_klave" \ + "_epoch_" + str(epochs) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train1_" + str(save_train_1) \ + "_train2_" + str(save_train_2) \ + "_val1_" + str(save_val_1) \ + "_val2_" + str(save_val_2) model_name = public_name + ".pth" torch.save(best_model_wts, model_name) record_name = public_name + ".npy" np.save(record_name, record_np) kl_fc_p2t_name = public_name + "p2t.npy" kl_fc_t2p_name = public_name + "t2p.npy" kl_fc_p2t_np = kl_fc_p2t.cpu().weight.data.numpy() np.save(kl_fc_p2t_name, kl_fc_p2t_np) kl_fc_t2p_np = kl_fc_t2p.cpu().weight.data.numpy() np.save(kl_fc_t2p_name, kl_fc_t2p_np)
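The consistency terms loss_3 and loss_4 in this variant map each head's probabilities into the other head's space through the 7x7 transfer layers and penalize the divergence in both directions; both probability tensors are detached, so these two terms only update the transfer matrices. A simplified standalone sketch of how the terms are assembled (reduction='sum' stands in for the deprecated size_average=False; note that nn.KLDivLoss formally expects log-probabilities as its first argument, which is presumably why the script wraps each term in torch.abs):

import torch
from torch import nn
import torch.nn.functional as F

num_classes = 7
kl_fc_t2p = nn.Linear(num_classes, num_classes)  # tool -> phase transfer matrix
kl_fc_p2t = nn.Linear(num_classes, num_classes)  # phase -> tool transfer matrix
criterion_kl = nn.KLDivLoss(reduction='sum')

tool_logits = torch.randn(32, num_classes)   # multi-label tool head
phase_logits = torch.randn(32, num_classes)  # single-label phase head

# Detach both probability tensors, as the training loop does, so the
# consistency terms only push the two transfer layers toward agreement.
sig_tool = torch.sigmoid(tool_logits).detach()
soft_phase = F.softmax(phase_logits, dim=1).detach()

loss_t2p = torch.abs(criterion_kl(kl_fc_t2p(sig_tool), soft_phase))
loss_p2t = torch.abs(criterion_kl(kl_fc_p2t(soft_phase), sig_tool))
consistency_loss = loss_t2p + loss_p2t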
class TrainLoop_GPT2(): def __init__(self, args, logger): self.args = args self.logger = logger self.args.device = 'cuda:{}'.format( self.args.gpu) if self.args.use_cuda else 'cpu' self.logger.info('using device:{}'.format(self.args.device)) self.opt = vars(self.args) self.batch_size = self.opt['batch_size'] self.use_cuda = self.opt['use_cuda'] self.device = self.args.device self.multi_gpu = self.args.use_multi_gpu # self.movie_ids = pickle.load(open("data/movie_ids.pickle", "rb")) self.build_data() self.build_model() def build_data(self): self.tokenizer = BertTokenizer(vocab_file=self.args.vocab_path) self.vocab_size = len(self.tokenizer) self.pad_id = self.tokenizer.convert_tokens_to_ids('[PAD]') # 对原始数据进行预处理,将原始语料转换成对应的token_id if self.args.raw: for subset in ['train', 'valid', 'test']: self.preprocess_raw_data(subset) # 加载tokenized data self.subset2data = {} with open(self.args.test_tokenized_path, "r", encoding="utf8") as f: self.subset2data['test'] = f.read() if not self.args.do_eval: with open(self.args.train_tokenized_path, "r", encoding="utf8") as f: self.subset2data['train'] = f.read() with open(self.args.valid_tokenized_path, "r", encoding="utf8") as f: self.subset2data['valid'] = f.read() # 这一步是干啥的 for subset in self.subset2data: self.subset2data[subset] = self.subset2data[subset].split("\n") self.logger.info("Train/Valid/Test set has {} convs".format( [len(self.subset2data[subset]) for subset in self.subset2data])) def build_model(self): """ :param args: :param vocab_size:字典大小 :return: """ if self.args.pretrained_model: # 如果指定了预训练的GPT2模型 self.model = GPT2LMHeadModel.from_pretrained( self.args.pretrained_model) else: # 若没有指定预训练模型,则初始化模型 model_config = transformers.modeling_gpt2.GPT2Config.from_json_file( self.args.model_config) self.model = GPT2LMHeadModel(config=model_config) # 根据tokenizer的vocabulary调整GPT2模型的voca的大小 self.model.resize_token_embeddings(self.vocab_size) if self.use_cuda: self.model.to(self.device) self.logger.info('model config:\n{}'.format( self.model.config.to_json_string())) self.n_ctx = self.model.config.to_dict().get("n_ctx") # 建立模型存储路径 if self.args.is_model_output and not os.path.exists( self.args.dialogue_model_output_path): os.mkdir(self.args.dialogue_model_output_path) # 记录模型参数数量 num_parameters = 0 parameters = self.model.parameters() for parameter in parameters: num_parameters += parameter.numel() self.logger.info( 'number of model parameters: {}'.format(num_parameters)) # 是否使用多块GPU进行并行运算 if self.args.use_multi_gpu: if self.args.use_cuda and torch.cuda.device_count() > 1: self.logger.info("Let's use GPUs to train") self.model = DataParallel( self.model, device_ids=[int(i) for i in self.args.device.split(',')]) else: self.args.use_multi_gpu = False def train(self): train_dataset = GPT2Dataset(self.subset2data['train']) train_dataloader = DataLoader(train_dataset, batch_size=self.args.batch_size, shuffle=True, num_workers=self.args.num_workers, collate_fn=self.collate_fn) # 计算所有epoch进行参数优化的总步数total_steps self.total_steps = int(train_dataset.__len__() * self.args.epochs / self.args.batch_size / self.args.gradient_accumulation) self.logger.info('total training steps = {}'.format(self.total_steps)) self.init_optim() self.logger.info('starting training') # 用于统计每次梯度累计的loss running_loss = 0 # 统计一共训练了多少个step overall_step = 0 # 记录tensorboardX # tb_writer = SummaryWriter(log_dir=self.args.writer_dir) # 记录 out of memory的次数 oom_time = 0 # patience patience = 0 max_patience = 2 best_test_loss = 10000 # 开始训练 for epoch in range(self.args.epochs): epoch_start_time = 
datetime.now() train_loss = [] # 记录一个epoch里面的train loss for batch_idx, (input_ids, mask_r) in enumerate(train_dataloader): # 注意:GPT2模型的forward()函数,是对于给定的context,生成一个token,而不是生成一串token # GPT2Model的输入为n个token_id时,输出也是n个hidden_state,使用第n个hidden_state预测第n+1个token # self.logger.info(input_ids == mask_r) # self.logger.info(input_ids) # self.logger.info(mask_r) # for context in input_ids: # print(tokenizer.convert_ids_to_tokens(int(id) for id in context)) # ipdb.set_trace() self.model.train() input_ids = input_ids.to(self.device) # 解决在运行过程中,由于显存不足产生的cuda out of memory的问题 try: outputs = self.model.forward(input_ids=input_ids) loss, accuracy = self.calculate_loss_and_accuracy( outputs, input_ids, mask_r, device=self.device) train_loss.append(loss.item()) if self.multi_gpu: loss = loss.mean() accuracy = accuracy.mean() if self.args.gradient_accumulation > 1: loss = loss / self.args.gradient_accumulation accuracy = accuracy / self.args.gradient_accumulation loss.backward() # 梯度裁剪解决的是梯度消失或爆炸的问题,即设定阈值 torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm) # 进行一定step的梯度累计之后,更新参数 if (batch_idx + 1) % self.args.gradient_accumulation == 0: running_loss += loss.item() self.optimizer.step() self.optimizer.zero_grad() self.scheduler.step() overall_step += 1 # 更新日志与tnesorboardX信息 if (overall_step + 1) % self.args.log_step == 0: self.logger.info( "batch {} of epoch {}, loss {:.4f}, ppl {:.5f}" .format(batch_idx + 1, epoch + 1, loss, exp(loss))) # tb_writer.add_scalar('loss', loss.item(), overall_step) except RuntimeError as exception: if "out of memory" in str(exception): oom_time += 1 self.logger.info( "WARNING: ran out of memory,times: {}".format( oom_time)) if hasattr(torch.cuda, 'empty_cache'): torch.cuda.empty_cache() else: self.logger.info(str(exception)) raise exception train_loss = sum(train_loss) / len(train_loss) epoch_finish_time = datetime.now() self.logger.info( 'epoch {}, train loss is {:.4f}, ppl is {:.5f}, spend {} time'. 
format(epoch + 1, train_loss, exp(train_loss), epoch_finish_time - epoch_start_time)) # val # test_loss = val(model, device, test_list, multi_gpu, self.args) test_loss = self.val('valid') if test_loss <= best_test_loss: patience = 0 best_test_loss = test_loss self.logger.info('saving model for epoch {}'.format(epoch + 1)) model_path = join(self.args.dialogue_model_output_path, 'model') if not os.path.exists(model_path): os.mkdir(model_path) # 这里是什么意思,还不是很懂 model_to_save = self.model.module if hasattr( self.model, 'module') else self.model model_to_save.save_pretrained(model_path) self.logger.info("save model to " + str(model_path)) else: patience += 1 self.logger.info('Patience = ' + str(patience)) if patience >= max_patience: break test_loss = self.val('test') # self.logger.info('training finished') def val(self, subset): # self.logger.info("start evaluating model") self.model.eval() # self.logger.info('starting evaluating') # 记录tensorboardX # tb_writer = SummaryWriter(log_dir=self.args.writer_dir) test_dataset = GPT2Dataset(self.subset2data[subset]) test_dataloader = DataLoader(test_dataset, batch_size=self.args.batch_size, shuffle=True, num_workers=self.args.num_workers, collate_fn=self.collate_fn) test_loss = [] # test_accuracy = [] with torch.no_grad(): for batch_idx, (input_ids, mask_r) in enumerate(test_dataloader): input_ids = input_ids.to(self.device) outputs = self.model.forward(input_ids=input_ids) loss, accuracy = self.calculate_loss_and_accuracy( outputs, input_ids, mask_r, device=self.device) test_loss.append(loss.item()) # test_accuracy.append(accuracy) if self.multi_gpu: loss = loss.mean() accuracy = accuracy.mean() if self.args.gradient_accumulation > 1: loss = loss / self.args.gradient_accumulation accuracy = accuracy / self.args.gradient_accumulation # self.logger.info("val batch {} ,loss {} ,accuracy {}".format(batch_idx, loss, accuracy)) # tb_writer.add_scalar('loss', loss.item(), overall_step) test_loss = sum(test_loss) / len(test_loss) self.logger.info("val {} loss {:.4f} , ppl {:.5f}".format( subset, test_loss, exp(test_loss))) return test_loss def generate(self): samples_file = open(self.args.save_samples_path, 'w', encoding='utf8') convs = pickle.load(open(self.args.test_path, 'rb')) for conv in tqdm(convs[:]): conv_id = conv['conv_id'] history = [] # list of id, to model for message in conv['messages']: message_id, role, content = int( message['local_id']), message['role'], message['content'] if role == 'Recommender' and message_id != 1: try: if self.args.save_samples_path: samples_file.write(f"[GroundTruth]: {content}\n") input_ids = [ self.tokenizer.cls_token_id ] + history[-self.args.max_context_len + 1:] # 每个input以[CLS]为开头 [SEP]结尾 # tensor of [input_token_num] curr_input_tensor = torch.tensor(input_ids).long().to( self.device) generated = [] # 最多生成max_len个token for _ in range(self.args.max_len): # (tensor of [input_token_nums, 13317], tuple of 10 tensor) outputs = self.model( input_ids=curr_input_tensor) #?shape? 
# tensor of [13317] next_token_logits = outputs[0][-1, :] # 对于已生成的结果generated中的每个token添加一个重复惩罚项,降低其生成概率 for id in set(generated): next_token_logits[ id] /= self.args.repetition_penalty next_token_logits = next_token_logits / self.args.temperature # 对于[UNK]的概率设为无穷小,也就是说模型的预测结果不可能是[UNK]这个token next_token_logits[ self.tokenizer.convert_tokens_to_ids( '[UNK]')] = -float('Inf') # 将topk以外的token的概率设置为-inf,然后排序,然后将accum-概率大与topp的token的概率设置为-inf filtered_logits = top_k_top_p_filtering( next_token_logits, top_k=self.args.topk, top_p=self.args.topp) # torch.multinomial表示从候选集合中无放回地进行抽取num_samples个元素,权重越高,抽到的几率越高,返回元素的下标 next_token = torch.multinomial(F.softmax( filtered_logits, dim=-1), num_samples=1) if next_token == self.tokenizer.sep_token_id: # 遇到[SEP]则表明response生成结束 break generated.append(next_token.item()) curr_input_tensor = torch.cat( (curr_input_tensor, next_token), dim=0)[-self.n_ctx:] generated_text = self.tokenizer.convert_ids_to_tokens( generated) if self.args.save_samples_path: samples_file.write("[Generated]: {}\n\n".format( "".join(generated_text))) except Exception as e: print(e) print(conv_id, message_id) print(max(input_ids)) print('\n') history.extend( self.tokenizer.encode(content) + [self.tokenizer.sep_token_id]) #? encode成了啥 samples_file.close() def calculate_loss_and_accuracy(self, outputs, labels, mask_r, device): """ 计算非self.pad_id的平均loss和准确率 :param outputs: :param labels: :param device: :return: """ logits = outputs[ 0] # 每个token用来预测下一个token的prediction_score,维度:[batch_size,token_len,voca_size] # 用前n-1个token,预测出第n个token # 用第i个token的prediction_score用来预测第i+1个token。 # 假定有input有n个token,则shift_logits表示model中第[0,n-2]个token的prediction_score,shift_labels表示第[1,n-1]的label shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous().to(device) ##################################### shift_labels给mask掉 mask_shift_labels = mask_r[..., 1:].contiguous().to(device) shift_labels = shift_labels * mask_shift_labels ####################################### loss_fct = CrossEntropyLoss( ignore_index=self.pad_id, reduction='sum') # 忽略self.pad_id的loss,并对所有的非self.pad_id的loss进行求和 loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) _, preds = shift_logits.max( dim=-1 ) # preds表示对应的prediction_score预测出的token在voca中的id。维度为[batch_size,token_len] # 对非self.pad_id的token的loss进行求平均,且计算出预测的准确率 not_ignore = shift_labels.ne( self.pad_id ) # 进行非运算,返回一个tensor,若targets_view的第i个位置为self.pad_id,则置为0,否则为1 num_targets = not_ignore.long().sum().item( ) # 计算target中的非self.pad_id的数量 correct = (shift_labels == preds) & not_ignore # 计算model预测正确的token的个数,排除pad的tokne correct = correct.float().sum() accuracy = correct / num_targets loss = loss / num_targets return loss, accuracy def preprocess_raw_data(self, subset): """ 对原始语料进行处理,将原始语料转换为用于train的token id,对于每个dialogue,将其处于成如下形式"[CLS]utterance1[SEP]utterance2[SEP]utterance3[SEP]" :param args: :param tokenizer: :param n_ctx:GPT2模型的上下文窗口大小,对于超过n_ctx(n_ctx包括了特殊字符)的dialogue进行截断 :return: """ self.logger.info( "tokenizing raw data,raw data path:{}, token output path:{}". 
format(args.train_raw_path, args.train_tokenized_path)) if subset == 'train': raw_path = self.args.train_raw_path elif subset == 'valid': raw_path = self.args.valid_raw_path elif subset == 'test': raw_path = self.args.test_raw_path with open(raw_path, 'rb') as f: data = f.read().decode("utf-8") if "\r\n" in data: train_data = data.split("\r\n\r\n") else: train_data = data.split("\n\n") self.logger.info("there are {} dialogue in raw dataset".format( len(train_data))) if subset == 'train': path = self.args.train_tokenized_path elif subset == 'valid': path = self.args.valid_tokenized_path elif subset == 'test': path = self.args.test_tokenized_path with open(path, "w", encoding="utf-8") as f: for dialogue_index, dialogue in enumerate(tqdm(train_data)): if "\r\n" in data: utterances = dialogue.split("\r\n") else: utterances = dialogue.split("\n") # dialogue_ids = [tokenizer.cls_token_id] # 每个dialogue以[CLS]开头 dialogue_ids = [] # 每个dialogue以[CLS]开头 for utterance in utterances: dialogue_ids.extend([ self.tokenizer.convert_tokens_to_ids(word) for word in utterance ]) dialogue_ids.append(self.tokenizer.sep_token_id ) # 每个utterance之后添加[SEP],表示utterance结束 # 对超过n_ctx的长度进行截断,否则GPT2模型会报错 ###############################m dialogue_ids = [self.tokenizer.cls_token_id ] + dialogue_ids[-self.n_ctx + 1:] # dialogue_ids = dialogue_ids[:n_ctx] for dialogue_id in dialogue_ids: f.write(str(dialogue_id) + ' ') # 最后一条记录不添加换行符 if dialogue_index < len(train_data) - 1: f.write("\n") self.logger.info( "finish preprocessing raw data,the result is stored in {}".format( self.args.train_tokenized_path)) def collate_fn(self, batch): """ 计算该batch中的所有sample的最长的input,并且将其他input的长度向其对齐 :param batch: :return: """ input_ids = [] mask_rs = [] btc_size = len(batch) max_input_len = 0 # 该batch中最长的input,用于该batch的数据对齐 # 计算该batch中input的最大长度 # for btc_idx in range(btc_size): # if max_input_len < len(batch[btc_idx]): # max_input_len = len(batch[btc_idx]) # 使用pad_id对小于max_input_len的input_id进行补全 # for btc_idx in range(btc_size): # input_len = len(batch[btc_idx]) # input_ids.append(batch[btc_idx]) # input_ids[btc_idx].extend([pad_id] * (max_input_len - input_len)) # 计算该batch中input的最大长度 for btc_idx, (inputs, mask_r) in enumerate(batch): if max_input_len < len(inputs): max_input_len = len(inputs) # 使用pad_id对小于max_input_len的input_id进行补全 for btc_idx, (inputs, mask_r) in enumerate(batch): assert len(inputs) == len(mask_r), f"{len(inputs)}, {len(mask_r)}" input_len = len(inputs) input_ids.append(inputs) input_ids[btc_idx].extend([self.pad_id] * (max_input_len - input_len)) mask_rs.append(mask_r) mask_rs[btc_idx].extend([self.pad_id] * (max_input_len - input_len)) # self.logger.info(torch.tensor(input_ids, dtype=torch.long).shape) # self.logger.info(torch.tensor(mask_rs, dtype=torch.long).shape) return (torch.tensor(input_ids, dtype=torch.long), torch.tensor(mask_rs, dtype=torch.long)) def vector2sentence(self, batch_sen): # 一个batch的sentence 从id换成token sentences = [] for sen in batch_sen.numpy().tolist(): sentence = [] for word in sen: if word > 3: sentence.append(self.index2word[word]) elif word == 3: sentence.append('_UNK_') sentences.append(sentence) return sentences @classmethod def optim_opts(self): """ Fetch optimizer selection. By default, collects everything in torch.optim, as well as importing: - qhm / qhmadam if installed from github.com/facebookresearch/qhoptim Override this (and probably call super()) to add your own optimizers. 
""" # first pull torch.optim in optims = { k.lower(): v for k, v in optim.__dict__.items() if not k.startswith('__') and k[0].isupper() } try: import apex.optimizers.fused_adam as fused_adam optims['fused_adam'] = fused_adam.FusedAdam except ImportError: pass try: # https://openreview.net/pdf?id=S1fUpoR5FQ from qhoptim.pyt import QHM, QHAdam optims['qhm'] = QHM optims['qhadam'] = QHAdam except ImportError: # no QHM installed pass self.logger.info(optims) return optims def init_optim(self): """ Initialize optimizer with model parameters. :param params: parameters from the model :param optim_states: optional argument providing states of optimizer to load :param saved_optim_type: type of optimizer being loaded, if changed will skip loading optimizer states """ # 设置优化器,并且在初始训练时,使用warmup策略 self.optimizer = transformers.AdamW(self.model.parameters(), lr=self.args.lr, correct_bias=True) self.scheduler = transformers.WarmupLinearSchedule( self.optimizer, warmup_steps=self.args.warmup_steps, t_total=self.total_steps) def backward(self, loss): """ Perform a backward pass. It is recommended you use this instead of loss.backward(), for integration with distributed training and FP16 training. """ loss.backward() def update_params(self): """ Perform step of optimization, clipping gradients and adjusting LR schedule if needed. Gradient accumulation is also performed if agent is called with --update-freq. It is recommended (but not forced) that you call this in train_step. """ update_freq = 1 if update_freq > 1: # we're doing gradient accumulation, so we don't only want to step # every N updates instead self._number_grad_accum = (self._number_grad_accum + 1) % update_freq if self._number_grad_accum != 0: return #0.1是不是太小了,原版就是这样 if self.opt['gradient_clip'] > 0: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.opt['gradient_clip']) self.optimizer.step() def zero_grad(self): """ Zero out optimizer. It is recommended you call this in train_step. It automatically handles gradient accumulation if agent is called with --update-freq. """ self.optimizer.zero_grad()
class DataParalleledLoss(Loss): r""" Loss class wrapper of torch.nn.DataParallel. It can be used as the original loss class. `eval` & `forward` methods support data-parallel running. Examples -------- >>> import torch >>> from torch import optim >>> from torch.nn import functional as F >>> from pixyz.distributions import Bernoulli, Normal >>> from pixyz.losses import KullbackLeibler, DataParalleledLoss >>> from pixyz.models import Model >>> used_gpu_i = set() >>> used_gpu_g = set() >>> # Set distributions (Distribution API) >>> class Inference(Normal): ... def __init__(self): ... super().__init__(var=["z"],cond_var=["x"],name="q") ... self.model_loc = torch.nn.Linear(128, 64) ... self.model_scale = torch.nn.Linear(128, 64) ... def forward(self, x): ... used_gpu_i.add(x.device.index) ... return {"loc": self.model_loc(x), "scale": F.softplus(self.model_scale(x))} >>> class Generator(Bernoulli): ... def __init__(self): ... super().__init__(var=["x"],cond_var=["z"],name="p") ... self.model = torch.nn.Linear(64, 128) ... def forward(self, z): ... used_gpu_g.add(z.device.index) ... return {"probs": torch.sigmoid(self.model(z))} >>> p = Generator() >>> q = Inference() >>> prior = Normal(loc=torch.tensor(0.), scale=torch.tensor(1.), ... var=["z"], features_shape=[64], name="p_{prior}") >>> # Define a loss function (Loss API) >>> reconst = -p.log_prob().expectation(q) >>> kl = KullbackLeibler(q,prior) >>> batch_loss_cls = (reconst - kl) >>> # device settings >>> device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") >>> device_count = torch.cuda.device_count() >>> if device_count > 1: ... loss_cls = DataParalleledLoss(batch_loss_cls).mean().to(device) ... else: ... loss_cls = batch_loss_cls.mean().to(device) >>> # Set a model (Model API) >>> model = Model(loss=loss_cls, distributions=[p, q], ... optimizer=optim.Adam, optimizer_params={"lr": 1e-3}) >>> # Train and test the model >>> data = torch.randn(2, 128).to(device) # Pseudo data >>> train_loss = model.train({"x": data}) >>> expected = set(range(device_count)) if torch.cuda.is_available() else {None} >>> assert used_gpu_i==expected >>> assert used_gpu_g==expected """ def __init__(self, loss, distributed=False, **kwargs): super().__init__(loss.input_var) if distributed: self.paralleled = DistributedDataParallel(loss, **kwargs) else: self.paralleled = DataParallel(loss, **kwargs) def forward(self, x_dict, **kwargs): return self.paralleled.forward(x_dict, **kwargs) @property def _symbol(self): return self.paralleled.module._symbol def __getattr__(self, name): try: return super().__getattr__(name) except AttributeError: return getattr(self.paralleled.module, name)
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu # num_train_we_use = 800 # num_val_we_use = 80 train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use] val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use] train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) val_idx = [] for i in range(num_val_we_use): for j in range(sequence_length): val_idx.append(val_we_use_start_idx[i] + j) num_train_all = len(train_idx) num_val_all = len(val_idx) print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) print('num of train dataset: {:6d}'.format(num_train)) print('num of train we use : {:6d}'.format(num_train_we_use)) print('num of all train use: {:6d}'.format(num_train_all)) print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) print('num of valid dataset: {:6d}'.format(num_val)) print('num of valid we use : {:6d}'.format(num_val_we_use)) print('num of all valid use: {:6d}'.format(num_val_all)) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) val_loader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=val_idx, num_workers=workers, pin_memory=False) model_old = multi_lstm() model_old = DataParallel(model_old) model_old.load_state_dict( torch.load( "cnn_lstm_epoch_25_length_10_opt_1_mulopt_1_flip_0_crop_1_batch_400_train1_9997_train2_9982_val1_9744_val2_8876.pth" )) model = multi_lstm_p2t() model.share = model_old.module.share model.lstm = model_old.module.lstm model.fc = model_old.module.fc model.fc2 = model_old.module.fc2 model = DataParallel(model) for param in model.module.fc_p2t.parameters(): param.requires_grad = False model.module.fc_p2t.load_state_dict( torch.load( "fc_epoch_25_length_4_opt_1_mulopt_1_flip_0_crop_1_batch_800_train1_9951_train2_9713_val1_9686_val2_7867_p2t.pth" )) if use_gpu: model = model.cuda() model.module.fc_p2t = model.module.fc_p2t.cuda() criterion_1 = nn.BCEWithLogitsLoss(size_average=False) criterion_2 = nn.CrossEntropyLoss(size_average=False) criterion_3 = nn.KLDivLoss(size_average=False) sigmoid_cuda = nn.Sigmoid().cuda() if multi_optim == 0: if optimizer_choice == 0: optimizer = optim.SGD([{ 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), }, { 'params': model.module.fc.parameters() }, { 'params': model.module.fc2.parameters() }], lr=learning_rate, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_step, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([{ 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), }, { 'params': model.module.fc.parameters() }, { 'params': model.module.fc2.parameters() }], lr=learning_rate) elif multi_optim == 1: if 
optimizer_choice == 0: optimizer = optim.SGD([{ 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }], lr=learning_rate / 10, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_step, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([{ 'params': model.module.share.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, { 'params': model.module.fc2.parameters(), 'lr': learning_rate }], lr=learning_rate / 10) best_model_wts = copy.deepcopy(model.state_dict()) best_val_accuracy_1 = 0.0 best_val_accuracy_2 = 0.0 correspond_train_acc_1 = 0.0 correspond_train_acc_2 = 0.0 # 要存储2个train的准确率 2个valid的准确率 3个train 3个loss的loss, 一共10个数据要记录 record_np = np.zeros([epochs, 12]) for epoch in range(epochs): # np.random.seed(epoch) np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) model.train() train_loss_1 = 0.0 train_loss_2 = 0.0 train_loss_3 = 0.0 train_corrects_1 = 0 train_corrects_2 = 0 train_corrects_3 = 0 train_start_time = time.time() for data in train_loader: inputs, labels_1, labels_2 = data if use_gpu: inputs = Variable(inputs.cuda()) labels_1 = Variable(labels_1.cuda()) labels_2 = Variable(labels_2.cuda()) else: inputs = Variable(inputs) labels_1 = Variable(labels_1) labels_2 = Variable(labels_2) optimizer.zero_grad() outputs_1, outputs_2, outputs_3 = model.forward(inputs) _, preds_2 = torch.max(outputs_2.data, 1) train_corrects_2 += torch.sum(preds_2 == labels_2.data) sig_output_1 = sigmoid_cuda(outputs_1) sig_output_3 = sigmoid_cuda(outputs_3) sig_average = (sig_output_1.data + sig_output_3.data) / 2 preds_1 = torch.cuda.ByteTensor(sig_output_1.data > 0.5) preds_1 = preds_1.long() train_corrects_1 += torch.sum(preds_1 == labels_1.data) preds_3 = torch.cuda.ByteTensor(sig_average > 0.5) preds_3 = preds_3.long() train_corrects_3 += torch.sum(preds_3 == labels_1.data) labels_1 = Variable(labels_1.data.float()) loss_1 = criterion_1(outputs_1, labels_1) loss_2 = criterion_2(outputs_2, labels_2) sig_output_3 = Variable(sig_output_3.data, requires_grad=False) loss_3 = torch.abs(criterion_3(sig_output_1, sig_output_3)) loss = loss_1 + loss_2 + loss_3 * alpha loss.backward() optimizer.step() train_loss_1 += loss_1.data[0] train_loss_2 += loss_2.data[0] train_loss_3 += loss_3.data[0] train_elapsed_time = time.time() - train_start_time train_accuracy_1 = train_corrects_1 / num_train_all / 7 train_accuracy_2 = train_corrects_2 / num_train_all train_accuracy_3 = train_corrects_3 / num_train_all / 7 train_average_loss_1 = train_loss_1 / num_train_all / 7 train_average_loss_2 = train_loss_2 / num_train_all train_average_loss_3 = train_loss_3 / num_train_all # begin eval model.eval() val_loss_1 = 0.0 val_loss_2 = 0.0 val_loss_3 = 0.0 val_corrects_1 = 0 val_corrects_2 = 0 val_corrects_3 = 0 val_start_time = time.time() for data in val_loader: inputs, labels_1, labels_2 = data labels_2 = labels_2[(sequence_length - 
1)::sequence_length] if use_gpu: inputs = Variable(inputs.cuda(), volatile=True) labels_1 = Variable(labels_1.cuda(), volatile=True) labels_2 = Variable(labels_2.cuda(), volatile=True) else: inputs = Variable(inputs, volatile=True) labels_1 = Variable(labels_1, volatile=True) labels_2 = Variable(labels_2, volatile=True) outputs_1, outputs_2, outputs_3 = model.forward(inputs) outputs_2 = outputs_2[(sequence_length - 1)::sequence_length] _, preds_2 = torch.max(outputs_2.data, 1) val_corrects_2 += torch.sum(preds_2 == labels_2.data) sig_output_1 = sigmoid_cuda(outputs_1) sig_output_3 = sigmoid_cuda(outputs_3) sig_average = (sig_output_1.data + sig_output_3.data) / 2 preds_1 = torch.cuda.ByteTensor(sig_output_1.data > 0.5) preds_1 = preds_1.long() val_corrects_1 += torch.sum(preds_1 == labels_1.data) preds_3 = torch.cuda.ByteTensor(sig_average > 0.5) preds_3 = preds_3.long() val_corrects_3 += torch.sum(preds_3 == labels_1.data) labels_1 = Variable(labels_1.data.float()) loss_1 = criterion_1(outputs_1, labels_1) loss_2 = criterion_2(outputs_2, labels_2) sig_output_3 = Variable(sig_output_3.data, requires_grad=False) loss_3 = torch.abs(criterion_3(sig_output_1, sig_output_3)) val_loss_1 += loss_1.data[0] val_loss_2 += loss_2.data[0] val_loss_3 += loss_3.data[0] val_elapsed_time = time.time() - val_start_time val_accuracy_1 = val_corrects_1 / (num_val_all * 7) val_accuracy_2 = val_corrects_2 / num_val_we_use val_accuracy_3 = val_corrects_3 / (num_val_all * 7) val_average_loss_1 = val_loss_1 / (num_val_all * 7) val_average_loss_2 = val_loss_2 / num_val_we_use val_average_loss_3 = val_loss_3 / num_val_all print('epoch: {:3d}' ' train time: {:2.0f}m{:2.0f}s' ' train accu_1: {:.4f}' ' train accu_3: {:.4f}' ' train accu_2: {:.4f}' ' train loss_1: {:4.4f}' ' train loss_2: {:4.4f}' ' train loss_3: {:4.4f}'.format( epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_accuracy_1, train_accuracy_3, train_accuracy_2, train_average_loss_1, train_average_loss_2, train_average_loss_3)) print('epoch: {:3d}' ' valid time: {:2.0f}m{:2.0f}s' ' valid accu_1: {:.4f}' ' valid accu_3: {:.4f}' ' valid accu_2: {:.4f}' ' valid loss_1: {:4.4f}' ' valid loss_2: {:4.4f}' ' valid loss_3: {:4.4f}'.format( epoch, val_elapsed_time // 60, val_elapsed_time % 60, val_accuracy_1, val_accuracy_3, val_accuracy_2, val_average_loss_1, val_average_loss_2, val_average_loss_3)) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2 + alpha * val_average_loss_3) if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95: best_val_accuracy_2 = val_accuracy_2 best_val_accuracy_1 = val_accuracy_1 correspond_train_acc_1 = train_accuracy_1 correspond_train_acc_2 = train_accuracy_2 best_model_wts = copy.deepcopy(model.state_dict()) elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95: if val_accuracy_1 > best_val_accuracy_1: correspond_train_acc_1 = train_accuracy_1 correspond_train_acc_2 = train_accuracy_2 best_model_wts = copy.deepcopy(model.state_dict()) elif val_accuracy_1 == best_val_accuracy_1: if train_accuracy_2 > correspond_train_acc_2: correspond_train_acc_2 = train_accuracy_2 correspond_train_acc_1 = train_accuracy_1 best_model_wts = copy.deepcopy(model.state_dict()) elif train_accuracy_2 == correspond_train_acc_2: if train_accuracy_1 > best_val_accuracy_1: correspond_train_acc_1 = train_accuracy_1 best_model_wts = copy.deepcopy(model.state_dict()) if val_accuracy_2 > 0.885: save_val_1 = 
int("{:4.0f}".format(val_accuracy_1 * 10000)) save_val_2 = int("{:4.0f}".format(val_accuracy_2 * 10000)) save_train_1 = int("{:4.0f}".format(train_accuracy_1 * 10000)) save_train_2 = int("{:4.0f}".format(train_accuracy_2 * 10000)) public_name = "cnn_lstm_p2t" \ + "_epoch_" + str(epochs) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train1_" + str(save_train_1) \ + "_train2_" + str(save_train_2) \ + "_val1_" + str(save_val_1) \ + "_val2_" + str(save_val_2) model_name = public_name + ".pth" torch.save(best_model_wts, model_name) record_np[epoch, 0] = train_accuracy_1 record_np[epoch, 1] = train_accuracy_3 record_np[epoch, 2] = train_accuracy_2 record_np[epoch, 3] = train_average_loss_1 record_np[epoch, 4] = train_average_loss_2 record_np[epoch, 5] = train_average_loss_3 record_np[epoch, 6] = val_accuracy_1 record_np[epoch, 7] = val_accuracy_3 record_np[epoch, 7] = val_accuracy_2 record_np[epoch, 9] = val_average_loss_1 record_np[epoch, 10] = val_average_loss_2 record_np[epoch, 11] = val_average_loss_3 print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format( best_val_accuracy_1, correspond_train_acc_1)) print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format( best_val_accuracy_2, correspond_train_acc_2)) # save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000)) # save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000)) # save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000)) # save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000)) # public_name = "cnn_lstm_p2t" \ # + "_epoch_" + str(epochs) \ # + "_length_" + str(sequence_length) \ # + "_opt_" + str(optimizer_choice) \ # + "_mulopt_" + str(multi_optim) \ # + "_flip_" + str(use_flip) \ # + "_crop_" + str(crop_type) \ # + "_batch_" + str(train_batch_size) \ # + "_train1_" + str(save_train_1) \ # + "_train2_" + str(save_train_2) \ # + "_val1_" + str(save_val_1) \ # + "_val2_" + str(save_val_2) # model_name = public_name + ".pth" # torch.save(best_model_wts, model_name) record_name = public_name + ".npy" np.save(record_name, record_np)
def test_model(test_dataset, test_num_each): num_test = len(test_dataset) test_idx = [i for i in range(num_test)] print('num of test dataset: {:6d}'.format(num_test)) test_loader = DataLoader(test_dataset, batch_size=test_batch_size, sampler=test_idx, num_workers=workers, pin_memory=False) model = multi_resnet() model = DataParallel(model) model.load_state_dict(torch.load(model_name)) if use_gpu: model = model.cuda() criterion = nn.BCEWithLogitsLoss(size_average=False) sig_f = nn.Sigmoid() model.eval() test_loss = 0.0 test_corrects = 0 all_preds = [] test_start_time = time.time() for data in test_loader: inputs, labels_1, labels_2 = data if use_gpu: inputs = Variable(inputs.cuda(), volatile=True) labels = Variable(labels_1.cuda(), volatile=True) else: inputs = Variable(inputs, volatile=True) labels = Variable(labels_1, volatile=True) if crop_type == 0 or crop_type == 1: outputs = model.forward(inputs) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(5, -1, 7) outputs = torch.mean(outputs, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs = model.forward(inputs) outputs = outputs.view(10, -1, 7) outputs = torch.mean(outputs, 0) for i in range(len(outputs)): all_preds.append(outputs[i].data.cpu().numpy().tolist()) sig_out = outputs.data.cpu() sig_out = sig_f(sig_out) predict = torch.ByteTensor(sig_out > 0.5) predict = predict.long() test_corrects += torch.sum(predict == labels.data.cpu()) labels = Variable(labels.data.float()) loss = criterion(outputs, labels) test_loss += loss.data[0] # print(test_corrects) test_elapsed_time = time.time() - test_start_time test_accuracy = test_corrects / num_test / 7 test_average_loss = test_loss / num_test / 7 save_test = int("{:4.0f}".format(test_accuracy * 10000)) pred_name = model_pure_name + '_test_' + str(save_test) + '_crop_' + str( crop_type) + '.pkl' with open(pred_name, 'wb') as f: pickle.dump(all_preds, f) print('test elapsed: {:2.0f}m{:2.0f}s' ' test loss: {:4.4f}' ' test accu: {:.4f}'.format(test_elapsed_time // 60, test_elapsed_time % 60, test_average_loss, test_accuracy))
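# The crop_type == 5 / crop_type == 10 branches above expect inputs shaped
# [batch, n_crops, 3, 224, 224] (e.g. from torchvision's FiveCrop / TenCrop) and
# average the per-crop logits. A simplified, batch-major sketch of that
# test-time-augmentation averaging, with a stand-in classifier so it runs on its own:

import torch

def tta_average(model, inputs, n_crops, n_classes):
    # inputs: [batch, n_crops, 3, 224, 224]
    batch = inputs.size(0)
    flat = inputs.view(batch * n_crops, 3, 224, 224)   # fold crops into the batch
    logits = model(flat)                               # [batch * n_crops, n_classes]
    return logits.view(batch, n_crops, n_classes).mean(dim=1)  # average over crops

def dummy_classifier(x):
    # stand-in for the CNN: mean pixel value broadcast to 7 logits
    pooled = x.view(x.size(0), -1).mean(dim=1, keepdim=True)
    return pooled.repeat(1, 7)

x = torch.randn(2, 10, 3, 224, 224)
print(tta_average(dummy_classifier, x, n_crops=10, n_classes=7).shape)  # torch.Size([2, 7])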
def main(): parser = argparse.ArgumentParser() parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡') parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='选择模型参数') parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库') parser.add_argument('--raw_data_path', default='data/eval.json', type=str, required=False, help='原始语料') parser.add_argument('--tokenized_data_path', default='data/tokenized_eval/', type=str, required=False, help='tokenized语料存放位置') parser.add_argument('--raw', action='store_true', help='是否先做tokenize') parser.add_argument('--batch_size', default=8, type=int, required=False, help='batch size') parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次') parser.add_argument('--stride', default=768, type=int, required=False, help='取数据的窗口步长') parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份') parser.add_argument('--min_length', default=128, type=int, required=False, help='最短收录文章长度') parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型起点路径') parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词') parser.add_argument('--output_dir', default='eval_result/', type=str, required=False, help='结果输出路径') args = parser.parse_args() print('args:\n' + args.__repr__()) if args.no_wordpiece: import tokenization_bert_without_wordpiece as tokenization_bert else: import tokenization_bert os.environ["CUDA_VISIBLE_DEVICES"] = args.device # 此处设置程序使用哪些显卡 model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file( args.model_config) print('config:\n' + model_config.to_json_string()) n_ctx = model_config.n_ctx full_tokenizer = tokenization_bert.BertTokenizer( vocab_file=args.tokenizer_path) full_tokenizer.max_len = n_ctx device = 'cuda' if torch.cuda.is_available() else 'cpu' print('using device:', device) raw_data_path = args.raw_data_path tokenized_data_path = args.tokenized_data_path raw = args.raw # 选择是否从零开始构建数据集 batch_size = args.batch_size log_step = args.log_step stride = args.stride num_pieces = args.num_pieces min_length = args.min_length output_dir = args.output_dir if not os.path.exists(output_dir): os.mkdir(output_dir) if raw: print('building files') build_files(data_path=raw_data_path, tokenized_data_path=tokenized_data_path, num_pieces=num_pieces, full_tokenizer=full_tokenizer, min_length=min_length) print('files built') if not args.pretrained_model: print('you need to specify a trained model.') exit(1) else: model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained( args.pretrained_model) model.eval() model.to(device) num_parameters = 0 parameters = model.parameters() for parameter in parameters: num_parameters += parameter.numel() print('number of parameters: {}'.format(num_parameters)) multi_gpu = False full_len = 0 print('calculating total steps') for i in tqdm(range(num_pieces)): with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f: full_len += len([int(item) for item in f.read().strip().split()]) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = DataParallel(model) multi_gpu = True print('starting training') overall_step = 0 total_loss = 0 total_steps = 0 # eval now = datetime.now() print('time: {}'.format(now)) piece_num = 0 for i in range(num_pieces): with open(tokenized_data_path + 
'tokenized_train_{}.txt'.format(i), 'r') as f: line = f.read().strip() tokens = line.split() tokens = [int(token) for token in tokens] start_point = 0 samples = [] while start_point < len(tokens) - n_ctx: samples.append(tokens[start_point:start_point + n_ctx]) start_point += stride start_point -= stride last = tokens[start_point + n_ctx:] last.extend(full_tokenizer.convert_tokens_to_ids(['[PAD]']) * (n_ctx - len(last))) random.shuffle(samples) for step in range(len(samples) // batch_size): # drop last # prepare data batch = samples[step * batch_size:(step + 1) * batch_size] batch_labels = [] batch_inputs = [] for ids in batch: int_ids_for_labels = [int(x) for x in ids] int_ids_for_inputs = [int(x) for x in ids] batch_labels.append(int_ids_for_labels) batch_inputs.append(int_ids_for_inputs) batch_labels = torch.tensor(batch_labels).long().to(device) batch_inputs = torch.tensor(batch_inputs).long().to(device) # forward pass outputs = model.forward(input_ids=batch_inputs, labels=batch_labels) loss, logits = outputs[:2] # get loss if multi_gpu: loss = loss.mean() total_loss += loss.item() total_steps += 1 if (overall_step + 1) % log_step == 0: print('now time: {}:{}. Step {} of piece {}, ppl {}'.format( datetime.now().hour, datetime.now().minute, (step + 1), piece_num, torch.exp(loss))) overall_step += 1 piece_num += 1 if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) with open(args.output_dir + 'result.txt', 'w') as f: f.write(str(np.exp(total_loss / total_steps)))
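# The evaluation loop above sums the language-model loss over fixed-length
# windows of the tokenized corpus and reports exp(mean loss) as perplexity.
# A condensed sketch of the same computation on one pre-tokenized id sequence
# (the window size and stride defaults here are illustrative, not taken from
# the script's configuration):

import math
import torch

def sliding_window_perplexity(model, token_ids, n_ctx=1024, stride=768, device='cpu'):
    losses = []
    for start in range(0, max(len(token_ids) - n_ctx, 1), stride):
        window = token_ids[start:start + n_ctx]
        if len(window) < 2:
            break
        ids = torch.tensor([window], dtype=torch.long, device=device)
        with torch.no_grad():
            # GPT2LMHeadModel returns the LM loss first when labels are supplied
            loss = model(input_ids=ids, labels=ids)[0]
        losses.append(loss.item())
    return math.exp(sum(losses) / len(losses))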
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--device", default="0,1,2,3", type=str, required=False, help="设置使用哪些显卡" ) parser.add_argument( "--model_config", default="config/model_config_small.json", type=str, required=False, help="选择模型参数", ) parser.add_argument( "--tokenizer_path", default="cache/vocab_small.txt", type=str, required=False, help="选择词库", ) parser.add_argument( "--raw_data_path", default="data/train.json", type=str, required=False, help="原始训练语料", ) parser.add_argument( "--tokenized_data_path", default="data/tokenized/", type=str, required=False, help="tokenized语料存放位置", ) parser.add_argument("--raw", action="store_true", help="是否先做tokenize") parser.add_argument("--epochs", default=5, type=int, required=False, help="训练循环") parser.add_argument( "--batch_size", default=8, type=int, required=False, help="训练batch size" ) parser.add_argument("--lr", default=1.5e-4, type=float, required=False, help="学习率") parser.add_argument( "--warmup_steps", default=2000, type=int, required=False, help="warm up步数" ) parser.add_argument( "--log_step", default=1, type=int, required=False, help="多少步汇报一次loss,设置为gradient accumulation的整数倍", ) parser.add_argument( "--stride", default=768, type=int, required=False, help="训练时取训练数据的窗口步长" ) parser.add_argument( "--gradient_accumulation", default=1, type=int, required=False, help="梯度积累" ) parser.add_argument("--fp16", action="store_true", help="混合精度") parser.add_argument("--fp16_opt_level", default="O1", type=str, required=False) parser.add_argument("--max_grad_norm", default=1.0, type=float, required=False) parser.add_argument( "--num_pieces", default=100, type=int, required=False, help="将训练语料分成多少份" ) parser.add_argument( "--min_length", default=128, type=int, required=False, help="最短收录文章长度" ) parser.add_argument( "--output_dir", default="model/", type=str, required=False, help="模型输出路径" ) parser.add_argument( "--pretrained_model", default="", type=str, required=False, help="模型训练起点路径" ) parser.add_argument( "--writer_dir", default="tensorboard_summary/", type=str, required=False, help="Tensorboard路径", ) parser.add_argument("--segment", action="store_true", help="中文以词为单位") parser.add_argument("--bpe_token", action="store_true", help="subword") parser.add_argument( "--encoder_json", default="tokenizations/encoder.json", type=str, help="encoder.json", ) parser.add_argument( "--vocab_bpe", default="tokenizations/vocab.bpe", type=str, help="vocab.bpe" ) args = parser.parse_args() print("args:\n" + args.__repr__()) if args.segment: from tokenizations import tokenization_bert_word_level as tokenization_bert else: from tokenizations import tokenization_bert os.environ["CUDA_VISIBLE_DEVICES"] = args.device # 此处设置程序使用哪些显卡 model_config = transformers.modeling_gpt2.GPT2Config.from_json_file( args.model_config ) print("config:\n" + model_config.to_json_string()) n_ctx = model_config.n_ctx if args.bpe_token: full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe) else: full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path) full_tokenizer.max_len = 999999 device = "cuda" if torch.cuda.is_available() else "cpu" print("using device:", device) raw_data_path = args.raw_data_path tokenized_data_path = args.tokenized_data_path raw = args.raw # 选择是否从零开始构建数据集 epochs = args.epochs batch_size = args.batch_size lr = args.lr warmup_steps = args.warmup_steps log_step = args.log_step stride = args.stride gradient_accumulation = args.gradient_accumulation fp16 = args.fp16 # 不支持半精度的显卡请勿打开 fp16_opt_level = args.fp16_opt_level 
max_grad_norm = args.max_grad_norm num_pieces = args.num_pieces min_length = args.min_length output_dir = args.output_dir tb_writer = SummaryWriter(log_dir=args.writer_dir) assert log_step % gradient_accumulation == 0 if not os.path.exists(output_dir): os.mkdir(output_dir) if raw: print("building files") build_files( data_path=raw_data_path, tokenized_data_path=tokenized_data_path, num_pieces=num_pieces, full_tokenizer=full_tokenizer, min_length=min_length, ) print("files built") if not args.pretrained_model: model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config) else: model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained( args.pretrained_model ) model.train() model.to(device) num_parameters = 0 parameters = model.parameters() for parameter in parameters: num_parameters += parameter.numel() print("number of parameters: {}".format(num_parameters)) multi_gpu = False full_len = 0 print("calculating total steps") for i in tqdm(range(num_pieces)): with open(tokenized_data_path + "tokenized_train_{}.txt".format(i), "r") as f: full_len += len([int(item) for item in f.read().strip().split()]) total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation) print("total steps = {}".format(total_steps)) optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True) scheduler = transformers.WarmupLinearSchedule( optimizer, warmup_steps=warmup_steps, t_total=total_steps ) if fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = DataParallel(model, device_ids=[int(i) for i in args.device.split(",")]) multi_gpu = True print("starting training") overall_step = 0 running_loss = 0 saving_time = datetime.now() for epoch in range(epochs): print("epoch {}".format(epoch + 1)) now = datetime.now() print("time: {}".format(now)) x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32) random.shuffle(x) piece_num = 0 for i in x: with open( tokenized_data_path + "tokenized_train_{}.txt".format(i), "r" ) as f: line = f.read().strip() tokens = line.split() tokens = [int(token) for token in tokens] start_point = 0 samples = [] while start_point < len(tokens) - n_ctx: samples.append(tokens[start_point : start_point + n_ctx]) start_point += stride if start_point < len(tokens): samples.append(tokens[len(tokens) - n_ctx :]) random.shuffle(samples) for step in range(len(samples) // batch_size): # drop last # prepare data batch = samples[step * batch_size : (step + 1) * batch_size] batch_inputs = [] for ids in batch: int_ids = [int(x) for x in ids] batch_inputs.append(int_ids) batch_inputs = torch.tensor(batch_inputs).long().to(device) # forward pass outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs) loss, logits = outputs[:2] # get loss if multi_gpu: loss = loss.mean() if gradient_accumulation > 1: loss = loss / gradient_accumulation # loss backward if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), max_grad_norm ) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # optimizer step if (overall_step + 1) % gradient_accumulation == 0: running_loss += loss.item() optimizer.step() optimizer.zero_grad() scheduler.step() if (overall_step + 
1) % log_step == 0: tb_writer.add_scalar( "loss", loss.item() * gradient_accumulation, overall_step ) print( "now time: {}:{}. Step {} of piece {} of epoch {}, loss {}".format( datetime.now().hour, datetime.now().minute, step + 1, piece_num, epoch + 1, running_loss * gradient_accumulation / (log_step / gradient_accumulation), ) ) running_loss = 0 delta_time = datetime.now() - saving_time if delta_time.seconds > 1800: print("saving model for epoch {}".format(epoch + 1)) if not os.path.exists( output_dir + "model_epoch{}".format(epoch + 1) ): os.mkdir(output_dir + "model_epoch{}".format(epoch + 1)) model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained( output_dir + "model_epoch{}".format(epoch + 1) ) saving_time = datetime.now() overall_step += 1 piece_num += 1 print("saving model for epoch {}".format(epoch + 1)) if not os.path.exists(output_dir + "model_epoch{}".format(epoch + 1)): os.mkdir(output_dir + "model_epoch{}".format(epoch + 1)) model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir + "model_epoch{}".format(epoch + 1)) # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1)) # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1)) print("epoch {} finished".format(epoch + 1)) then = datetime.now() print("time: {}".format(then)) print("time for one epoch: {}".format(then - now)) print("training finished") if not os.path.exists(output_dir + "final_model"): os.mkdir(output_dir + "final_model") model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir + "final_model")
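# The training loop above interleaves gradient accumulation, gradient clipping,
# the warmup scheduler, fp16 scaling and periodic checkpointing. The core
# accumulation pattern, reduced to its essentials (the tiny model and data here
# are stand-ins for illustration):

import torch
import torch.nn as nn

def accumulate_and_step(model, optimizer, batches, accumulation_steps=4, max_grad_norm=1.0):
    model.train()
    optimizer.zero_grad()
    for step, (inputs, targets) in enumerate(batches):
        loss = nn.functional.mse_loss(model(inputs), targets)
        # scale so the summed gradients match a single large-batch update
        (loss / accumulation_steps).backward()
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()

net = nn.Linear(8, 1)
opt = torch.optim.SGD(net.parameters(), lr=0.1)
data = [(torch.randn(4, 8), torch.randn(4, 1)) for _ in range(8)]
accumulate_and_step(net, opt, data)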
def main(): parser = argparse.ArgumentParser() parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='设置使用哪些显卡') parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False, help='选择模型参数') parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False, help='选择词库') parser.add_argument('--raw_data_path', default='data/train.json', type=str, required=False, help='原始训练语料') parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False, help='tokenized语料存放位置') parser.add_argument('--raw', action='store_true', help='是否先做tokenize') parser.add_argument('--epochs', default=5, type=int, required=False, help='训练循环') parser.add_argument('--batch_size', default=8, type=int, required=False, help='训练batch size') parser.add_argument('--lr', default=1.5e-4, type=float, required=False, help='学习率') parser.add_argument('--warmup_steps', default=2000, type=int, required=False, help='warm up步数') parser.add_argument('--log_step', default=1, type=int, required=False, help='多少步汇报一次loss') parser.add_argument('--stride', default=768, type=int, required=False, help='训练时取训练数据的窗口步长') parser.add_argument('--gradient_accumulation', default=1, type=str, required=False, help='梯度积累') parser.add_argument('--fp16', action='store_true', help='混合精度') parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False) parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False) parser.add_argument('--num_pieces', default=100, type=int, required=False, help='将训练语料分成多少份') parser.add_argument('--output_dir', default='model/', type=str, required=False, help='模型输出路径') parser.add_argument('--pretrained_model', default='', type=str, required=False, help='模型训练起点路径') parser.add_argument('--no_wordpiece', action='store_true', help='不做word piece切词') args = parser.parse_args() print('args:\n' + args.__repr__()) if args.no_wordpiece: import tokenization_bert_without_wordpiece as tokenization_bert else: import tokenization_bert os.environ["CUDA_VISIBLE_DEVICES"] = args.device # 此处设置程序使用哪些显卡 model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file( args.model_config) print('config:\n' + model_config.to_json_string()) n_ctx = model_config.n_ctx full_tokenizer = tokenization_bert.BertTokenizer( vocab_file=args.tokenizer_path) full_tokenizer.max_len = n_ctx device = 'cuda' if torch.cuda.is_available() else 'cpu' print('using device:', device) raw_data_path = args.raw_data_path tokenized_data_path = args.tokenized_data_path raw = args.raw # 选择是否从零开始构建数据集 epochs = args.epochs batch_size = args.batch_size lr = args.lr warmup_steps = args.warmup_steps log_step = args.log_step stride = args.stride gradient_accumulation = args.gradient_accumulation fp16 = args.fp16 # 不支持半精度的显卡请勿打开 fp16_opt_level = args.fp16_opt_level max_grad_norm = args.max_grad_norm num_pieces = args.num_pieces output_dir = args.output_dir if raw: print('building files') build_files(raw_data_path=raw_data_path, tokenized_data_path=tokenized_data_path, full_tokenizer=full_tokenizer, num_pieces=num_pieces) print('files built') if not args.pretrained_model: model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel( config=model_config) else: model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained( args.pretrained_model) model.train() model.to(device) multi_gpu = False full_len = 0 print('calculating total steps') for i in tqdm(range(num_pieces)): with open(tokenized_data_path + 
'tokenized_train_{}.txt'.format(i), 'r') as f: full_len += len([int(item) for item in f.read().strip().split()]) total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation) print('total steps = {}'.format(total_steps)) optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True) scheduler = pytorch_transformers.WarmupLinearSchedule( optimizer, warmup_steps=warmup_steps, t_total=total_steps) if fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = DataParallel(model) multi_gpu = True print('starting training') for epoch in range(epochs): print('epoch {}'.format(epoch + 1)) now = datetime.now() print('time: {}'.format(now)) x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32) random.shuffle(x) piece_num = 0 for i in x: running_loss = 0 with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f: line = f.read().strip() tokens = line.split() tokens = [int(token) for token in tokens] start_point = 0 samples = [] while start_point < len(tokens) - n_ctx: samples.append(tokens[start_point:start_point + n_ctx]) start_point += stride random.shuffle(samples) for step in range(len(samples) // batch_size): # prepare data batch = samples[step * batch_size:(step + 1) * batch_size] batch_labels = [] batch_inputs = [] for ids in batch: int_ids_for_labels = [int(x) for x in ids] int_ids_for_inputs = [int(x) for x in ids] batch_labels.append(int_ids_for_labels) batch_inputs.append(int_ids_for_inputs) batch_labels = torch.tensor(batch_labels).long().to(device) batch_inputs = torch.tensor(batch_inputs).long().to(device) # forward pass outputs = model.forward(input_ids=batch_inputs, labels=batch_labels) loss, logits = outputs[:2] # get loss if multi_gpu: loss = loss.mean() if gradient_accumulation > 1: loss = loss / gradient_accumulation # loss backward if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # optimizer step if (step + 1) % gradient_accumulation == 0: running_loss += loss.item() scheduler.step() optimizer.step() optimizer.zero_grad() if (step + 1) % log_step == 0: print( 'now time: {}:{}. 
Step {} of piece {} of epoch {}, loss {}' .format( datetime.now().hour, datetime.now().minute, (step + 1) // gradient_accumulation, piece_num, epoch + 1, running_loss * gradient_accumulation / log_step)) running_loss = 0 piece_num += 1 print('saving model for epoch {}'.format(epoch + 1)) if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)): os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1)) model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1)) # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1)) # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1)) print('epoch {} finished'.format(epoch + 1)) then = datetime.now() print('time: {}'.format(then)) print('time for one epoch: {}'.format(then - now)) print('training finished') if not os.path.exists(output_dir + 'final_model'): os.mkdir(output_dir + 'final_model') model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir + 'final_model')
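# Both GPT-2 training scripts cut every tokenized piece into overlapping n_ctx
# windows with a fixed stride before shuffling and batching; this older variant
# simply drops the trailing fragment shorter than n_ctx. That windowing step,
# isolated as a small helper for clarity:

def make_training_windows(tokens, n_ctx, stride):
    samples = []
    start = 0
    while start < len(tokens) - n_ctx:
        samples.append(tokens[start:start + n_ctx])
        start += stride
    return samples

# Example: 10 tokens with n_ctx=4 and stride=3 yield windows starting at 0 and 3.
print(len(make_training_windows(list(range(10)), n_ctx=4, stride=3)))  # 2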
def train_model(train_dataset, train_num_each, val_dataset, val_num_each): num_train = len(train_dataset) num_val = len(val_dataset) train_useful_start_idx = get_useful_start_idx(sequence_length, train_num_each) val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each) num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu # num_train_we_use = 4 # num_val_we_use = 800 train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use] val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use] train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) val_idx = [] for i in range(num_val_we_use): for j in range(sequence_length): val_idx.append(val_we_use_start_idx[i] + j) num_train_all = len(train_idx) num_val_all = len(val_idx) print('num train start idx : {:6d}'.format(len(train_useful_start_idx))) print('last idx train start: {:6d}'.format(train_useful_start_idx[-1])) print('num of train dataset: {:6d}'.format(num_train)) print('num of train we use : {:6d}'.format(num_train_we_use)) print('num of all train use: {:6d}'.format(num_train_all)) print('num valid start idx : {:6d}'.format(len(val_useful_start_idx))) print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1])) print('num of valid dataset: {:6d}'.format(num_val)) print('num of valid we use : {:6d}'.format(num_val_we_use)) print('num of all valid use: {:6d}'.format(num_val_all)) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) val_loader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=val_idx, num_workers=workers, pin_memory=False) model = dense_lstm() sig_f = nn.Sigmoid() if use_gpu: model = model.cuda() sig_f = sig_f.cuda() model = DataParallel(model) criterion_1 = nn.BCEWithLogitsLoss(size_average=False) criterion_2 = nn.CrossEntropyLoss(size_average=False) if multi_optim == 0: if optimizer_choice == 0: optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_step, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam(model.parameters(), lr=learning_rate) elif multi_optim == 1: if optimizer_choice == 0: optimizer = optim.SGD([ { 'params': model.module.features.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, { 'params': model.module.fc2.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10, momentum=momentum, dampening=dampening, weight_decay=weight_decay, nesterov=use_nesterov) if sgd_adjust_lr == 0: exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=sgd_step, gamma=sgd_gamma) elif sgd_adjust_lr == 1: exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min') elif optimizer_choice == 1: optimizer = optim.Adam([ { 'params': model.module.features.parameters() }, { 'params': model.module.lstm.parameters(), 'lr': learning_rate }, { 'params': model.module.fc.parameters(), 'lr': learning_rate }, { 'params': model.module.fc2.parameters(), 'lr': learning_rate }, ], lr=learning_rate / 10) best_model_wts = copy.deepcopy(model.state_dict()) best_val_accuracy_1 = 0.0 
best_val_accuracy_2 = 0.0 # judge by accu2 correspond_train_acc_1 = 0.0 correspond_train_acc_2 = 0.0 record_np = np.zeros([epochs, 8]) for epoch in range(epochs): # np.random.seed(epoch) np.random.shuffle(train_we_use_start_idx) train_idx = [] for i in range(num_train_we_use): for j in range(sequence_length): train_idx.append(train_we_use_start_idx[i] + j) train_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_idx, num_workers=workers, pin_memory=False) model.train() train_loss_1 = 0.0 train_loss_2 = 0.0 train_corrects_1 = 0 train_corrects_2 = 0 train_start_time = time.time() for data in train_loader: inputs, labels_1, labels_2 = data if use_gpu: inputs = Variable(inputs.cuda()) labels_1 = Variable(labels_1.cuda()) labels_2 = Variable(labels_2.cuda()) else: inputs = Variable(inputs) labels_1 = Variable(labels_1) labels_2 = Variable(labels_2) optimizer.zero_grad() outputs_1, outputs_2 = model.forward(inputs) _, preds_2 = torch.max(outputs_2.data, 1) sig_out = sig_f(outputs_1.data) preds_1 = torch.ByteTensor(sig_out.cpu() > 0.5) preds_1 = preds_1.long() train_corrects_1 += torch.sum(preds_1 == labels_1.data.cpu()) labels_1 = Variable(labels_1.data.float()) loss_1 = criterion_1(outputs_1, labels_1) loss_2 = criterion_2(outputs_2, labels_2) loss = loss_1 + loss_2 loss.backward() optimizer.step() train_loss_1 += loss_1.data[0] train_loss_2 += loss_2.data[0] train_corrects_2 += torch.sum(preds_2 == labels_2.data) train_elapsed_time = time.time() - train_start_time train_accuracy_1 = train_corrects_1 / num_train_all / 7 train_accuracy_2 = train_corrects_2 / num_train_all train_average_loss_1 = train_loss_1 / num_train_all / 7 train_average_loss_2 = train_loss_2 / num_train_all # begin eval model.eval() val_loss_1 = 0.0 val_loss_2 = 0.0 val_corrects_1 = 0 val_corrects_2 = 0 val_start_time = time.time() for data in val_loader: inputs, labels_1, labels_2 = data labels_2 = labels_2[(sequence_length - 1)::sequence_length] if use_gpu: inputs = Variable(inputs.cuda(), volatile=True) labels_1 = Variable(labels_1.cuda(), volatile=True) labels_2 = Variable(labels_2.cuda(), volatile=True) else: inputs = Variable(inputs, volatile=True) labels_1 = Variable(labels_1, volatile=True) labels_2 = Variable(labels_2, volatile=True) if crop_type == 0 or crop_type == 1: outputs_1, outputs_2 = model.forward(inputs) elif crop_type == 5: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs_1, outputs_2 = model.forward(inputs) outputs_1 = outputs_1.view(5, -1, 7) outputs_1 = torch.mean(outputs_1, 0) outputs_2 = outputs_2.view(5, -1, 7) outputs_2 = torch.mean(outputs_2, 0) elif crop_type == 10: inputs = inputs.permute(1, 0, 2, 3, 4).contiguous() inputs = inputs.view(-1, 3, 224, 224) outputs_1, outputs_2 = model.forward(inputs) outputs_1 = outputs_1.view(10, -1, 7) outputs_1 = torch.mean(outputs_1, 0) outputs_2 = outputs_2.view(10, -1, 7) outputs_2 = torch.mean(outputs_2, 0) outputs_2 = outputs_2[sequence_length - 1::sequence_length] _, preds_2 = torch.max(outputs_2.data, 1) sig_out = sig_f(outputs_1.data) preds_1 = torch.ByteTensor(sig_out.cpu() > 0.5) preds_1 = preds_1.long() val_corrects_1 += torch.sum(preds_1 == labels_1.data.cpu()) labels_1 = Variable(labels_1.data.float()) loss_1 = criterion_1(outputs_1, labels_1) val_loss_1 += loss_1.data[0] loss_2 = criterion_2(outputs_2, labels_2) val_loss_2 += loss_2.data[0] val_corrects_2 += torch.sum(preds_2 == labels_2.data) val_elapsed_time = time.time() - val_start_time val_accuracy_1 = val_corrects_1 / 
(num_val_all * 7) val_accuracy_2 = val_corrects_2 / num_val_we_use val_average_loss_1 = val_loss_1 / (num_val_all * 7) val_average_loss_2 = val_loss_2 / num_val_we_use print('epoch: {:4d}' ' train time: {:2.0f}m{:2.0f}s' ' train loss_1: {:4.4f}' ' train accu_1: {:.4f}' ' valid time: {:2.0f}m{:2.0f}s' ' valid loss_1: {:4.4f}' ' valid accu_1: {:.4f}'.format( epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_average_loss_1, train_accuracy_1, val_elapsed_time // 60, val_elapsed_time % 60, val_average_loss_1, val_accuracy_1)) print('epoch: {:4d}' ' train time: {:2.0f}m{:2.0f}s' ' train loss_2: {:4.4f}' ' train accu_2: {:.4f}' ' valid time: {:2.0f}m{:2.0f}s' ' valid loss_2: {:4.4f}' ' valid accu_2: {:.4f}'.format( epoch, train_elapsed_time // 60, train_elapsed_time % 60, train_average_loss_2, train_accuracy_2, val_elapsed_time // 60, val_elapsed_time % 60, val_average_loss_2, val_accuracy_2)) if optimizer_choice == 0: if sgd_adjust_lr == 0: exp_lr_scheduler.step() elif sgd_adjust_lr == 1: exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2) if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95: best_val_accuracy_2 = val_accuracy_2 best_val_accuracy_1 = val_accuracy_1 correspond_train_acc_1 = train_accuracy_1 correspond_train_acc_2 = train_accuracy_2 best_model_wts = copy.deepcopy(model.state_dict()) elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95: if val_accuracy_1 > best_val_accuracy_1: correspond_train_acc_1 = train_accuracy_1 correspond_train_acc_2 = train_accuracy_2 best_model_wts = copy.deepcopy(model.state_dict()) elif val_accuracy_1 == best_val_accuracy_1: if train_accuracy_2 > correspond_train_acc_2: correspond_train_acc_2 = train_accuracy_2 correspond_train_acc_1 = train_accuracy_1 best_model_wts = copy.deepcopy(model.state_dict()) elif train_accuracy_2 == correspond_train_acc_2: if train_accuracy_1 > best_val_accuracy_1: correspond_train_acc_1 = train_accuracy_1 best_model_wts = copy.deepcopy(model.state_dict()) record_np[epoch, 0] = train_accuracy_1 record_np[epoch, 1] = train_accuracy_2 record_np[epoch, 2] = train_average_loss_1 record_np[epoch, 3] = train_average_loss_2 record_np[epoch, 4] = val_accuracy_1 record_np[epoch, 5] = val_accuracy_2 record_np[epoch, 6] = val_average_loss_1 record_np[epoch, 7] = val_average_loss_2 print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format( best_val_accuracy_1, correspond_train_acc_1)) print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format( best_val_accuracy_2, correspond_train_acc_2)) save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000)) save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000)) save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000)) save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000)) public_name = "dense_lstm" \ + "_epoch_" + str(epochs) \ + "_length_" + str(sequence_length) \ + "_opt_" + str(optimizer_choice) \ + "_mulopt_" + str(multi_optim) \ + "_flip_" + str(use_flip) \ + "_crop_" + str(crop_type) \ + "_batch_" + str(train_batch_size) \ + "_train1_" + str(save_train_1) \ + "_train2_" + str(save_train_2) \ + "_val1_" + str(save_val_1) \ + "_val2_" + str(save_val_2) model_name = public_name + ".pth" torch.save(best_model_wts, model_name) record_name = public_name + ".npy" np.save(record_name, record_np)
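# accu_1 in the dense_lstm training above is an element-wise multi-label
# accuracy: the 7 tool logits per frame are passed through a sigmoid,
# thresholded at 0.5, and the number of matching elements is divided by
# (number of frames * 7). A minimal sketch of that metric on made-up tensors:

import torch

def multilabel_elementwise_accuracy(logits, labels, threshold=0.5):
    # logits: [N, 7] raw scores; labels: [N, 7] binary ground truth
    preds = (torch.sigmoid(logits) > threshold).long()
    return (preds == labels.long()).sum().item() / labels.numel()

logits = torch.randn(5, 7)
labels = torch.randint(0, 2, (5, 7))
print(multilabel_elementwise_accuracy(logits, labels))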