def train(load_model: str, save_model: str, train_dataset: str,
          test_dataset: str, no_train: bool, no_test: bool, epochs: int,
          batch_size: int, learning_rate: float):
    """Optionally restore, train, save and evaluate a ``Net`` model.

    Args:
        load_model: Path of a saved state dict to restore before anything
            else; ``None`` skips loading.
        save_model: Path the state dict is written to after training;
            ``None`` skips saving. Nothing is saved when training is skipped.
        train_dataset: Forwarded to ``train_net`` as ``data_path``.
        test_dataset: Forwarded to ``test_net`` as ``data_path``.
        no_train: When true, the training phase is skipped.
        no_test: When true, the evaluation phase is skipped.
        epochs: Forwarded to ``train_net`` as ``num_epochs``.
        batch_size: Forwarded to both ``train_net`` and ``test_net``.
        learning_rate: Forwarded to ``train_net``.
    """
    use_gpu = cuda.is_available()
    device = torch.device('cuda:0' if use_gpu else 'cpu')
    click.secho('Using device={}'.format(device), fg='blue')

    model = Net()
    model.to(device)

    if load_model is not None:
        click.secho('Loading model from \'{}\''.format(load_model), fg='yellow')
        state = torch.load(load_model, map_location=device)
        model.load_state_dict(state)

    if not no_train:
        click.echo('Training model using {}'.format(train_dataset))
        model.train()
        train_net(
            model,
            data_path=train_dataset,
            batch_size=batch_size,
            num_epochs=epochs,
            learning_rate=learning_rate,
        )
        # Saving only makes sense for freshly trained weights.
        if save_model is not None:
            click.secho('Saving model as \'{}\''.format(save_model), fg='yellow')
            torch.save(model.state_dict(), save_model)

    if not no_test:
        click.echo('Testing model using {}'.format(test_dataset))
        model.eval()
        score = test_net(model, data_path=test_dataset, batch_size=batch_size)
        # Results above 97 are highlighted in green
        # (presumably a percentage -- confirm against test_net).
        if score > 97.:
            colour = 'green'
        else:
            colour = 'red'
        click.secho('Accuracy={}'.format(score), fg=colour)
def train(train_data, val_data, fold_idx=None):
    """Train a ``Net`` on one train/validation split, checkpointing the best epoch.

    The model is trained with ``FocalLoss(0.5)``, the ``Ranger`` optimizer and
    a ``CosineAnnealingLR`` schedule. After every epoch the model is evaluated
    on the validation loader; whenever the validation score is at least as
    good as the best seen so far, the state dict is written to disk.

    Args:
        train_data: Raw training data wrapped in ``MyDataset`` with
            ``train_transform``.
        val_data: Raw validation data wrapped in ``MyDataset`` with
            ``val_transform``.
        fold_idx: Zero-based fold index, or ``None`` for a single run. It
            selects the checkpoint file name and, when given, the slot of the
            module-level ``model_score`` that receives the best score.
    """
    train_set = MyDataset(train_data, train_transform)
    train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
    val_set = MyDataset(val_data, val_transform)
    val_loader = DataLoader(val_set, batch_size=config.batch_size, shuffle=False)

    net = Net(model_name).to(device)
    # Alternatives that were tried and left disabled in earlier revisions:
    # nn.CrossEntropyLoss, Adam, ExponentialLR and StepLR.
    loss_fn = FocalLoss(0.5)
    optimizer = Ranger(net.parameters(), lr=1e-3, weight_decay=0.0005)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=4)

    if fold_idx is None:
        print('start')
        checkpoint_name = '{}.bin'.format(model_name)
    else:
        print('start fold: {}'.format(fold_idx + 1))
        checkpoint_name = '{}_fold{}.bin'.format(model_name, fold_idx)
    model_save_path = os.path.join(config.model_path, checkpoint_name)

    step_msg = 'the current step: {0}/{1}, train loss: {2:>5.2}, train acc: {3:>6.2%}'
    epoch_msg = 'the current epoch: {0}/{1}, val loss: {2:>5.2}, val acc: {3:>6.2%}, cost: {4}s {5}'

    best_score = 0
    tie_count = 0        # epochs whose score exactly equalled the best so far
    last_improved = 0    # epoch index of the latest checkpoint (or lr adjustment)
    lr_adjustments = 0

    for epoch in range(config.epochs_num):
        epoch_started = int(time.time())
        net.train()
        print('epoch:{}, step:{}'.format(epoch + 1, len(train_loader)))

        for step, (inputs, targets) in enumerate(train_loader, start=1):
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = net(inputs)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()

            if step % config.train_print_step != 0:
                continue
            batch_acc = accuracy(outputs, targets)
            print(step_msg.format(step, len(train_loader), loss.item(), batch_acc[0].item()))

        val_loss, val_score = evaluate(net, val_loader, loss_fn)

        marker = ''
        if val_score >= best_score:
            # NOTE(review): the tie counter is not reset on a strict
            # improvement, only after an lr adjustment -- confirm intended.
            if val_score == best_score:
                tie_count += 1
            best_score = val_score
            torch.save(net.state_dict(), model_save_path)
            last_improved = epoch
            marker = '*'

        elapsed = int(time.time()) - epoch_started
        print(epoch_msg.format(epoch + 1, config.epochs_num, val_loss, val_score, elapsed, marker))

        stalled = epoch - last_improved >= config.patience_epoch
        if stalled or tie_count >= 3:
            if lr_adjustments >= config.adjust_lr_num:
                print("No optimization for a long time, auto stopping...")
                break
            print("No optimization for a long time, adjust lr...")
            # Restart the patience window; otherwise this branch would fire
            # again on every following epoch.
            last_improved = epoch
            lr_adjustments += 1
            tie_count = 0

        # NOTE(review): the original indentation was lost; the scheduler is
        # stepped once per epoch here because the call inside the plateau
        # branch above was commented out -- confirm.
        scheduler.step()

    # Drop the model before the next fold allocates a new one.
    del net
    gc.collect()

    if fold_idx is not None:
        model_score[fold_idx] = best_score
def train(train_data, val_data, fold_idx=None):
    """Train a ``Net`` with cross-entropy/Adam and stop early on a plateau.

    The learning rate follows ``StepLR(step_size=5, gamma=0.1)``, stepped once
    per epoch. After every epoch the model is evaluated on the validation
    loader and its state dict is saved whenever the validation accuracy is at
    least as good as the best seen so far. Training stops once more than
    ``config.patience_epoch`` epochs have passed since the last save.

    Args:
        train_data: Raw training data wrapped in ``MyDataset`` with
            ``train_transform``.
        val_data: Raw validation data wrapped in ``MyDataset`` with
            ``val_transform``.
        fold_idx: Zero-based fold index, or ``None`` for a single run; it only
            affects the start message and the checkpoint file name.
    """
    train_set = MyDataset(train_data, train_transform)
    train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
    val_set = MyDataset(val_data, val_transform)
    val_loader = DataLoader(val_set, batch_size=config.batch_size, shuffle=False)

    net = Net(model_name).to(device)
    # Alternatives left disabled in earlier revisions: FocalLoss(0.5), Adagrad.
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    top_acc = 0
    last_improved = 0

    if fold_idx is None:
        print('start')
        checkpoint_name = '{}.bin'.format(model_name)
    else:
        print('start fold: {}'.format(fold_idx + 1))
        checkpoint_name = '{}_fold{}.bin'.format(model_name, fold_idx)
    model_save_path = os.path.join(config.model_path, checkpoint_name)

    step_msg = 'the current step: {0}/{1}, train loss: {2:>5.2}, train acc: {3:>6.2%}'
    epoch_msg = 'the current epoch: {0}/{1}, val loss: {2:>5.2}, val acc: {3:>6.2%}, cost: {4}s {5}'

    for epoch in range(config.epochs_num):
        tick = int(time.time())
        net.train()
        print('epoch: ', epoch + 1)

        for step, (inputs, targets) in enumerate(train_loader, start=1):
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = net(inputs)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()

            if step % config.train_print_step != 0:
                continue
            batch_acc = accuracy(outputs, targets)
            print(step_msg.format(step, len(train_loader), loss.item(), batch_acc[0].item()))

        val_loss, val_acc = evaluate(net, val_loader, loss_fn)

        flag = ''
        if val_acc >= top_acc:
            # Ties also refresh the checkpoint and the patience window.
            top_acc = val_acc
            torch.save(net.state_dict(), model_save_path)
            last_improved = epoch
            flag = '*'

        tock = int(time.time())
        print(epoch_msg.format(epoch + 1, config.epochs_num, val_loss, val_acc, tock - tick, flag))

        scheduler.step()

        if epoch - last_improved > config.patience_epoch:
            print("No optimization for a long time, auto-stopping...")
            break

    # Drop the model before the caller builds the next one.
    del net
    gc.collect()
def adjust_learning_rate(optimizer, lr, gamma, step):
    """Set the learning rate of every parameter group to ``lr * gamma ** step``.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of dicts;
            each dict's ``'lr'`` entry is overwritten in place.
        lr: Base learning rate.
        gamma: Multiplicative decay factor.
        step: Exponent applied to ``gamma``.

    Returns:
        None. The optimizer is modified in place.
    """
    lr = lr * (gamma ** step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


if __name__ == "__main__":
    # 1. Parse command-line arguments, build the network, load its weights.
    args = getArgs()
    model = Net('train')
    print('-- Loading weights into state dict...')
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    pretrained_dict = torch.load(
        args.weight_path,
        map_location='cuda' if torch.cuda.is_available() else 'cpu')
    model_dict = model.state_dict()
    num_checkpoint_tensors = len(pretrained_dict)
    # Keep only tensors whose name exists in the model and whose shape
    # matches. The membership test matters: indexing model_dict with a key
    # it does not contain would raise KeyError instead of skipping the entry.
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items()
        if k in model_dict and np.shape(model_dict[k]) == np.shape(v)
    }
    num_skipped = num_checkpoint_tensors - len(pretrained_dict)
    if num_skipped:
        # Make partial loads visible instead of silently dropping weights.
        print('-- Skipped {} of {} checkpoint tensors (name or shape mismatch).'
              .format(num_skipped, num_checkpoint_tensors))
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)
    print('-- Loading weights finished.')

    # 2. Multi-GPU data parallelism.
    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model)
        cudnn.benchmark = True
        model = model.cuda()

    # 3. Create the object that computes the loss.
    criterion = MultiBoxLoss()