def main():
    """Entry point: parse CLI args, build the EM-routing capsule network,
    train it, and snapshot the best/final models.

    Side effects: sets module-level ``args``, ``train_writer`` and
    ``test_writer``; writes TensorBoard logs and model snapshots to
    ``args.snapshot_folder``.
    """
    global args, train_writer, test_writer
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if args.cuda else "cpu")

    # tensorboard logging
    train_writer = SummaryWriter(comment='train')
    test_writer = SummaryWriter(comment='test')

    # dataset
    num_class, img_dim, train_loader, test_loader = get_setting(args)

    # model
    # A, B, C, D = 64, 8, 16, 16
    A, B, C, D = 32, 32, 32, 32
    model = capsules(A=A, B=B, C=C, D=D, E=num_class,
                     iters=args.em_iters, add_decoder=args.add_decoder,
                     img_dim=img_dim).to(device)
    print("Number of trainable parameters: {}".format(
        sum(param.numel() for param in model.parameters())))

    criterion = CapsuleLoss(alpha=args.alpha, mode='bce',
                            num_class=num_class, add_decoder=args.add_decoder)
    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           weight_decay=args.weight_decay)

    # Baseline evaluation before any training so "best" has a starting value.
    best_loss, best_score = test(test_loader, model, criterion, 0, device)
    for epoch in range(1, args.epochs + 1):
        # FIX: the return value was previously bound to an unused local
        # (`scores`); train() is called for its side effects only.
        train(train_loader, model, criterion, optimizer, epoch, device)
        if epoch % args.test_intvl == 0:
            test_loss, test_score = test(test_loader, model, criterion,
                                         epoch * len(train_loader), device)
            # Snapshot whenever either metric improves.
            if test_loss < best_loss or test_score > best_score:
                snapshot(model, args.snapshot_folder, epoch)
            best_loss = min(best_loss, test_loss)
            best_score = max(best_score, test_score)
    print('best test score: {:.6f}'.format(best_score))

    train_writer.close()
    test_writer.close()

    # save end model
    snapshot(model, args.snapshot_folder, 'end_{}'.format(args.epochs))
def train(epochNum):
    """Train the ResNet50 crop classifier from scratch for ``epochNum`` epochs.

    Schedule: epochs 0-2 fit only the freshly initialised head
    (``model.fresh_params()``) with Adam at lr=1e-3; from epoch 3 on the whole
    network is fine-tuned. Two consecutive epochs without validation-loss
    improvement reload the best checkpoint and divide the lr by 10; a second
    plateau at lr=1e-4 stops training early.

    Checkpoints go to ``../model/ResNet50/`` and TensorBoard logs to
    ``../log/<date>/ResNet50/``.
    """
    # Log directory layout: /log/<date>/ResNet50/
    writer = SummaryWriter('../log/' + date + '/ResNet50/')
    train_dataset, val_dataset = CropDataset.split_Dataset(
        data_dir, train_val_ratio, IMAGE_SIZE, trian_transform, val_transform)
    train_dataLoader = DataLoader(train_dataset, BATCH_SIZE, num_workers=16, shuffle=True)
    val_dataLoader = DataLoader(val_dataset, BATCH_SIZE, num_workers=1, shuffle=False)
    model = getmodel()
    criterion = nn.CrossEntropyLoss().cuda()
    min_loss = 4.1  # only checkpoints beating this validation loss are saved
    print('min_loss is :%f' % (min_loss))
    min_acc = 0.80  # accuracy threshold for the *_acc_best.pth checkpoint
    patience = 0
    lr = 0.0
    momentum = 0.0  # informational only; Adam below does not use momentum
    for epoch in range(epochNum):
        print('Epoch {}/{}'.format(epoch, epochNum - 1))
        print('-' * 10)
        # First three epochs: train only the new fully-connected head.
        if epoch == 0 or epoch == 1 or epoch == 2:
            lr = 1e-3
            optimizer = torch.optim.Adam(model.fresh_params(), lr=lr,
                                         amsgrad=True, weight_decay=1e-4)
        else:
            optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                         amsgrad=True, weight_decay=1e-4)
        if epoch == 3:
            lr = 1e-3
            momentum = 0.9
            print('set lr=:%f,momentum=%f' % (lr, momentum))
        if patience == 2 and lr == 1e-3:
            # Validation loss rose twice in a row: fall back to the best
            # checkpoint and decay the lr (takes effect next epoch, since
            # this epoch's optimizer was already built above).
            patience = 0
            model.load_state_dict(
                torch.load('../model/ResNet50/' + date + '_loss_best.pth')['state_dict'])
            lr = lr / 10
            print('loss has increased lr divide 10 lr now is :%f' % (lr))
        if patience == 2 and lr == 1e-4:
            # FIX: the original assigned ``epochNum = epoch + 1`` here, which
            # cannot shorten an already-constructed range() and therefore
            # never stopped training; break to actually stop early.
            patience = 0
            break
        # Running means of the loss and accuracy over this epoch.
        running_loss = utils.RunningMean()
        running_corrects = utils.RunningMean()
        for batch_idx, (inputs, labels) in enumerate(train_dataLoader):
            model.train(True)  # training mode (dropout/batch-norm active)
            n_batchsize = inputs.size(0)
            optimizer.zero_grad()  # clear all parameter gradients
            inputs = inputs.cuda()
            labels = labels.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss.update(loss.item(), 1)  # accumulate this batch's loss
            _, preds = torch.max(outputs.data, 1)
            running_corrects.update(
                torch.sum(preds == labels.data).data, n_batchsize)  # batch accuracy
            loss.backward()
            optimizer.step()
            # Report training stats every 10 batches.
            if batch_idx % 10 == 9:
                print('(%s)[epoch:%d,batch:%d]:acc: %f,loss:%f' %
                      (str(datetime.datetime.now()), epoch,
                       batch_idx, running_corrects.value, running_loss.value))
                niter = epoch * len(train_dataset) / BATCH_SIZE + batch_idx
                writer.add_scalar('Train/Acc', running_corrects.value, niter)
                writer.add_scalar('Train/Loss', running_loss.value, niter)
            # Run a validation pass every 300 batches.
            if batch_idx % 300 == 299:
                lx, px = utils.predict(model, val_dataLoader)
                log_loss = criterion(px, lx)
                log_loss = log_loss.item()
                _, preds = torch.max(px, dim=1)
                accuracy = torch.mean((preds == lx).float())
                writer.add_scalar('Val/Acc', accuracy, niter)
                writer.add_scalar('Val/Loss', log_loss, niter)
                # FIX: the format string has six placeholders but the
                # timestamp argument was missing, which raised TypeError at
                # run time; supply str(datetime.datetime.now()).
                print(
                    '(%s)[epoch:%d,batch:%d]: val_acc:%f,val_loss:%f,val_total_len:%d'
                    % (str(datetime.datetime.now()), epoch, batch_idx,
                       accuracy, log_loss, len(val_dataset)))
        print('(%s)[epoch:%d] :acc: %f,loss:%f,lr:%f,patience:%d' %
              (str(datetime.datetime.now()), epoch, running_corrects.value,
               running_loss.value, lr, patience))
        # End-of-epoch validation.
        lx, px = utils.predict(model, val_dataLoader)
        log_loss = criterion(px, lx)
        log_loss = log_loss.item()
        _, preds = torch.max(px, dim=1)
        accuracy = torch.mean((preds == lx).float())
        writer.add_scalar('Val/Acc', accuracy,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        writer.add_scalar('Val/Loss', log_loss,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        print('(%s)[epoch:%d]: val_acc:%f,val_loss:%f,' %
              (str(datetime.datetime.now()), epoch, accuracy, log_loss))
        # Snapshot whenever the validation loss improves on min_loss.
        if log_loss < min_loss:
            try:
                fileName = date + '_loss_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                patience = 0
                min_loss = log_loss
                print('save new model loss,now loss is ', min_loss)
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
        else:
            patience += 1
        # Snapshot whenever the validation accuracy beats min_acc.
        if accuracy > min_acc:
            try:
                fileName = date + '_acc_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                min_acc = accuracy
                print('save new model acc,now acc is ', min_acc.item())
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
def trainWithRawData(path, epochNum):
    """Resume ResNet50 training from the checkpoint file at ``path``.

    Fine-tunes all parameters with SGD (momentum 0.9) starting at lr=1e-4.
    Three consecutive epochs without validation-loss improvement reload the
    best checkpoint and divide the lr by 5. The save thresholds (min_loss,
    min_acc) are seeded from the loaded checkpoint.
    """
    try:
        print('[+] loading modelParams...', end='', flush=True)
        modelParams = torch.load(path)
        print('Done')
    except IOError:
        print("Error: 没有找到文件或读取文件失败")
        # FIX: the original swallowed the error and fell through to a
        # guaranteed NameError on `modelParams`; propagate the failure.
        raise
    # Log directory layout: /log/<date>/ResNet50/
    writer = SummaryWriter('../log/' + date + '/ResNet50/')
    train_dataset, val_dataset = CropDataset.split_Dataset(
        data_dir, train_val_ratio, IMAGE_SIZE, trian_transform, val_transform)
    train_dataLoader = DataLoader(train_dataset, BATCH_SIZE, num_workers=16, shuffle=True)
    val_dataLoader = DataLoader(val_dataset, BATCH_SIZE, num_workers=1, shuffle=False)
    model = getmodel()
    criterion = nn.CrossEntropyLoss().cuda()
    model.load_state_dict(modelParams['state_dict'])
    min_loss = modelParams['val_loss']
    print('val_correct is %f' % (modelParams['val_correct']))
    print('min_loss is :%f' % (min_loss))
    min_acc = max(modelParams['val_correct'], 0.81)
    # NOTE: the checkpoint's optimizer state is intentionally not restored
    # (the unused ``optinizerSave`` binding was removed).
    patience = 0
    lr = 1e-4
    momentum = 0.9  # informational; the SGD below hard-codes momentum=0.9
    for epoch in range(epochNum):
        print('Epoch {}/{}'.format(epoch, epochNum - 1))
        print('-' * 10)
        if patience == 3:
            # Three epochs without improvement: reload best checkpoint and
            # divide the learning rate by 5.
            patience = 0
            model.load_state_dict(
                torch.load('../model/ResNet50/' + date + '_loss_best.pth')['state_dict'])
            lr = lr / 5
            print('loss has increased , lr now is :%f' % (lr))
        # Both branches of the original if/else built an identical optimizer;
        # create it once here instead.
        optimizer = torch.optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)
        # Running means of the loss and accuracy over this epoch.
        running_loss = utils.RunningMean()
        running_corrects = utils.RunningMean()
        for batch_idx, (inputs, labels) in enumerate(train_dataLoader):
            model.train(True)  # training mode (dropout/batch-norm active)
            n_batchsize = inputs.size(0)
            inputs = inputs.cuda()
            labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            _, preds = torch.max(outputs.data, 1)
            loss = criterion(outputs, labels)
            running_loss.update(loss.item(), 1)
            running_corrects.update(
                torch.sum(preds == labels.data).data, n_batchsize)
            loss.backward()
            optimizer.step()
            # Report training stats every 10 batches.
            if batch_idx % 10 == 9:
                print('(%s)[epoch:%d,batch:%d]:acc: %f,loss:%f' %
                      (str(datetime.datetime.now()), epoch, batch_idx,
                       running_corrects.value, running_loss.value))
                niter = epoch * len(train_dataset) / BATCH_SIZE + batch_idx
                writer.add_scalar('Train/Acc', running_corrects.value, niter)
                writer.add_scalar('Train/Loss', running_loss.value, niter)
            # Run a validation pass every 300 batches.
            if batch_idx % 300 == 299:
                lx, px = utils.predict(model, val_dataLoader)
                log_loss = criterion(px, lx)
                log_loss = log_loss.item()
                _, preds = torch.max(px, dim=1)
                accuracy = torch.mean((preds == lx).float())
                writer.add_scalar('Val/Acc', accuracy, niter)
                writer.add_scalar('Val/Loss', log_loss, niter)
                # FIX: the format string has six placeholders but the
                # timestamp argument was missing, which raised TypeError at
                # run time; supply str(datetime.datetime.now()).
                print(
                    '(%s)[epoch:%d,batch:%d]: val_acc:%f,val_loss:%f,val_total_len:%d'
                    % (str(datetime.datetime.now()), epoch, batch_idx,
                       accuracy, log_loss, len(val_dataset)))
        print('(%s)[epoch:%d] :acc: %f,loss:%f,lr:%f,patience:%d' %
              (str(datetime.datetime.now()), epoch, running_corrects.value,
               running_loss.value, lr, patience))
        # End-of-epoch validation.
        lx, px = utils.predict(model, val_dataLoader)
        log_loss = criterion(px, lx)
        log_loss = log_loss.item()
        _, preds = torch.max(px, dim=1)
        accuracy = torch.mean((preds == lx).float())
        writer.add_scalar('Val/Acc', accuracy,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        writer.add_scalar('Val/Loss', log_loss,
                          (epoch + 1) * len(train_dataset) / BATCH_SIZE)
        print('(%s)[epoch:%d]: val_acc:%f,val_loss:%f,' %
              (str(datetime.datetime.now()), epoch, accuracy, log_loss))
        # Snapshot whenever the validation loss improves on min_loss.
        if log_loss < min_loss:
            try:
                fileName = date + '_loss_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                patience = 0
                min_loss = log_loss
                print('save new model loss,now loss is ', min_loss)
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
        else:
            patience += 1
        # Snapshot whenever the validation accuracy beats min_acc.
        if accuracy > min_acc:
            try:
                fileName = date + '_acc_best.pth'
                utils.snapshot(
                    '../model/ResNet50/', fileName, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'val_loss': log_loss,
                        'val_correct': accuracy
                    })
                min_acc = accuracy
                print('save new model acc,now acc is ', min_acc.item())
            except IOError:
                print("Error: 没有找到文件或读取文件失败")
# NOTE(review): fragment of a larger training loop — the enclosing function
# and the body of the final `elif` lie outside this chunk.
# Persist scheduler state only when a ScheduledOptim is in use.
DICT = {"ran_epoch": epoch, "n_current_steps": optimizer.n_current_steps,
        "delta": optimizer.delta} if type(optimizer) == ScheduledOptim else {"ran_epoch": epoch}
POPEN.update_ini_file(DICT, logger)
# -----------| compare the result |-----------
if (best_loss > val_total_loss):
    # update best performance
    best_loss = min(best_loss, val_total_loss)
    best_acc = max(best_acc, val_avg_acc)
    best_epoch = epoch
    # save a checkpoint; NOTE(review): the whole model object is pickled
    # under 'state_dict' (the state_dict() call is commented out) — confirm
    # this is intentional, it ties the checkpoint to the class definition.
    utils.snapshot(POPEN.vae_pth_path, {
        'epoch': epoch + 1,
        'validation_acc': val_avg_acc,
        # 'state_dict': model.state_dict(),
        'state_dict': model,
        'validation_loss': val_total_loss,
        'optimizer': optimizer.state_dict(),
    })
    # update the popen (persist run metadata back to the ini file)
    POPEN.update_ini_file({'run_name': run_name,
                           "ran_epoch": epoch,
                           "best_acc": best_acc}, logger)
elif (epoch - best_epoch >= 30) & ((type(optimizer) == ScheduledOptim)):
    # plateau for 30 epochs: widen the scheduled optimizer's delta
    optimizer.increase_delta()
elif (epoch - best_epoch >= 60) & (epoch > POPEN.max_epoch / 2):
    # at the late phase of training
    # (branch body continues beyond this chunk)
def main():
    """Parse CLI arguments, train a model, evaluate it, and persist
    snapshots, a CSV result row and (optionally) the learning curves.

    All artifact names are derived from the hyper-parameter values so each
    run is uniquely identifiable. Side effect: sets module-level ``args``.
    """
    global args
    args = parser.parse_args()
    print()
    print('Command-line argument values:')
    for key, value in vars(args).items():
        print('-', key, ':', value)
    print()

    # Encode the full hyper-parameter set in the artifact file names.
    params = [
        args.model, path_to_save_string(args.dataset), args.viewpoint_modulo,
        args.batch_size, args.epochs, args.lr, args.weight_decay, args.seed,
        args.routing_iters
    ]
    model_name = '_'.join([str(x) for x in params]) + '.pth'
    header = 'model,dataset,viewpoint_modulo,batch_size,epochs,lr,weight_decay,seed,em_iters,accuracy'
    snapshot_path = os.path.join('.', 'snapshots', model_name)
    data_path = os.path.join('.', 'results', 'training_data', model_name)
    result_path = os.path.join('.', 'results', 'pytorch_train.csv')
    make_dirs_if_not_exist([snapshot_path, data_path, result_path])

    # Seed every RNG we rely on for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    model, criterion, optimizer, scheduler = load_model(
        args.model, device_ids=args.device_ids, lr=args.lr,
        routing_iters=args.routing_iters)
    num_class, train_loader, test_loader = load_datasets(
        args.dataset, args.batch_size, args.test_batch_size,
        args.viewpoint_modulo)

    best_acc = 0
    training_accuracies = []
    test_accuracies = []

    if args.append:
        model.load_state_dict(torch.load(snapshot_path))

    # FIX: initialise `epoch` so a KeyboardInterrupt raised before the first
    # loop iteration cannot trigger a NameError in the handler below.
    epoch = 1
    try:
        for epoch in range(1, args.epochs + 1):
            print()
            acc = train(train_loader, model, criterion, optimizer, epoch,
                        epochs=args.epochs, log_interval=args.log_interval)
            training_accuracies.append(acc)
            scheduler.step(acc)
            print('Epoch accuracy was %.1f%%. Learning rate is %.9f.'
                  % (acc, optimizer.state_dict()['param_groups'][0]['lr']))
            if epoch % args.test_interval == 0:
                test_acc, __, __, __ = test(test_loader, model, criterion,
                                            chunk=args.test_size)
                test_accuracies.append(test_acc)
                if test_acc > best_acc:
                    best_acc = test_acc
    except KeyboardInterrupt:
        # Graceful cancel: record how many epochs actually completed.
        print('Cancelled training after %d epochs' % (epoch - 1))
        args.epochs = epoch - 1

    # Final full evaluation and bookkeeping.
    acc, predictions, labels, logits = test(test_loader, model, criterion,
                                            chunk=1)
    print(f'Accuracy: {acc:.2f}% (best: {best_acc:.2f}%)')
    to_write = params + [acc.cpu().numpy()]
    append_to_csv(result_path, to_write, header=header)
    snapshot(snapshot_path, model)
    # torch.save((accuracies, labels, predictions), data_path)
    if args.learn_curve != '':
        make_dirs_if_not_exist(args.learn_curve)
        torch.save((training_accuracies, test_accuracies), args.learn_curve)
# print some metrics train_samples_size = len(train_loader) * BATCH_SIZE valid_samples_size = len(valid_loader) * BATCH_SIZE loss_train_epoch = loss_train / train_samples_size loss_valid_epoch = loss_valid / valid_samples_size error_train_epoch = 100 - 100 * (acc_train / train_samples_size) error_valid_epoch = 100 - 100 * (acc_valid / valid_samples_size) error_history.append((error_train_epoch, error_valid_epoch)) loss_history.append((loss_train_epoch, loss_valid_epoch)) print( 'Epoch: {} train loss: {:.5f} valid loss: {:.5f} train error: {:.2f} % valid error: {:.2f} %' .format(epoch, loss_train_epoch, loss_valid_epoch, error_train_epoch, error_valid_epoch)) # check if model is better if error_valid_epoch < best_error[1]: best_error = (epoch, error_valid_epoch) snapshot(SAVED_MODELS_DIR, RUN_TIME, RUN_NAME, True, epoch, error_valid_epoch, model.state_dict(), model.optimizer.state_dict()) # check that the model is not doing worst over the time if best_error[0] + PATIENCE < epoch: print('Overfitting. Stopped at epoch {}.'.format(epoch)) break epoch += 1 plot_loss(RUN_TIME, RUN_NAME, loss_history) plot_error(RUN_TIME, RUN_NAME, error_history)
def train():
    """Train the VNet segmentation model (TensorFlow 2 / Keras).

    Pipeline: worker processes deform/augment volumes into a bounded queue,
    a generator-backed tf.data.Dataset feeds batches to a @tf.function train
    step (SGD, exponential LR decay), and TensorBoard receives the traced
    graph plus periodic scalar/image summaries. Weights are snapshotted
    periodically and once more at the end.
    """
    # Load data and prepare training samples
    numpyImages, numpyGT = load_data()
    dataQueue = Queue(30)  # bounded queue (max 30 prepared samples)
    dataPreparation = [None] * cfg.nProc
    # Worker-process creation: each daemon process keeps filling the queue.
    for proc in range(cfg.nProc):
        dataPreparation[proc] = Process(target=prepare_data_thread,
                                        args=(dataQueue, numpyImages, numpyGT))
        dataPreparation[proc].daemon = True
        dataPreparation[proc].start()

    def data_gen():
        # Pull exactly numIterations * batchSize prepared (image, label)
        # pairs off the queue; the third tuple element is discarded.
        for _ in range(cfg.numIterations * cfg.batchSize):
            defImg, defLab, _ = dataQueue.get()
            yield defImg, defLab

    print("Load data.")
    # tensorflow data loader built on the generator above
    h, w, d = params["VolSize"]
    dataset = tf.data.Dataset.from_generator(
        data_gen, (tf.float32, tf.int32),
        (tf.TensorShape([h, w, d, 1]), tf.TensorShape([h, w, d])))
    dataset = dataset.batch(batch_size=cfg.batchSize)

    print("Build model.")
    # build model
    model = vnet.VNet([h, w, d, 1], cfg.batchSize, cfg.ncls)
    learning_rate = cfg.baseLR
    # staircase=True: LR decays in discrete steps every decay_steps
    learning_rate = K.optimizers.schedules.ExponentialDecay(
        learning_rate, cfg.decay_steps, cfg.decay_rate, True)
    optim = K.optimizers.SGD(learning_rate, momentum=0.99)
    criterion = K.losses.SparseCategoricalCrossentropy(from_logits=True)

    @tf.function
    def train_step(x, y):
        # Forward
        with tf.GradientTape() as tape:
            prediction = model(x)
            losses = criterion(y, prediction)
        # Backward
        with tf.name_scope("Gradients"):
            gradients = tape.gradient(losses, model.trainable_variables)
            optim.apply_gradients(zip(gradients, model.trainable_variables))
        return losses, prediction

    # File writer
    writer, logdir = utils.summary_writer(cfg)
    # Trace graph
    tf.summary.trace_on(graph=True)
    # dry run for tracing graph (step=1)
    train_step(tf.zeros([1, h, w, d, 1]), tf.zeros([1, h, w, d]))
    tf.summary.trace_export("OpGraph", 0)

    print("Start training.")
    save_path = logdir / "snapshots"
    total_loss = 0
    dice = None
    for trImg, trLab in dataset:
        loss, pred = train_step(trImg, trLab)
        step = optim.iterations.numpy()  # (step start from 2, after dry run)
        loss_val = loss.numpy()
        # Loss moving average; warm-start with the raw loss for early steps
        total_loss = loss_val if step < 5 else \
            cfg.moving_average * total_loss + (1 - cfg.moving_average) * loss_val
        # Logging: every 10 steps early on, then every log_interval steps
        if (step < 500 and step % 10 == 0) or step % cfg.log_interval == 0:
            dice = utils.compute_dice(trLab, pred)
            print(f"Step: {step}, Loss: {loss_val:.4f}, Dice: {dice:.4f}, "
                  f"LR: {learning_rate(step).numpy():.2E}")
            # Summary scalars and images (middle axial slice of the volume)
            tf.summary.scalar("loss", total_loss, step=step)
            tf.summary.scalar("dice", dice, step=step)
            tf.summary.image("trImg", trImg[..., d // 2, :], step=step)
            tf.summary.image("pred", pred[..., d // 2, :], step=step)
        # Take snapshots
        if step == 2 or step % cfg.snap_shot_interval == 0:
            filepath = utils.snapshot(model, save_path, step)
            print(f"Model weights saved (Path: {filepath}).")

    # Ending: final snapshot and writer cleanup
    filepath = utils.snapshot(model, save_path, optim.iterations.numpy())
    print(f"Model weights saved ({filepath}).\nTraining ended.")
    writer.close()
# NOTE(review): fragment of a per-epoch loop — `epoch`, `model`, `criterion`,
# `optimizer`, the metric objects (train_acc/test_acc/pr) and `train_device`
# are defined outside this chunk.
for batch_i, (sent, length, label) in enumerate(train_loader):
    sent = sent.to(train_device)
    length = length.to(train_device)
    label = label.to(train_device)
    pred = model(sent, length)
    loss = criterion(pred, label)
    # compute gradient and do optimizer step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f'INFO: [{epoch:02d}/{batch_i:04d}], avgloss: {loss.item():.4f}',
          end=', ')
    # training acc
    train_acc.update((pred, label))
    pr.update((pred, label))
print(f'Training acc: {train_acc.compute() * 100:.2f}%')
# testing acc
model.eval()
# NOTE(review): the model is moved to CPU for evaluation but never moved
# back to train_device, and train() mode is not restored here — confirm the
# surrounding loop handles both before the next epoch.
model.to(torch.device('cpu'))
for batch_i, (sent, length, label) in enumerate(test_loader):
    with torch.no_grad():
        pred = model(sent, length)
    test_acc.update((pred, label))
print(f' Testing acc: {test_acc.compute() * 100:.2f}%')
# saving models: snapshot only once test accuracy clears 90%
if test_acc.compute() > 0.9:
    snapshot(model, epoch, args.save_path)
p, r = pr.compute()
print(f'P: {p} \n R: {r}')