def train_model(self):
    best_acc = 0.0
    print("Beginning Training for", self.epochs, "Epochs")
    for epoch in range(1, self.epochs + 1):
        if epoch == 80:
            self.lr = 0.01
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr
        elif epoch == 140:
            self.lr = 0.001
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr

        train_utils.train(self, epoch)
        acc, loss = train_utils.evaluate(self)
        # acc = round(acc.item(), 4)

        # Save best-performing model
        if best_acc < acc:
            best_model_wts = copy.deepcopy(self.model.state_dict())
            best_epoch = epoch
            best_acc = acc
            best_loss = loss
            print(f"Saving best model: Loss={best_loss}, Acc={best_acc}, Ep={best_epoch}")
            # Save best model
            torch.save(best_model_wts, self.checkpoint_path.format(epoch=best_epoch, acc=best_acc))

    # Record metrics
    self.overall_log.append(
        {"Experiment": self.exp_name, "Epoch": best_epoch,
         "Test_Acc": round(best_acc * 100, 2), "Test_Loss": best_loss})
    train_utils.record_overall_metrics(self, ['Experiment', 'Epoch', "Test_Acc", "Test_Loss"])
def fine_tune(self):
    best_acc = 0.0
    self.lr = 0.01
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr,
                                     momentum=0.9, weight_decay=5e-4)
    print("Beginning Training for 40 Epochs")  # fine-tuning uses a fixed 40-epoch schedule
    for epoch in range(1, 41):
        if epoch == 10:
            self.lr = self.lr * 0.1
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr
        elif epoch == 20:
            self.lr = self.lr * 0.1
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr

        train_utils.train(self, epoch)
        acc, loss = train_utils.evaluate(self)
        acc = round(acc.item(), 4)

        # Save best-performing model
        if best_acc < acc:
            best_model_wts = copy.deepcopy(self.model.state_dict())
            best_epoch = epoch
            best_acc = acc
            best_loss = loss
            # Save best model
            # torch.save(best_model_wts, self.checkpoint_path.format(epoch=best_epoch, acc=round(best_acc * 100, 2)))

    # Record metrics
    self.overall_log.append(
        {"Experiment": self.exp_name, "Epoch": best_epoch,
         "Test_Acc": best_acc, "Test_Loss": best_loss})
    train_utils.record_overall_metrics(self, ['Experiment', 'Epoch', "Test_Acc", "Test_Loss"])
def train_model(self):
    best_acc = 0.0
    print("Beginning Training for", self.epochs, "Epochs")
    for epoch in range(1, self.epochs + 1):
        if epoch == 80:
            self.lr = 0.01
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr
        elif epoch == 140:
            self.lr = 0.001
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr

        train_utils.train(self, epoch)
        acc, loss = train_utils.evaluate(self)
        acc = round(acc.item(), 4)
        loss = round(loss, 4)

        # Save best-performing model
        if best_acc < acc:
            best_model_wts = copy.deepcopy(self.model.state_dict())
            best_epoch = epoch
            best_acc = acc
            best_loss = loss
            # Save best model
            # torch.save(best_model_wts, self.model_path.format(task=self.task, epoch=best_epoch, acc=round(best_acc * 100, 2)))

    # Record metrics
    train_utils.record_metrics(self)
    self.overall_log.append(
        {"Task": self.task, "Epoch": best_epoch,
         "Test_Acc": round(best_acc * 100, 2), "Test_Loss": best_loss})
    train_utils.record_overall_metrics(self)
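# Aside (hedged sketch, not part of the snippets above): the manual per-epoch loops over
# optimizer.param_groups above implement a step schedule (multiply the LR by 0.1 at fixed
# epochs). PyTorch's built-in MultiStepLR does the same bookkeeping; the model and optimizer
# below are placeholders, not the classes used in the code above.
import torch
import torch.nn as nn

model = nn.Linear(10, 2)  # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[80, 140], gamma=0.1)

for epoch in range(1, 201):
    # train_utils.train(...) / train_utils.evaluate(...) would run here
    scheduler.step()  # lr drops to 0.01 after the 80th step and to 0.001 after the 140th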
def main():
    start_time = time()
    in_arg = get_args_train()
    data_dir = in_arg.data_dir
    device = get_device(in_arg.gpu)
    # print(device)
    dataloaders = get_dataloaders(data_dir)
    criterion = get_criterion()
    model = get_model(device=device, arch=in_arg.arch, hidden_units=in_arg.hidden_units,
                      data_dir=in_arg.data_dir, save_dir=in_arg.save_dir)
    # print(model)
    optimizer = get_optimizer(model, in_arg.learning_rate)
    # print(optimizer)
    train(model, criterion, optimizer, epochs=in_arg.epochs, device=device,
          train_loader=dataloaders['train'], valid_loader=dataloaders['valid'])
    tot_time = time() - start_time
    print(f"\n** Total Elapsed Runtime: {tot_time:.3f} seconds")
def finetune_classifier(self, task, ittr="0"):
    print('-' * 50)
    print("Training task:\t", task)
    self.data_loaders = train_utils.CIFAR_dl_task(self, task, self.per_task_norm)
    best_acc = 0.0

    # Setup model: freeze the backbone and replace the classification head
    model = self.backbone_model
    for param in model.parameters():
        param.requires_grad = False
    model.fc = nn.Linear(512, 5)
    self.model = model.to(self.device)

    self.lr = 0.01
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr,
                                     momentum=0.9, weight_decay=5e-4)
    print("Finetuning for", self.epochs, "Epochs")
    for epoch in range(1, self.epochs + 1):
        if epoch == 10:
            self.lr = self.lr * 0.1
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr
        elif epoch == 20:
            self.lr = self.lr * 0.1
            for group in self.optimizer.param_groups:
                group['lr'] = self.lr

        train_utils.train(self, epoch)
        acc, loss = train_utils.evaluate(self)
        acc = round(acc.item(), 4)
        loss = round(loss, 4)

        # Save best-performing model
        if best_acc < acc:
            best_model_wts = copy.deepcopy(self.model.state_dict())
            best_acc = acc
            best_loss = loss
            best_epoch = epoch
            # Save best model
            torch.save(
                best_model_wts,
                self.classifier_path.format(exp=ittr, task=task, epoch=best_epoch,
                                            acc=round(best_acc * 100, 2)))

    # Record metrics
    self.classifier_results.append({
        "Task": task,
        "Acc": round(best_acc * 100, 2),
        "Loss": best_loss
    })
def main():
    config = [(64, 3, 1, 1), (64, 3, 1, 1), (1, 3, 1, 1)]
    # config = [(64, 9, 1, 4), (32, 1, 1, 0), (3, 5, 1, 2)]
    # config = [(64, 9, 1, 0), (32, 1, 1, 0), (3, 5, 1, 0)]
    # config: (output_ch, kernel_size, stride, padding_size)
    model = SRCNN(config).to(DEVICE)
    loss_function = nn.MSELoss(reduction='mean')
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    test_data = generate_data('test')
    for epoch in range(EPOCH):
        train_data = generate_data('train')
        train(model, train_data, loss_function, optimizer, DEVICE)
        test(model, test_data, loss_function, epoch, DEVICE)
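# Hedged sketch (not the SRCNN class used above): one way a model could consume a config of
# (output_ch, kernel_size, stride, padding) tuples like the one defined in main(). The class
# name SRCNNSketch and the in_channels=1 default are illustrative assumptions only.
import torch.nn as nn

class SRCNNSketch(nn.Module):
    def __init__(self, config, in_channels=1):
        super().__init__()
        layers = []
        for i, (out_ch, k, s, p) in enumerate(config):
            layers.append(nn.Conv2d(in_channels, out_ch, kernel_size=k, stride=s, padding=p))
            if i < len(config) - 1:
                layers.append(nn.ReLU(inplace=True))  # no activation after the final layer
            in_channels = out_ch
        self.body = nn.Sequential(*layers)

    def forward(self, x):
        return self.body(x)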
def training():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data_root = "/home/glazkova/ProbabilisticUnet/data"

    img_transform_func = transforms.Compose([
        transforms.Resize((256, 512), interpolation=PIL.Image.BILINEAR),
        transforms.ToTensor(),
    ])
    labels_transform_func = transforms.Compose([
        transforms.Resize((256, 512), interpolation=PIL.Image.NEAREST),
        transforms.Lambda(lambda x: id_to_train_id[x]),
        transforms.ToTensor()
    ])

    train_dataset = TransformedCityDataset(root=data_root, mode="fine", split="train",
                                           target_type="semantic")
    test_dataset = datasets.Cityscapes(root=data_root, mode="fine", split="val",
                                       target_type="semantic",
                                       transform=img_transform_func,
                                       target_transform=labels_transform_func)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=train_batch_size)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=test_batch_size)

    # iter_num = 240000
    # n_epochs = iter_num // (len(train_dataset) // batch_size)
    n_epochs = 100

    model = ProbUNet(num_classes, latent_space_size)
    # model.load_state_dict(torch.load("results/model"))
    model.cuda()
    opt = torch.optim.Adam(model.parameters(), lr=0.0001)
    scheduler = StepLR(opt, step_size=5, gamma=0.9)
    train_utils.train(model, opt, scheduler, n_epochs, train_loader, test_loader,
                      save_path="results/final_3D/")
def main_(model_main):  # def main(CFG):
    model = model_main.to(CFG['device'])

    # Model training, continued training and testing
    #################################################################
    if CFG['train_or_test'] == 'train':
        # optimizer = optim.Adadelta(model.parameters(), lr=CFG['lr'])  # optimizer
        optimizer = optim.SGD(model.parameters(), lr=CFG['lr'], momentum=CFG['momentum'])
        scheduler = StepLR(optimizer, step_size=2, gamma=CFG['gamma'])  # learning-rate decay schedule

        result_trace = np.zeros([1, 7])
        loss_trace = np.zeros([1, 12])  # record loss and acc of every epoch
        acc_trace = np.zeros([1, 12])
        for epoch in range(0, 20):  # loop over epochs
            start_time = datetime.datetime.now()  # training start time
            train_loss_epoch_i, train_acc_epoch_i = train(CFG, model, train_loader, optimizer, epoch)
            end_time = datetime.datetime.now()
            time_cost = (end_time - start_time).seconds  # training time, used in result_epoch_i below
            print('Time cost:', time_cost)
            test_loss_epoch_i, test_acc_epoch_i, f4t_and_label = test(CFG, model, test_loader, is_print=True)

            result_epoch_i = [epoch, train_acc_epoch_i[0], train_loss_epoch_i[0],
                              test_acc_epoch_i[0], test_loss_epoch_i[0],
                              scheduler.get_lr()[0], time_cost]
            result_trace = np.vstack([result_trace, np.array(result_epoch_i).reshape(1, len(result_epoch_i))])
            loss_trace = np.vstack([loss_trace, np.array([train_loss_epoch_i + test_loss_epoch_i]).reshape(1, 12)])
            acc_trace = np.vstack([acc_trace, np.array([train_acc_epoch_i + test_acc_epoch_i]).reshape(1, 12)])
            if epoch > 0:
                scheduler.step()

        if CFG['save_model']:
            # base file name; 'STI' stands for Source Trained In
            pt_name = ('[STI]_' + model.name + '_' + CFG['dataset_choose'] +
                       '_epoch' + str(0) + 'to' + str(epoch) +
                       '_sample' + str(CFG['NO_train'][0]) + 'to' + str(CFG['NO_train'][1]))
            # plot_curve(result_trace[:, 0], [result_trace[:, 1], result_trace[:, 3]], '结果图/' + pt_name + '_ACC.png',
            #            xlabel='Epoch', ylabel='ACC', title='ACC', legend=['Training_Accuracy', 'Testing_Accuracy'])
            pt = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(),
                  'scheduler': scheduler.state_dict(), 'CFG': CFG, 'model_name': model.name,
                  'result_trace': result_trace, 'loss_trace': loss_trace, 'acc_trace': acc_trace}
            torch.save(pt, pt_name + '_' + device.type + '_rand' + str(CFG['random_seed']) + '.pt')
def main():
    generator = srgan.SRGAN_gen().to(device)
    discriminator = srgan.SRGAN_dis().to(device)
    params = list(generator.parameters()) + list(discriminator.parameters())
    optimizer = optim.Adam(params, lr=1e-4)

    trainset = TrainDataset()
    train_loader = DataLoader(dataset=trainset, batch_size=BATCH_SIZE, shuffle=True)

    test_data = Image.open('./SR_dataset/Set5/001_HR.png')
    test_data = transforms.ToTensor()(test_data)
    test_data = test_data.unsqueeze(0)
    test_data = test_data.to(device)

    for epoch in range(10000):
        train(generator, discriminator, optimizer, train_loader, device, epoch)
        if epoch % 1000 == 0:
            test(generator, discriminator, test_data, epoch, device)
def fit():
    (
        train_img,
        test_img,
        train_labels,
        test_labels,
        train_orig_labels,
        test_orig_targets,
    ) = model_selection.train_test_split(IMAGES, LABELS_ENCODED, LABELS_NAMES,
                                         test_size=0.1, random_state=2020)

    train_dataset = OcrDataset(image_path=train_img, labels=train_labels,
                               resize=(IMAGE_HEIGHT, IMAGE_WIDTH))
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                               num_workers=NUM_WORKERS, shuffle=True)
    test_dataset = OcrDataset(image_path=test_img, labels=test_labels,
                              resize=(IMAGE_HEIGHT, IMAGE_WIDTH))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                              num_workers=NUM_WORKERS, shuffle=False)

    model = OcrModel_v0(num_characters=len(labels_encoded.classes_))
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.8,
                                                           patience=2, verbose=True)
    for epoch in range(EPOCHS):
        train_loss = train(model, train_loader, optimizer)
        valid_preds, valid_loss = evaluate(model, test_loader)
        scheduler.step(valid_loss)  # drive the ReduceLROnPlateau scheduler with the validation loss

        valid_final_preds = []
        for pred in valid_preds:
            # print(pred)
            cur_preds = decode_preds(pred, labels_encoded)
            valid_final_preds.extend(cur_preds)

        show_preds_list = list(zip(test_orig_targets, valid_final_preds))[1:3]
        pprint(show_preds_list)
        pprint("-" * 90)
        pprint(
            f"Epoch: {epoch} | Train loss = {train_loss} | Valid loss = {valid_loss} |"
        )
        pprint("-" * 90)
def main(args):
    train_loader, val_loader = custom_data_loader.customDataloader(args)
    model = custom_model.buildModel(args)
    optimizer, scheduler, records = solver_utils.configOptimizer(args, model)
    criterion = solver_utils.Criterion(args)
    recorder = recorders.Records(args.log_dir, records)
    tf_train_writer, tf_test_writer = tfboard.tensorboard_init()

    for epoch in range(args.start_epoch, args.epochs + 1):
        scheduler.step()
        recorder.insertRecord('train', 'lr', epoch, scheduler.get_lr()[0])

        train_utils.train(args, train_loader, model, criterion, optimizer, log, epoch,
                          recorder, tf_train_writer)
        if epoch % args.save_intv == 0:
            model_utils.saveCheckpoint(args.cp_dir, epoch, model, optimizer,
                                       recorder.records, args)
        if epoch % args.val_intv == 0:
            test_utils.test(args, 'val', val_loader, model, log, epoch, recorder,
                            tf_test_writer)
def my_main(_run, lr, weight_decay, message, use_gpu, epochs, save_images,
            experiment_folder):
    print(message)
    print("Use gpu: {}".format(use_gpu))
    # print(_run)
    # create_dirs()
    model = ColorNet()
    criterion = nn.MSELoss()
    if use_gpu:
        criterion = criterion.cuda()
        model = model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    train_folder = "places365_standard/train"
    val_folder = "places365_standard/val"
    train_loader = get_train_loader(train_folder)
    validation_loader = get_val_loader(val_folder)

    os.makedirs(experiment_folder + "outputs/color", exist_ok=True)
    os.makedirs(experiment_folder + "outputs/gray", exist_ok=True)
    os.makedirs(experiment_folder + "checkpoints", exist_ok=True)

    best_losses = 1e10
    print("Epochs: {}".format(epochs))
    for epoch in range(epochs):
        # Train for one epoch, then validate
        train(train_loader, model, criterion, optimizer, epoch, _run)
        with torch.no_grad():
            losses = validate(validation_loader, model, criterion, save_images, epoch, _run)
        # Save checkpoint and replace old best model if current model is better
        if losses < best_losses:
            best_losses = losses
            torch.save(
                model.state_dict(),
                experiment_folder + "checkpoints/model-epoch-{}-losses-{:.3f}.pth".format(
                    epoch + 1, losses),
            )
print("train_dataset.labels.shape", train_dataset.labels.shape) print("test_dataset.labels.shape", test_dataset.labels.shape) print("train_dataset",train_dataset) print("test_dataset",test_dataset) # create models if "densenet" in cfg.model: model = xrv.models.DenseNet(num_classes=train_dataset.labels.shape[1], in_channels=1, **xrv.models.get_densenet_params(cfg.model)) elif "resnet101" in cfg.model: model = torchvision.models.resnet101(num_classes=train_dataset.labels.shape[1], pretrained=False) #patch for single channel model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False) elif "shufflenet_v2_x2_0" in cfg.model: model = torchvision.models.shufflenet_v2_x2_0(num_classes=train_dataset.labels.shape[1], pretrained=False) #patch for single channel model.conv1[0] = torch.nn.Conv2d(1, 24, kernel_size=3, stride=2, padding=1, bias=False) else: raise Exception("no model") train_utils.train(model, train_dataset, cfg) print("Done") # test_loader = torch.utils.data.DataLoader(test_dataset, # batch_size=cfg.batch_size, # shuffle=cfg.shuffle, # num_workers=0, pin_memory=False)
def main(tiny_images=None, model="cnn", augment=False, use_scattering=False,
         batch_size=2048, mini_batch_size=256, lr=1, lr_start=None, optim="SGD",
         momentum=0.9, noise_multiplier=1, max_grad_norm=0.1, epochs=100,
         bn_noise_multiplier=None, max_epsilon=None, data_size=550000, delta=1e-6,
         logdir=None):
    logger = Logger(logdir)
    device = get_device()

    bs = batch_size
    assert bs % mini_batch_size == 0
    n_acc_steps = bs // mini_batch_size

    train_data, test_data = get_data("cifar10", augment=augment)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=100, shuffle=False,
                                               num_workers=4, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=100, shuffle=False,
                                              num_workers=4, pin_memory=True)

    if isinstance(tiny_images, torch.utils.data.Dataset):
        train_data_aug = tiny_images
    else:
        print("loading tiny images...")
        train_data_aug, _ = get_data("cifar10_500K", augment=augment,
                                     aux_data_filename=tiny_images)

    scattering, K, (h, w) = None, None, (None, None)
    pre_scattered = False
    if use_scattering:
        scattering, K, (h, w) = get_scatter_transform("cifar10_500K")
        scattering.to(device)

    # if the whole data fits in memory, pre-compute the scattering
    if use_scattering and data_size <= 50000:
        loader = torch.utils.data.DataLoader(train_data_aug, batch_size=100,
                                             shuffle=False, num_workers=4)
        train_data_aug = get_scattered_dataset(loader, scattering, device, data_size)
        pre_scattered = True

    assert data_size <= len(train_data_aug)
    num_sup = min(data_size, 50000)
    num_batches = int(np.ceil(50000 / mini_batch_size))  # cifar-10 equivalent

    train_batch_sampler = SemiSupervisedSampler(data_size, num_batches, mini_batch_size)
    train_loader_aug = torch.utils.data.DataLoader(
        train_data_aug, batch_sampler=train_batch_sampler,
        num_workers=0 if pre_scattered else 4,
        pin_memory=not pre_scattered)

    rdp_norm = 0
    if model == "cnn":
        if use_scattering:
            save_dir = f"bn_stats/cifar10_500K"
            os.makedirs(save_dir, exist_ok=True)
            bn_stats, rdp_norm = scatter_normalization(
                train_loader, scattering, K, device, data_size, num_sup,
                noise_multiplier=bn_noise_multiplier, orders=ORDERS, save_dir=save_dir)
            model = CNNS["cifar10"](K, input_norm="BN", bn_stats=bn_stats)
            model = model.to(device)
            if not pre_scattered:
                model = nn.Sequential(scattering, model)
        else:
            model = CNNS["cifar10"](in_channels=3, internal_norm=False)
    elif model == "linear":
        save_dir = f"bn_stats/cifar10_500K"
        os.makedirs(save_dir, exist_ok=True)
        bn_stats, rdp_norm = scatter_normalization(
            train_loader, scattering, K, device, data_size, num_sup,
            noise_multiplier=bn_noise_multiplier, orders=ORDERS, save_dir=save_dir)
        model = ScatterLinear(K, (h, w), input_norm="BN", bn_stats=bn_stats)
        model = model.to(device)
        if not pre_scattered:
            model = nn.Sequential(scattering, model)
    else:
        raise ValueError(f"Unknown model {model}")

    model.to(device)
    if pre_scattered:
        test_loader = get_scattered_loader(test_loader, scattering, device)

    print(f"model has {get_num_params(model)} parameters")

    if optim == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    privacy_engine = PrivacyEngine(
        model,
        bs,
        data_size,
        alphas=ORDERS,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )
    privacy_engine.attach(optimizer)

    best_acc = 0
    flat_count = 0

    for epoch in range(0, epochs):
        print(f"\nEpoch: {epoch} ({privacy_engine.steps} steps)")
        train_loss, train_acc = train(model, train_loader_aug, optimizer, n_acc_steps=n_acc_steps)
        test_loss, test_acc = test(model, test_loader)

        if noise_multiplier > 0:
            print(f"sample_rate={privacy_engine.sample_rate}, "
                  f"mul={privacy_engine.noise_multiplier}, "
                  f"steps={privacy_engine.steps}")
            rdp_sgd = get_renyi_divergence(
                privacy_engine.sample_rate, privacy_engine.noise_multiplier
            ) * privacy_engine.steps
            epsilon, _ = get_privacy_spent(rdp_norm + rdp_sgd, target_delta=delta)
            epsilon2, _ = get_privacy_spent(rdp_sgd, target_delta=delta)
            print(f"ε = {epsilon:.3f} (sgd only: ε = {epsilon2:.3f})")

            if max_epsilon is not None and epsilon >= max_epsilon:
                return
        else:
            epsilon = None

        logger.log_epoch(epoch, train_loss, train_acc, test_loss, test_acc, epsilon)
        logger.log_scalar("epsilon/train", epsilon, epoch)
        logger.log_scalar("cifar10k_loss/train", train_loss, epoch)
        logger.log_scalar("cifar10k_acc/train", train_acc, epoch)

        if test_acc > best_acc:
            best_acc = test_acc
            flat_count = 0
        else:
            flat_count += 1
            if flat_count >= 20:
                print("plateau...")
                return
# NOTE: this snippet begins mid-statement; the head of the DataLoader call below is
# reconstructed, and the dataset variable name `train_dataset` is an assumption.
train_loader = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)

model = BilstmAspectAttPool(Configs1())
initialize_weights(model)
print(model)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

best_valid_loss = float('inf')
for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, criterion, CLIP, device)
    valid_loss = evaluate(model, test_loader, criterion, device)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_name)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(
        f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}'
    )
    print(
        f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}'  # assumed completion of the truncated final print
    )
def main():
    args = parse_args()
    args.num_gpus = len(get_available_gpus())
    eval(args.NET + '_setup')(args)
    set_seed(args.seed)
    setup(args)

    # Create model and optimizer
    if args.resume or args.eval or args.benchmark:
        last_epoch, best_epoch, best_val_loss, num_params, \
            enc_params, dec_params = parse_experiment(args.odir)
        i = last_epoch
        if args.eval or args.benchmark:
            i = best_epoch
        args.resume = model_at(args, i)
        model, stats = tf_resume(args, i)
    else:
        check_overwrite(os.path.join(args.odir, 'trainlog.txt'))
        model = eval(args.NET + '_create_model')(args)
        stats = []

    print('Will save to ' + args.odir)
    if not os.path.exists(args.odir):
        os.makedirs(args.odir)
    if not os.path.exists(args.odir + '/models'):
        os.makedirs(args.odir + '/models')
    with open(os.path.join(args.odir, 'cmdline.txt'), 'w') as f:
        f.write(" ".join([
            "'" + a + "'" if (len(a) == 0 or a[0] != '-') else a for a in sys.argv
        ]))

    args.model = model
    args.step = eval(args.NET + '_step')

    # Training loop
    epoch = args.start_epoch
    train_data_queue, train_data_processes = data_setup(args, 'train', args.nworkers, repeat=True)
    if args.eval == 0:
        for epoch in range(args.start_epoch, args.epochs):
            print('Epoch {}/{} ({}):'.format(epoch + 1, args.epochs, args.odir))
            loss = train(args, epoch, train_data_queue, train_data_processes)[0]

            if (epoch + 1) % args.test_nth_epoch == 0 or epoch + 1 == args.epochs:
                loss_val = test('val', args)[0]
                print('-> Train Loss: {}, \tVal loss: {}'.format(loss, loss_val))
                stats.append({'epoch': epoch + 1, 'loss': loss, 'loss_val': loss_val})
            else:
                loss_val = 0
                print('-> Train loss: {}'.format(loss))
                stats.append({'epoch': epoch + 1, 'loss': loss})

            if (epoch + 1) % args.save_nth_epoch == 0 or epoch + 1 == args.epochs:
                with open(os.path.join(args.odir, 'trainlog.txt'), 'w') as outfile:
                    json.dump(stats, outfile)
                save_model(args, epoch)

            if (epoch + 1) % args.test_nth_epoch == 0 and epoch + 1 < args.epochs:
                split = 'val'
                predictions = samples(split, args, 20)
                cache_pred(predictions, split, args)
                metrics(split, args, epoch)

            if math.isnan(loss):
                break

        if len(stats) > 0:
            with open(os.path.join(args.odir, 'trainlog.txt'), 'w') as outfile:
                json.dump(stats, outfile)

        kill_data_processes(train_data_queue, train_data_processes)

        split = 'val'
        predictions = samples(split, args, 20)
        cache_pred(predictions, split, args)
        metrics(split, args, epoch)

    if args.benchmark:
        benchmark_results('test', args)
def fit(self, X, y):
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

    # from some github repo...
    torch.multiprocessing.set_sharing_strategy('file_system')

    args = self.args
    args.input_dim = X.shape[1]
    args.output_dim = 1
    args.task = 'regression'

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    self.device = device

    train_loader = basic_loader(x_train, y_train, args.batch_size)
    valid_loader = basic_loader(x_valid, y_valid, args.batch_size, train_shuffle=False)
    # train_loader, valid_loader, test_loader = get_data_loaders(args.dataset, args.batch_size,
    #                                                            sub_task=args.sub_task, dim=args.input_dim)
    # if args.dataset in ['sider_split/', 'tox21_split/']:
    #     args.dataset = args.dataset[:-1] + '-' + str(args.sub_task)
    print('batch number: train={}, valid={}'.format(len(train_loader), len(valid_loader)))

    model = Net(input_dim=args.input_dim, output_dim=args.output_dim,
                hidden_dim=args.hidden_dim, num_layer=args.depth,
                num_back_layer=args.back_n, dense=True, drop_type=args.drop_type,
                net_type=args.net_type, approx=args.anneal, device=device).to(device)
    self.model = model

    if args.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                              nesterov=True)
    elif args.optimizer == 'AMSGrad':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
    scheduler = StepLR(optimizer, step_size=args.lr_step_size, gamma=args.gamma)

    best_score = -1e30
    start_epoch = 1  # start from epoch 1 or last checkpoint epoch

    if args.anneal == 'approx':
        args.net_type = 'approx_' + args.net_type

    best_model_name = './checkpoint/{}/{}/best_seed{}_depth{}_ckpt.t7'.format(
        args.dataset.strip('/'), args.net_type, args.seed, args.depth)
    last_model_name = './checkpoint/{}/{}/last_seed{}_depth{}_ckpt.t7'.format(
        args.dataset.strip('/'), args.net_type, args.seed, args.depth)
    best_log_file = 'log/' + args.dataset.strip('/') + \
        '/{}/depth{}_backn{}_drop{}_p{}_best.log'.format(
            args.net_type, args.depth, args.back_n, args.drop_type, args.p)
    last_log_file = 'log/' + args.dataset.strip('/') + \
        '/{}/depth{}_backn{}_drop{}_p{}_last.log'.format(
            args.net_type, args.depth, args.back_n, args.drop_type, args.p)

    model_dir = './checkpoint/{}/{}/'.format(args.dataset.strip('/'), args.net_type)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    log_dir = 'log/' + args.dataset.strip('/') + '/{}/'.format(args.net_type)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if not os.path.isdir('checkpoint'):
        os.mkdir('checkpoint')

    for epoch in range(start_epoch, args.epochs + start_epoch):
        scheduler.step(epoch)
        alpha = get_alpha(epoch, args.epochs)
        train_approximate_loss = train(args, model, device, train_loader, optimizer,
                                       epoch, args.anneal, alpha)

        # used for plotting learning curves
        train_loss, train_score = test(args, model, device, train_loader, 'train')
        valid_loss, valid_score = test(args, model, device, valid_loader, 'valid')
        # test_loss, test_score = test(args, model, device, test_loader, 'test')
        print(train_score, valid_score)

        # early stopping version
        if valid_score > best_score:
            self.best_state = model.state_dict()
            state = {'model': model.state_dict()}
            torch.save(state, best_model_name)
            best_score = valid_score

        # "convergent" version
        state = {'model': model.state_dict()}
        torch.save(state, last_model_name)

    # print('Training finished. Loading models from validation...')
    # for model_name, log_file, setting in zip([best_model_name, last_model_name],
    #                                          [best_log_file, last_log_file],
    #                                          ['best', 'last']):
    #     print('\nLoading the {} model...'.format(setting))
    #     checkpoint = torch.load(model_name)
    #     model.load_state_dict(checkpoint['model'])
    #     train_loss, train_score = test(args, model, device, train_loader, 'train')
    #     valid_loss, valid_score = test(args, model, device, valid_loader, 'valid')
    #     test_loss, test_score = test(args, model, device, test_loader, 'test ')

    return self
def main(dataset, augment=False, use_scattering=False, size=None, batch_size=2048,
         mini_batch_size=256, sample_batches=False, lr=1, optim="SGD", momentum=0.9,
         nesterov=False, noise_multiplier=1, max_grad_norm=0.1, epochs=100,
         input_norm=None, num_groups=None, bn_noise_multiplier=None, max_epsilon=None,
         logdir=None, early_stop=True, seed=0):
    torch.manual_seed(seed)
    logger = Logger(logdir)
    device = get_device()

    train_data, test_data = get_data(dataset, augment=augment)

    if use_scattering:
        scattering, K, _ = get_scatter_transform(dataset)
        scattering.to(device)
    else:
        scattering = None
        K = 3 if len(train_data.data.shape) == 4 else 1

    bs = batch_size
    assert bs % mini_batch_size == 0
    n_acc_steps = bs // mini_batch_size

    # Batch accumulation and data augmentation with Poisson sampling isn't implemented
    if sample_batches:
        assert n_acc_steps == 1
        assert not augment

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=mini_batch_size,
                                               shuffle=True, num_workers=1, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=mini_batch_size,
                                              shuffle=False, num_workers=1, pin_memory=True)

    rdp_norm = 0
    if input_norm == "BN":
        # compute noisy data statistics or load from disk if pre-computed
        save_dir = f"bn_stats/{dataset}"
        os.makedirs(save_dir, exist_ok=True)
        bn_stats, rdp_norm = scatter_normalization(train_loader, scattering, K, device,
                                                   len(train_data), len(train_data),
                                                   noise_multiplier=bn_noise_multiplier,
                                                   orders=ORDERS, save_dir=save_dir)
        model = CNNS[dataset](K, input_norm="BN", bn_stats=bn_stats, size=size)
    else:
        model = CNNS[dataset](K, input_norm=input_norm, num_groups=num_groups, size=size)

    model.to(device)

    if use_scattering and augment:
        model = nn.Sequential(scattering, model)
        train_loader = torch.utils.data.DataLoader(train_data, batch_size=mini_batch_size,
                                                   shuffle=True, num_workers=1,
                                                   pin_memory=True, drop_last=True)
    else:
        # pre-compute the scattering transform if necessary
        train_loader = get_scattered_loader(train_loader, scattering, device,
                                            drop_last=True, sample_batches=sample_batches)
        test_loader = get_scattered_loader(test_loader, scattering, device)

    print(f"model has {get_num_params(model)} parameters")

    if optim == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum,
                                    nesterov=nesterov)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    privacy_engine = PrivacyEngine(
        model,
        batch_size=bs,
        sample_size=len(train_data),
        alphas=ORDERS,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )
    privacy_engine.attach(optimizer)

    best_acc = 0
    flat_count = 0

    results = dict(train_zeon=[], train_xent=[], test_zeon=[], test_xent=[], epoch=[])
    for epoch in range(0, epochs):
        print(f"\nEpoch: {epoch}")

        train_loss, train_acc = train(model, train_loader, optimizer, n_acc_steps=n_acc_steps)
        test_loss, test_acc = test(model, test_loader)

        results['train_zeon'].append(train_acc)
        results['train_xent'].append(train_loss)
        results['test_zeon'].append(test_acc)
        results['test_xent'].append(test_loss)
        results['epoch'].append(epoch)

        if noise_multiplier > 0:
            rdp_sgd = get_renyi_divergence(
                privacy_engine.sample_rate, privacy_engine.noise_multiplier
            ) * privacy_engine.steps
            epsilon, _ = get_privacy_spent(rdp_norm + rdp_sgd)
            epsilon2, _ = get_privacy_spent(rdp_sgd)
            print(f"ε = {epsilon:.3f} (sgd only: ε = {epsilon2:.3f})")

            if max_epsilon is not None and epsilon >= max_epsilon:
                return
        else:
            epsilon = None

        logger.log_epoch(epoch, train_loss, train_acc, test_loss, test_acc, epsilon)
        logger.log_scalar("epsilon/train", epsilon, epoch)

        # stop if we're not making progress
        if test_acc > best_acc:
            best_acc = test_acc
            flat_count = 0
        else:
            flat_count += 1
            if flat_count >= 20 and early_stop:
                print("plateau...")
                break

    # Write to file.
    record = {**results, **{'best_acc': best_acc, 'seed': seed, 'dataset': dataset}}
    record_path = os.path.join('.', 'record', f'{dataset}-{seed}.json')
    os.makedirs(os.path.dirname(record_path), exist_ok=True)
    with open(record_path, 'w') as f:
        json.dump(record, f, indent=4)

    import logging
    logging.warning(f'Wrote to file: {record_path}')
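# Hedged sketch (not this repo's train() helper): the kind of gradient-accumulation loop that a
# call like train(model, loader, optimizer, n_acc_steps=...) is assumed to run, so that
# mini_batch_size * n_acc_steps samples contribute to each optimizer step. The actual DP version
# above routes accumulation through the attached privacy engine; this sketch ignores privacy,
# and the names train_sketch / F.cross_entropy are illustrative only.
import torch
import torch.nn.functional as F

def train_sketch(model, loader, optimizer, n_acc_steps, device="cpu"):
    model.train()
    total_loss, correct, seen = 0.0, 0, 0
    optimizer.zero_grad()
    for i, (x, y) in enumerate(loader):
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss = F.cross_entropy(logits, y) / n_acc_steps  # scale so summed grads match one large batch
        loss.backward()
        if (i + 1) % n_acc_steps == 0:
            optimizer.step()      # one parameter update per n_acc_steps mini-batches
            optimizer.zero_grad()
        total_loss += loss.item() * n_acc_steps * y.size(0)
        correct += (logits.argmax(dim=1) == y).sum().item()
        seen += y.size(0)
    return total_loss / seen, correct / seen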
########################################################################
batch_size = 400   # batch size used for computation
load_batch = 100   # batch size used for loading (not for computation)
device = torch.device("cuda:0")  # device
lr = 0.001         # learning rate

# load train and test file names, train:test = 4:1
if os.path.exists(r'./train_test_names.data'):
    train_test = pickle.load(open('./train_test_names.data', "rb"))
else:
    train_test = train_utils.get_train_test_name(dns_home)
train_noisy_names, train_clean_names, test_noisy_names, test_clean_names = \
    train_utils.get_all_names(train_test, dns_home=dns_home)

train_dataset = loader.WavDataset(train_noisy_names, train_clean_names, frame_dur=37.5)
test_dataset = loader.WavDataset(test_noisy_names, test_clean_names, frame_dur=37.5)

# dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=load_batch, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=load_batch, shuffle=True)

dccrn = model_cov_bn.DCCRN_(
    n_fft=512,
    hop_len=int(6.25 * 16000 / 1000),
    net_params=net_config.get_net_params(),
    batch_size=batch_size,
    device=device,
    win_length=int((25 * 16000 / 1000))).to(device)
optimizer = torch.optim.Adam(dccrn.parameters(), lr=lr)
criterion = SiSnr()

train_utils.train(model=dccrn, optimizer=optimizer, criterion=criterion,
                  train_iter=train_dataloader, test_iter=test_dataloader,
                  max_epoch=500, device=device, batch_size=batch_size,
                  log_path=save_file, just_test=False)
args = parser.parse_args()

cifar_dir = args.cifar_root
fig_path = args.fig_path
validation_split = args.val_split
batch_size = args.batch_size
epochs = args.epochs
weight_path = args.weight_path
weight_decay = args.weight_decay
lr = args.lr
SEED = args.seed  # set random seed (default 1234)

# split train, val, test via the `get_data` function
train_loader, val_loader, test_loader = get_data(cifar_dir=cifar_dir,
                                                 batch_size=batch_size,
                                                 augment=True,
                                                 validation_split=validation_split)

# load model
model = VGG_lite()

# define loss
loss = nn.CrossEntropyLoss()

# train the model
model, history = train(model, train_loader, val_loader, epochs, loss, batch_size,
                       optimizer='adam', weight_decay=weight_decay, lr=lr)

# save the model according to `weight_path` from the parser (default './weights/final.pth')
torch.save(model.state_dict(), weight_path)

plot_history(history, fig_path)  # save figures

acc, cm, cm_norm = evaluate(model, test_loader)  # evaluate the trained model
plot_cm(cm, cm_norm, fig_path)  # save confusion matrix figures
print('Test Accuracy: {}%'.format(round(acc * 100, 4)))  # print the model's test accuracy
def my_main(
    _run,
    lr,
    weight_decay,
    message,
    use_gpu,
    epochs,
    save_images,
    experiment_folder,
    batch_size,
    save_exp,
):
    print("Epochs: {}".format(epochs))
    # args["seed"] = _run.config["seed"]
    device = torch.device("cuda" if use_gpu else "cpu")
    dataloader_kwargs = {"pin_memory": True} if use_gpu else {}

    # if save_exp:
    os.makedirs(experiment_folder + "outputs/color")
    os.makedirs(experiment_folder + "outputs/gray")
    os.makedirs(experiment_folder + "checkpoints")

    best_losses = 1e10
    seed = int(time.time())
    args = {
        "num_processes": 4,
        "batch_size": 64,
        "lr": lr,
        "weight_decay": weight_decay,
        "log_interval": 100,
        "use_gpu": use_gpu,
        "epochs": epochs,
        "seed": seed,
        "experiment_folder": experiment_folder,
    }
    train_folder = "places365_standard/train"
    val_folder = "places365_standard/val"
    trained = False
    options = dict({"num_classes": (2 * 224 * 224)})

    model = AlexNet().to(device)
    print(model)
    # model = nn.DataParallel(model)
    # model.share_memory()  # gradients are allocated lazily, so they are not shared here

    processes = []
    time1 = time.time()
    train(
        1,
        args,
        model,
        device,
        dataloader_kwargs,
        train_folder,
        nn.CrossEntropyLoss,
        val_folder,
    )
    time2 = time.time()
    print("{:s} function took {:.3f} ms".format("train", (time2 - time1) * 1000.0))
def main(feature_path=None, batch_size=2048, mini_batch_size=256, lr=1, optim="SGD",
         momentum=0.9, nesterov=False, noise_multiplier=1, max_grad_norm=0.1,
         max_epsilon=None, epochs=100, logdir=None):
    logger = Logger(logdir)
    device = get_device()

    # get pre-computed features
    x_train = np.load(f"{feature_path}_train.npy")
    x_test = np.load(f"{feature_path}_test.npy")

    train_data, test_data = get_data("cifar10", augment=False)
    y_train = np.asarray(train_data.targets)
    y_test = np.asarray(test_data.targets)

    trainset = torch.utils.data.TensorDataset(torch.from_numpy(x_train),
                                              torch.from_numpy(y_train))
    testset = torch.utils.data.TensorDataset(torch.from_numpy(x_test),
                                             torch.from_numpy(y_test))

    bs = batch_size
    assert bs % mini_batch_size == 0
    n_acc_steps = bs // mini_batch_size

    train_loader = torch.utils.data.DataLoader(trainset, batch_size=mini_batch_size,
                                               shuffle=True, num_workers=1,
                                               pin_memory=True, drop_last=True)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=mini_batch_size,
                                              shuffle=False, num_workers=1,
                                              pin_memory=True)

    n_features = x_train.shape[-1]
    try:
        mean = np.load(f"{feature_path}_mean.npy")
        var = np.load(f"{feature_path}_var.npy")
    except FileNotFoundError:
        mean = np.zeros(n_features, dtype=np.float32)
        var = np.ones(n_features, dtype=np.float32)

    bn_stats = (torch.from_numpy(mean).to(device), torch.from_numpy(var).to(device))
    model = nn.Sequential(StandardizeLayer(bn_stats), nn.Linear(n_features, 10)).to(device)

    if optim == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum,
                                    nesterov=nesterov)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    privacy_engine = PrivacyEngine(
        model,
        sample_rate=bs / len(train_data),
        alphas=ORDERS,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )
    privacy_engine.attach(optimizer)

    for epoch in range(0, epochs):
        print(f"\nEpoch: {epoch}")

        train_loss, train_acc = train(model, train_loader, optimizer, n_acc_steps=n_acc_steps)
        test_loss, test_acc = test(model, test_loader)

        if noise_multiplier > 0:
            rdp_sgd = get_renyi_divergence(
                privacy_engine.sample_rate, privacy_engine.noise_multiplier
            ) * privacy_engine.steps
            epsilon, _ = get_privacy_spent(rdp_sgd)
            print(f"ε = {epsilon:.3f}")

            if max_epsilon is not None and epsilon >= max_epsilon:
                return
        else:
            epsilon = None

        logger.log_epoch(epoch, train_loss, train_acc, test_loss, test_acc, epsilon)
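# Hedged sketch: a fixed-statistics standardization layer of the kind StandardizeLayer(bn_stats)
# above appears to be, normalizing the pre-computed features with the loaded mean/variance.
# This is an illustrative stand-in, not the project's actual implementation.
import torch
import torch.nn as nn

class StandardizeSketch(nn.Module):
    def __init__(self, bn_stats, eps=1e-5):
        super().__init__()
        mean, var = bn_stats
        self.register_buffer("mean", mean)  # fixed statistics, not learned
        self.register_buffer("var", var)
        self.eps = eps

    def forward(self, x):
        return (x - self.mean) / torch.sqrt(self.var + self.eps)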
start = time.time()
now = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")

# print training info
print("----------------")
print("Starting training.")
print("Current time:", now)
print("Training data: %d samples" % len(train_data))
print("----------------")
print()

# start training
try:
    for iter in range(1, n_iter + 1):
        input, target = get_batch_set()
        loss = train(input, target)
        total_loss += loss

        # print current training progress
        if iter % print_every == 0:
            avg_loss = total_loss / print_every
            sys.stdout.write(
                "%d %d%% (%s) %.4f\n"
                % (iter, iter / n_iter * 100, time_since(start), avg_loss))
            losses.append(avg_loss)
            total_loss = 0

            lyrics = generate_lyrics(['사랑', '발라드'])  # seed tokens: 'love', 'ballad'
            print(lyrics)
            print()
except KeyboardInterrupt:  # assumed handler; the snippet is cut off before its except clause
    pass

sys.stdout.write("Training complete.\n")
def trainEvalLM(args):
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    if torch.cuda.is_available():
        args.cuda = True

    ntokens = len(corpus.dictionary)
    eval_batch_size = 10
    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)

    # Build the model and loss function
    model = lmModel.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                             args.dropout, args.tied, g=args.g, k=args.k)
    criterion = nn.CrossEntropyLoss()
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    # compute network parameters
    params = list(model.parameters())
    total_params = np.sum([np.prod(p.size()) for p in params])
    print('\033[1;32;40mTotal parameters (in million):\033[0m\033[1;31;40m {:0.2f} \033[0m\n'
          .format(total_params / 1e6))

    optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
    start_epoch = 1
    if args.resume:
        print('Resuming model ...')
        model, criterion, optimizer, start_epoch = model_load(args.resume)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropout = args.dropout

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        # Create folder for saving model and log files
        args.saveDir += '_' + args.model
        # =====================
        if not os.path.isdir(args.saveDir):
            os.mkdir(args.saveDir)
        save_str = ('nl_' + str(args.nlayers) + '_nh_' + str(args.nhid) +
                    '_g_' + str(args.g) + '_k_' + str(args.k))
        args.save = args.saveDir + '/model_' + save_str + '.pt'
        logFileLoc = args.saveDir + '/logs_' + save_str + '.txt'
        logger = open(logFileLoc, 'w')
        logger.write(str(args))
        logger.write('\n Total parameters (in million): {:0.2f}'.format(total_params / 1e6))
        logger.write('\n\n')
        logger.write("\n%s\t%s\t%s\t%s\t%s" %
                     ('Epoch', 'Loss(Tr)', 'Loss(val)', 'ppl (tr)', 'ppl (val)'))
        logger.flush()

        best_val_loss = []
        stored_loss = 100000000

        # Loop over epochs.
        for epoch in range(start_epoch, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train(args, model, criterion, optimizer, epoch, train_data, ntokens)

            ### TRAIN WITH ASGD
            if 't0' in optimizer.param_groups[0]:
                tmp = {}
                for prm in model.parameters():
                    tmp[prm] = prm.data.clone()
                    prm.data = optimizer.state[prm]['ax'].clone()

                val_loss = evaluate(args, model, criterion, val_data, ntokens, eval_batch_size)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                 val_loss, math.exp(val_loss)))
                print('-' * 89)
                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                             (epoch, train_loss, val_loss, math.exp(train_loss),
                              math.exp(val_loss)))
                logger.flush()

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion, optimizer, epoch)
                    print('Saving Averaged (new best validation)')
                    stored_loss = val_loss

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()
            else:
                val_loss = evaluate(args, model, criterion, val_data, ntokens, eval_batch_size)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                 val_loss, math.exp(val_loss)))
                print('-' * 89)
                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f" %
                             (epoch, train_loss, val_loss, math.exp(train_loss),
                              math.exp(val_loss)))
                logger.flush()

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion, optimizer, epoch)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                if 't0' not in optimizer.param_groups[0] and (
                        len(best_val_loss) > args.nonmono and
                        val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0,
                                                 lambd=0., weight_decay=args.wdecay)

                best_val_loss.append(val_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--DATASET_PATH', type=str, default='/home/zhangdong/database/DUTS/')
    parser.add_argument('--WEIGHTS_PATH', type=str, default='/home/yangle/DAVIS/result/models/')
    parser.add_argument('--EXPERIMENT', type=str, default='/home/yangle/DAVIS/result/TrainNet/')
    parser.add_argument('--N_EPOCHS', type=int, default=200)
    parser.add_argument('--MAX_PATIENCE', type=int, default=30)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--N_CLASSES', type=int, default=2)
    parser.add_argument('--LEARNING_RATE', type=float, default=1e-4)
    parser.add_argument('--LR_DECAY', type=float, default=0.995)
    parser.add_argument('--DECAY_LR_EVERY_N_EPOCHS', type=int, default=1)
    parser.add_argument('--WEIGHT_DECAY', type=float, default=0.0001)
    parser.add_argument('--CUDNN', type=bool, default=True)
    args = parser.parse_args()

    torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = args.CUDNN

    normalize = transforms.Normalize(mean=saliency.mean, std=saliency.std)
    train_joint_transformer_img = transforms.Compose([joint_transforms.JointResize(224)])
    mask_size_list = [14, 28, 56, 112, 224]

    train_dset = saliency.Saliency(
        args.DATASET_PATH, 'train', train_joint_transformer_img, mask_size_list,
        transform=transforms.Compose([transforms.ToTensor(), normalize]))
    train_loader = torch.utils.data.DataLoader(
        train_dset, batch_size=args.batch_size, shuffle=True)

    test_joint_transforms_img = transforms.Compose([joint_transforms.JointResize(224)])
    val_dset = saliency.Saliency(
        args.DATASET_PATH, 'val', test_joint_transforms_img, mask_size_list,
        transform=transforms.Compose([transforms.ToTensor(), normalize]))
    val_loader = torch.utils.data.DataLoader(
        val_dset, batch_size=args.batch_size, shuffle=False)

    print("TrainImages: %d" % len(train_loader.dataset.imgs))
    print("ValImages: %d" % len(val_loader.dataset.imgs))

    # example_inputs, example_targets = next(iter(train_loader))
    # print("InputsBatchSize: ", example_inputs.size())
    # print("TargetsBatchSize: ", len(example_targets))
    # print("\nInput (size, max, min) ---")
    # # input
    # i = example_inputs[0]
    # print(i.size())
    # print(i.max())
    # print(i.min())
    # print("Target (size, max, min) ---")
    # # target
    # for mask in example_targets:
    #     print(mask.size())
    #     print(mask.max())
    #     print(mask.min())

    # initialize ResNet-50 from the pre-trained classification model
    resnet = torchvision.models.resnet50(pretrained=True)
    pre_trained_dict = resnet.state_dict()
    model = SegNet.resnet50()
    model_dict = model.state_dict()
    # 1. filter out unnecessary keys
    pre_trained_dict = {k: v for k, v in pre_trained_dict.items() if k in model_dict}
    # 2. overwrite entries in the existing state dict
    model_dict.update(pre_trained_dict)
    # 3. load the new state dict
    model.load_state_dict(model_dict)

    model = model.cuda()
    # model = torch.nn.DataParallel(model).cuda()
    print(' + Number of params: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))
    # model.apply(utils.weights_init)
    optimizer = optim.RMSprop(model.parameters(), lr=args.LEARNING_RATE,
                              weight_decay=args.WEIGHT_DECAY, eps=1e-12)
    criterion = nn.NLLLoss2d().cuda()

    exp_dir = args.EXPERIMENT + 'test'
    if os.path.exists(exp_dir):
        shutil.rmtree(exp_dir)

    exp = experiment.Experiment('test', args.EXPERIMENT)
    exp.init()

    START_EPOCH = exp.epoch
    END_EPOCH = START_EPOCH + args.N_EPOCHS

    for epoch in range(START_EPOCH, END_EPOCH):
        since = time.time()

        ### Train ###
        trn_loss, trn_err = utils.train(model, train_loader, optimizer, criterion, epoch)
        print('Epoch {:d}: Train - Loss: {:.4f}\tErr: {:.4f}'.format(epoch, trn_loss, trn_err))
        time_elapsed = time.time() - since
        print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

        ### Test ###
        val_loss, val_err = utils.test(model, val_loader, criterion, epoch)
        print('Val - Loss: {:.4f}, Error: {:.4f}'.format(val_loss, val_err))
        time_elapsed = time.time() - since
        print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))

        ### Save Metrics ###
        exp.save_history('train', trn_loss, trn_err)
        exp.save_history('val', val_loss, val_err)

        ### Checkpoint ###
        exp.save_weights(model, trn_loss, val_loss, trn_err, val_err)
        exp.save_optimizer(optimizer, val_loss)

        ## Early Stopping ##
        if (epoch - exp.best_val_loss_epoch) > args.MAX_PATIENCE:
            print("Early stopping at epoch %d: no loss better than %.3f found since epoch %d"
                  % (epoch, exp.best_val_loss, exp.best_val_loss_epoch))
            break

        # Adjust learning rate (old method)
        utils.adjust_learning_rate(args.LEARNING_RATE, args.LR_DECAY, optimizer,
                                   epoch, args.DECAY_LR_EVERY_N_EPOCHS)
        exp.epoch += 1
# NOTE: this snippet begins mid-statement, inside an argparse definition; the option
# being defined by the first, truncated add_argument call is not shown.
    default=10,
    help='The number of iterations between every logging.')
parser.add_argument(
    '--loaderjob', type=int, default=4,
    help='The number of processes to launch for MultiprocessIterator.')
parser.add_argument(
    '--resume',
    help='The path to the trainer snapshot to resume from. '
         'If unspecified, no snapshot will be resumed')
args = parser.parse_args()

with open(args.label_names, 'r') as f:
    label_names = tuple(yaml.load(f))

if args.val is not None:
    train_data = OriginalDetectionDataset(args.train, label_names)
    val_data = OriginalDetectionDataset(args.val, label_names)
else:
    # If --val is not supplied, the train data is split into two
    # with ratio 8:2.
    dataset = OriginalDetectionDataset(args.train, label_names)
    train_data, val_data = chainer.datasets.split_dataset_random(
        dataset, int(len(dataset) * 0.8))

step_points = [args.step_size]
train(train_data, val_data, label_names, args.iteration, args.lr, step_points,
      args.batchsize, args.gpu, args.out, args.val_iteration, args.log_iteration,
      args.loaderjob, args.resume)
# In[18]:

# Main
optimizer = optim.Adam(model.parameters(), lr=args.lr)
info = {'highest F1': 0, 'saved epoch': None}  # best-so-far F1; start below any achievable score so the first improvement is saved

# In[19]:

print('STARTING TRAINING')
for epoch in range(1, args.epochs + 1):
    train(args, model, device, train_loader, optimizer, epoch, start_time=time.time())
    f1 = get_mean_F1(model, validation_loader)
    print('after epoch {} got f1 score of {}'.format(epoch, f1))

    if f1 > info['highest F1']:
        info['highest F1'] = np.copy(f1)
        info['saved epoch'] = epoch
        test(args, model, device, test_loader, epoch, trainDataset, testDataset,
             path_submission)
        torch.save(model, path_model)
        print('currently best model --> saved')

print('TRAINING DONE')
print(info)
if args.snapshot is not None:
    vocab = pickle.load(open(args.snapshot + '.vocab', 'rb'))
else:
    vocab = None

# load data
train_data_dict, dev_data_dict, test_data_dict, vocab = data_utils.load_dataset(args, vocab)

# Load model
model = model_utils.get_model(vocab, args)

if args.mode == 'train_r2a':
    '''
    Training R2A on labeled source and unlabeled target
    '''
    dev_res, saved_path, model = train_utils.train(train_data_dict, dev_data_dict, model, args)

    # saving the vocabulary
    if args.save:
        with open(saved_path + '.vocab', 'wb') as f:
            pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    # evaluate performance on the source train & dev set
    tar_train = None if args.tar_dataset == '' else train_data_dict[args.tar_dataset]
    tar_dev = None if args.tar_dataset == '' else dev_data_dict[args.tar_dataset]

    print("\n=== train ====")
    train_res = []
    for task in args.src_dataset:
        cur_res = train_utils.evaluate_task(
            train_data_dict[task], task, tar_train, model, None, args)
# ------------------------------------------------------------------------
if args.snapshot is None:
    model = model_utils.get_model(vocab, args)
else:
    # load saved model
    print('\nLoading model from [%s]...' % args.snapshot)
    try:
        model = torch.load(args.snapshot)
    except Exception as e:
        print(e)
        exit(1)
    print("Load complete")

# Train the model on train_data, use dev_data for early stopping
model, dev_res = train_utils.train(train_data, dev_data, model, args)

# Evaluate the trained model
print("Evaluate on train set")
train_res = train_utils.evaluate(train_data, model, args)

print("Evaluate on test set")
test_res = train_utils.evaluate(test_data, model, args, roc=True)

if args.result_path:
    directory = args.result_path[:args.result_path.rfind('/')]
    if not os.path.exists(directory):
        os.makedirs(directory)

    result = {
        'train_loss': train_res[0],
# NOTE: this snippet begins mid-statement, inside an argument-parser helper whose
# definition is not shown (it is invoked below as args_parser).
        required=False,
        help='Freeze the model after training.')
    parser.add_argument(
        '--binarization',
        choices=['deterministic-binary', 'stochastic-binary', 'disabled'],
        action='store',
        required=False,
        default='deterministic-binary',
        help='binarization mode')
    return parser.parse_args()


if __name__ == '__main__':
    tf.set_random_seed(_RANDOM_SEED)
    parsed_args = args_parser(sys.argv)
    dataset = get_dataset(parsed_args.dataset, parsed_args.epochs, parsed_args.batch_size)
    train(
        parsed_args.epochs,
        parsed_args.batch_size,
        dataset,
        get_model_fn(parsed_args.model, parsed_args.binarization),
        get_optimiser_fn(parsed_args.model, parsed_args.epochs, parsed_args.batch_size,
                         dataset),
        parsed_args.resume_from_latest_checkpoint,
        parsed_args.tag,
        parsed_args.freeze)