def main(args):
    """Train a GeneralUnet image-reconstruction model and checkpoint every epoch.

    Reads from ``args``: cfg_file, resize, train_csv, train_input_dir,
    train_gt_dir, val_csv, val_input_dir, val_gt_dir, batch_size, epochs,
    log_interval.  Side effects: prints the parsed layer lists and writes a
    checkpoint per epoch via ``save_checkpoint``.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Build the U-Net from the config file; the layer lists are printed for
    # quick visual verification of the parsed architecture.
    down_layers, up_layers = create_unet(args.cfg_file)
    print(down_layers, up_layers)
    unet = GeneralUnet(down_layers, up_layers)
    unet = unet.to(device)

    # Inputs and labels are resized identically so the MSE target aligns
    # pixel-for-pixel with the network output.
    input_transform = transforms.Compose(
        [transforms.Resize((args.resize, args.resize)), transforms.ToTensor()])
    label_transform = transforms.Compose(
        [transforms.Resize((args.resize, args.resize)), transforms.ToTensor()])

    train_dataset = ReconstructionDataset.ReconstructionDataset(
        args.train_csv, args.train_input_dir, args.train_gt_dir,
        input_transforms=input_transform, label_transforms=label_transform)
    val_dataset = ReconstructionDataset.ReconstructionDataset(
        args.val_csv, args.val_input_dir, args.val_gt_dir,
        input_transforms=input_transform, label_transforms=label_transform)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=6, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=6, pin_memory=True)

    optimizer = torch.optim.SGD(unet.parameters(), lr=.05, momentum=.9,
                                weight_decay=0)
    criterion = torch.nn.MSELoss().to(device)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[5, 10, 15, 20, 25, 30, 35])

    best_loss = float('inf')
    for epoch in range(1, args.epochs + 1):
        train(unet, optimizer, criterion, device, train_loader, epoch,
              args.log_interval)
        test_loss = test(unet, device, val_loader, epoch, args.log_interval)

        # BUGFIX: scheduler.step() was previously called at the TOP of the
        # loop, i.e. before any optimizer.step() in the first epoch.  PyTorch
        # requires stepping the scheduler after the optimizer each epoch.
        scheduler.step()

        # BUGFIX: compute is_best freshly every epoch instead of mutating a
        # flag across loop iterations.
        is_best = test_loss < best_loss
        best_loss = min(test_loss, best_loss)

        # save the model every epoch; is_best marks the best validation loss
        save_checkpoint({
            'epoch': epoch,
            'model_state_dict': unet.state_dict(),
            'optimizer': optimizer.state_dict(),
            'loss': test_loss
        }, is_best)
def train_rnn_model(train_loader, val_loader, num_features, num_epochs,
                    use_cuda, path_output):
    """Use train and validation loader to train the variable RNN model.

    Input: train_loader, val_loader, the input feature width, the number of
    epochs, a CUDA flag, and the directory to save the best model into.
    Output: (best_model, train_losses, valid_losses, train_accuracies,
    valid_accuracies, valid_results) where best_model is reloaded from disk.

    Raises ValueError if num_epochs < 1 (otherwise the return values would be
    undefined and the final torch.load would fail).
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be >= 1")

    device = torch.device(
        "cuda" if torch.cuda.is_available() and use_cuda else "cpu")
    torch.manual_seed(1)
    if device.type == "cuda":
        # Make cuDNN deterministic so the seeded run is reproducible.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    model = VariableRNN(num_features)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    model.to(device)
    criterion.to(device)

    # BUGFIX: start below any reachable accuracy so the first epoch is always
    # saved.  Previously best_val_acc started at 0.0; if no epoch exceeded it
    # (e.g. degenerate data), no file was written and torch.load crashed.
    best_val_acc = -1.0
    train_losses, train_accuracies = [], []
    valid_losses, valid_accuracies = [], []
    for epoch in range(num_epochs):
        train_loss, train_accuracy = train(model, device, train_loader,
                                           criterion, optimizer, epoch)
        valid_loss, valid_accuracy, valid_results = evaluate(
            model, device, val_loader, criterion)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        train_accuracies.append(train_accuracy)
        valid_accuracies.append(valid_accuracy)

        is_best = valid_accuracy > best_val_acc
        if is_best:
            best_val_acc = valid_accuracy
            # Legacy (non-zip) serialization kept for compatibility with
            # whatever consumes this file downstream.
            torch.save(
                model,
                os.path.join(path_output, "VariableRNN.pth"),
                _use_new_zipfile_serialization=False,
            )

    best_model = torch.load(os.path.join(path_output, "VariableRNN.pth"))
    return (
        best_model,
        train_losses,
        valid_losses,
        train_accuracies,
        valid_accuracies,
        valid_results,
    )
def main(args):
    """Train a config-defined classifier on a classification dataset.

    Reads from ``args``: cfg_file, resize, train_csv, val_csv, root_dir,
    batch_size, lr, epochs, log_interval.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = Model(args.cfg_file).to(device)

    # Same preprocessing for train and validation: resize then to-tensor.
    preprocess = transforms.Compose(
        [transforms.Resize((args.resize, args.resize)), transforms.ToTensor()]
    )

    loaders = {}
    for split, csv_path in (("train", args.train_csv), ("val", args.val_csv)):
        dataset = ClassificationDataset.ClassificationDataset(
            csv_path, args.root_dir, transform=preprocess)
        loaders[split] = torch.utils.data.DataLoader(
            dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=6,
            pin_memory=True,
        )
    train_loader, val_loader = loaders["train"], loaders["val"]

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=.9)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(args.epochs):
        train(model, optimizer, criterion, device, train_loader, epoch,
              args.log_interval)
        test_loss = test(model, device, val_loader, epoch, args.log_interval,
                         loss_fn='ce')
def pytorch_model():
    """Grid-search depth, width, and activation/init pairs for torch_nn.

    Trains each configuration on the bank-note dataset for a fixed number of
    epochs and prints the accumulated train/test losses per configuration.
    """
    depth = [3, 5, 9]
    hidden_nodes = [5, 10, 25, 50, 100]
    init_params = [['tanh', 'xavier'], ['relu', 'he']]
    # BUGFIX: this constant was named `epoch` and then shadowed by the inner
    # loop variable (`for epoch in range(1, epoch + 1)`); the code only worked
    # because the loop happened to leave `epoch == 10` behind.
    num_epochs = 10

    train_dataset = BankNoteDataset('./bank-note/train.csv', 4)
    test_dataset = BankNoteDataset('./bank-note/test.csv', 4)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=12, shuffle=True, num_workers=6,
        pin_memory=True)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=12, shuffle=True, num_workers=6,
        pin_memory=True)

    for (activation, initialization) in init_params:
        for d in depth:
            for nodes in hidden_nodes:
                print('Depth:', d, ', Nodes:', nodes, ', Activation:',
                      activation, ', Initialization:', initialization)
                # 4 input features, 2 output classes.
                model = torch_nn(4, 2, nodes, d, activation, initialization)
                optimizer = torch.optim.Adam(model.parameters())
                criterion = torch.nn.CrossEntropyLoss()
                train_loss = 0
                test_loss = 0
                for epoch in range(1, num_epochs + 1):
                    train_loss += train(model, optimizer, criterion,
                                        train_loader, epoch)
                    test_loss += test(model, test_loader, epoch)
                print('Train Loss: {:.6f}'.format(train_loss))
                print('Test Loss: {:.6f}'.format(test_loss))
# Hyper-parameter template for the KNRM news-recommendation model.  Keys not
# listed here (e.g. 'device', 'mode', 'save_path') are presumably filled in by
# load_hparams — TODO confirm against its implementation.
hparams = {
    'name': 'knrm',
    'batch_size': 100,
    'title_size': 20,       # max tokens per news title
    'his_size': 50,         # max click-history length
    'npratio': 4,           # negative-to-positive sampling ratio
    'embedding_dim': 300,
    'kernel_num': 11,       # number of RBF kernels in KNRM
    'metrics': 'group_auc,ndcg@5,ndcg@10,mean_mrr',
    'attrs': ['title'],
}
hparams = load_hparams(hparams)
device = torch.device(hparams['device'])

# prepare() builds the vocabulary and the train/test/validate dataloaders.
vocab, loader_train, loader_test, loader_validate = prepare(hparams, validate=True)
knrmModel = KNRMModel(vocab=vocab, hparams=hparams).to(device)

# 'test' restores a saved state dict and evaluates; 'train' runs training with
# TensorBoard logging enabled.
if hparams['mode'] == 'test':
    knrmModel.load_state_dict(torch.load(hparams['save_path']))
    print("testing...")
    evaluate(knrmModel, hparams, loader_test)
elif hparams['mode'] == 'train':
    train(knrmModel, hparams, loader_train, loader_test, loader_validate, tb=True)
def main():
    """End-to-end training/evaluation driver for a video recognition model.

    Parses CLI args, builds the model and dataloaders, optionally resumes from
    a checkpoint, then runs the train/validate loop, checkpointing and logging
    to TensorBoard each epoch.  With --show_model it only prints the model;
    with --evaluate it only validates.
    """
    global args
    parser = arg_parser()
    args = parser.parse_args()
    cudnn.benchmark = True
    num_classes, train_list_name, val_list_name, test_list_name, filename_seperator, image_tmpl, filter_video, label_file = get_dataset_config(
        args.dataset, args.use_lmdb)
    args.num_classes = num_classes
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    # rgb uses 3 channels; flow stacks 2 channels x 5 frames.
    if args.modality == 'rgb':
        args.input_channels = 3
    elif args.modality == 'flow':
        args.input_channels = 2 * 5
    model, arch_name = build_model(args)
    mean = model.mean(args.modality)
    std = model.std(args.modality)
    # overwrite mean and std if they are presented in command
    if args.mean is not None:
        if args.modality == 'rgb':
            if len(args.mean) != 3:
                raise ValueError(
                    "When training with rgb, dim of mean must be three.")
        elif args.modality == 'flow':
            if len(args.mean) != 1:
                raise ValueError(
                    "When training with flow, dim of mean must be three.")
        mean = args.mean
    if args.std is not None:
        if args.modality == 'rgb':
            if len(args.std) != 3:
                raise ValueError(
                    "When training with rgb, dim of std must be three.")
        elif args.modality == 'flow':
            if len(args.std) != 1:
                raise ValueError(
                    "When training with flow, dim of std must be three.")
        std = args.std
    model = model.cuda()
    model.eval()
    # Dummy input shape for torchsummary only (no real data is loaded here).
    if args.threed_data:
        dummy_data = (3, args.groups, args.input_size, args.input_size)
    else:
        dummy_data = (3 * args.groups, args.input_size, args.input_size)
    model_summary = torchsummary.summary(model, input_size=dummy_data)
    torch.cuda.empty_cache()
    if args.show_model:
        print(model)
        print(model_summary)
        return 0
    model = torch.nn.DataParallel(model).cuda()
    if args.pretrained is not None:
        print("=> using pre-trained model '{}'".format(arch_name))
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        if args.transfer:
            # Transfer learning: drop the final fc weights so the new head
            # (different num_classes) is trained from scratch.
            new_dict = {}
            for k, v in checkpoint['state_dict'].items():
                # TODO: a better approach:
                if k.replace("module.", "").startswith("fc"):
                    continue
                new_dict[k] = v
        else:
            new_dict = checkpoint['state_dict']
        model.load_state_dict(new_dict, strict=False)
    else:
        print("=> creating model '{}'".format(arch_name))
    # define loss function (criterion) and optimizer
    train_criterion = nn.CrossEntropyLoss().cuda()
    val_criterion = nn.CrossEntropyLoss().cuda()
    # Data loading code
    video_data_cls = VideoDataSetLMDB if args.use_lmdb else VideoDataSet
    val_list = os.path.join(args.datadir, val_list_name)
    val_augmentor = get_augmentor(False, args.input_size, mean, std,
                                  args.disable_scaleup,
                                  threed_data=args.threed_data,
                                  version=args.augmentor_ver,
                                  scale_range=args.scale_range)
    val_dataset = video_data_cls(args.datadir, val_list, args.groups,
                                 args.frames_per_group,
                                 num_clips=args.num_clips,
                                 modality=args.modality,
                                 image_tmpl=image_tmpl,
                                 dense_sampling=args.dense_sampling,
                                 transform=val_augmentor, is_train=False,
                                 test_mode=False,
                                 seperator=filename_seperator,
                                 filter_video=filter_video)
    val_loader = build_dataflow(val_dataset, is_train=False,
                                batch_size=args.batch_size,
                                workers=args.workers)
    log_folder = os.path.join(args.logdir, arch_name)
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)
    if args.evaluate:
        # Evaluation-only path: validate once, log, and return.
        logfile = open(os.path.join(log_folder, 'evaluate_log.log'), 'a')
        flops, params = extract_total_flops_params(model_summary)
        print(model_summary)
        val_top1, val_top5, val_losses, val_speed = validate(
            val_loader, model, val_criterion)
        print(
            'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tFlops: {}\tParams: {}'
            .format(args.input_size, val_losses, val_top1, val_top5,
                    val_speed * 1000.0, flops, params), flush=True)
        print(
            'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tFlops: {}\tParams: {}'
            .format(args.input_size, val_losses, val_top1, val_top5,
                    val_speed * 1000.0, flops, params), flush=True,
            file=logfile)
        return
    train_list = os.path.join(args.datadir, train_list_name)
    train_augmentor = get_augmentor(True, args.input_size, mean, std,
                                    threed_data=args.threed_data,
                                    version=args.augmentor_ver,
                                    scale_range=args.scale_range)
    train_dataset = video_data_cls(args.datadir, train_list, args.groups,
                                   args.frames_per_group,
                                   num_clips=args.num_clips,
                                   modality=args.modality,
                                   image_tmpl=image_tmpl,
                                   dense_sampling=args.dense_sampling,
                                   transform=train_augmentor, is_train=True,
                                   test_mode=False,
                                   seperator=filename_seperator,
                                   filter_video=filter_video)
    train_loader = build_dataflow(train_dataset, is_train=True,
                                  batch_size=args.batch_size,
                                  workers=args.workers)
    sgd_polices = model.parameters()
    optimizer = torch.optim.SGD(sgd_polices, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=args.nesterov)
    if args.lr_scheduler == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, args.lr_steps[0], gamma=0.1)
    elif args.lr_scheduler == 'multisteps':
        scheduler = lr_scheduler.MultiStepLR(optimizer, args.lr_steps, gamma=0.1)
    elif args.lr_scheduler == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=0)
    elif args.lr_scheduler == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True)
    best_top1 = 0.0
    tensorboard_logger.configure(os.path.join(log_folder))
    # optionally resume from a checkpoint
    if args.resume:
        logfile = open(os.path.join(log_folder, 'log.log'), 'a')
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_top1 = checkpoint['best_top1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            # Older checkpoints may lack a scheduler state; best-effort load.
            try:
                scheduler.load_state_dict(checkpoint['scheduler'])
            except:
                pass
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            raise ValueError("Checkpoint is not found: {}".format(args.resume))
    else:
        # Rotate an existing log file out of the way before overwriting.
        if os.path.exists(os.path.join(log_folder, 'log.log')):
            shutil.copyfile(
                os.path.join(log_folder, 'log.log'),
                os.path.join(log_folder,
                             'log.log.{}'.format(int(time.time()))))
        logfile = open(os.path.join(log_folder, 'log.log'), 'w')
    command = " ".join(sys.argv)
    print(command, flush=True)
    print(args, flush=True)
    print(model, flush=True)
    print(model_summary, flush=True)
    print(command, file=logfile, flush=True)
    print(args, file=logfile, flush=True)
    if args.resume == '':
        print(model, file=logfile, flush=True)
        print(model_summary, flush=True, file=logfile)
    for epoch in range(args.start_epoch, args.epochs):
        # NOTE(review): with the 'plateau' scheduler, val_losses is referenced
        # here before it is first assigned (first epoch of a fresh run) —
        # likely UnboundLocalError; also stepping the scheduler before train()
        # is the deprecated pre-1.1 PyTorch ordering.  TODO confirm and fix.
        if args.lr_scheduler == 'plateau':
            scheduler.step(val_losses, epoch)
        else:
            scheduler.step(epoch)
        try:
            # get_lr get all lrs for every layer of current epoch, assume the lr for all layers are identical
            lr = scheduler.optimizer.param_groups[0]['lr']
        except:
            lr = None
        # set current learning rate
        # train for one epoch
        train_top1, train_top5, train_losses, train_speed, speed_data_loader, train_steps = \
            train(train_loader, model, train_criterion, optimizer, epoch + 1,
                  display=args.print_freq,
                  label_smoothing=args.label_smoothing,
                  clip_gradient=args.clip_gradient)
        print(
            'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'
            .format(epoch + 1, args.epochs, train_losses, train_top1,
                    train_top5, train_speed * 1000.0,
                    speed_data_loader * 1000.0), file=logfile, flush=True)
        print(
            'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'
            .format(epoch + 1, args.epochs, train_losses, train_top1,
                    train_top5, train_speed * 1000.0,
                    speed_data_loader * 1000.0), flush=True)
        # evaluate on validation set
        val_top1, val_top5, val_losses, val_speed = validate(
            val_loader, model, val_criterion)
        print(
            'Val : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'
            .format(epoch + 1, args.epochs, val_losses, val_top1, val_top5,
                    val_speed * 1000.0), file=logfile, flush=True)
        print(
            'Val : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'
            .format(epoch + 1, args.epochs, val_losses, val_top1, val_top5,
                    val_speed * 1000.0), flush=True)
        # remember best prec@1 and save checkpoint
        is_best = val_top1 > best_top1
        best_top1 = max(val_top1, best_top1)
        save_dict = {
            'epoch': epoch + 1,
            'arch': arch_name,
            'state_dict': model.state_dict(),
            'best_top1': best_top1,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }
        save_checkpoint(save_dict, is_best, filepath=log_folder)
        if lr is not None:
            tensorboard_logger.log_value('learning-rate', lr, epoch + 1)
        tensorboard_logger.log_value('val-top1', val_top1, epoch + 1)
        tensorboard_logger.log_value('val-loss', val_losses, epoch + 1)
        tensorboard_logger.log_value('train-top1', train_top1, epoch + 1)
        tensorboard_logger.log_value('train-loss', train_losses, epoch + 1)
        tensorboard_logger.log_value('best-val-top1', best_top1, epoch + 1)
    logfile.close()
test_loss, test_acc = evaluation(args, 1, model, loader_test, dataset='test') print('\nvalidation acc : {:.5f}'.format(np.mean(val_acc))) print('test acc : {:.5f}'.format(np.mean(test_acc))) elif args.command == 'train': # Step 3: Train and validation print("\nTraining...") print('Total epoch : ', args.n_epochs) best_acc = 0.0 best_loss = np.inf wait = 0 for ep in range(args.epoch_start, args.n_epochs): if 'cycle' in args.alg: ## training tr_loss_orig, tr_acc, tr_loss_reverse = train(args, ep, model, model_scheduler, loader_train) tr_loss = np.mean(tr_loss_orig) + np.mean(tr_loss_reverse) ## validation val_loss_orig, val_acc, val_loss_reverse = evaluation(args, ep, model, loader_val, dataset='val') val_loss = np.mean(val_loss_orig) + np.mean(val_loss_reverse) else: ## training tr_loss, tr_acc = train(args, ep, model, model_scheduler, loader_train) ## validation val_loss, val_acc = evaluation(args, ep, model, loader_val, dataset='val') print('\nepoch:{}, tr_loss:{:.5f}, tr_acc:{:.5f}, val_loss:{:.5f}, val_acc:{:.5f}' .format(ep+1, np.mean(tr_loss), np.mean(tr_acc), np.mean(val_loss), np.mean(val_acc))) # Model Save and Stop Criterion # save : val_acc가 best 보다 잘 나왔을 때
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker for (optionally distributed) video model training.

    One process per GPU when args.distributed; builds the model, wraps it in
    DDP/DataParallel as appropriate, optionally loads pretrained / resume
    checkpoints, then runs the train/validate loop.  Only rank 0 writes logs,
    checkpoints metadata, and TensorBoard values.
    """
    cudnn.benchmark = args.cudnn_benchmark
    args.gpu = gpu
    num_classes, train_list_name, val_list_name, test_list_name, filename_seperator, image_tmpl, filter_video, label_file = get_dataset_config(
        args.dataset, args.use_lmdb)
    args.num_classes = num_classes
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # rgb uses 3 channels; flow stacks 2 channels x 5 frames.
    if args.modality == 'rgb':
        args.input_channels = 3
    elif args.modality == 'flow':
        args.input_channels = 2 * 5
    model, arch_name = build_model(args)
    mean = model.mean(args.modality)
    std = model.std(args.modality)
    # overwrite mean and std if they are presented in command
    if args.mean is not None:
        if args.modality == 'rgb':
            if len(args.mean) != 3:
                raise ValueError(
                    "When training with rgb, dim of mean must be three.")
        elif args.modality == 'flow':
            if len(args.mean) != 1:
                raise ValueError(
                    "When training with flow, dim of mean must be three.")
        mean = args.mean
    if args.std is not None:
        if args.modality == 'rgb':
            if len(args.std) != 3:
                raise ValueError(
                    "When training with rgb, dim of std must be three.")
        elif args.modality == 'flow':
            if len(args.std) != 1:
                raise ValueError(
                    "When training with flow, dim of std must be three.")
        std = args.std
    model = model.cuda(args.gpu)
    model.eval()
    # Dummy input shape for torchsummary only; summary is computed on rank 0.
    if args.threed_data:
        dummy_data = (args.input_channels, args.groups, args.input_size,
                      args.input_size)
    else:
        dummy_data = (args.input_channels * args.groups, args.input_size,
                      args.input_size)
    if args.rank == 0:
        model_summary = torchsummary.summary(model, input_size=dummy_data)
        torch.cuda.empty_cache()
    if args.show_model and args.rank == 0:
        print(model)
        print(model_summary)
        return 0
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # the batch size should be divided by number of nodes as well
            args.batch_size = int(args.batch_size / args.world_size)
            args.workers = int(args.workers / ngpus_per_node)
            if args.sync_bn:
                process_group = torch.distributed.new_group(
                    list(range(args.world_size)))
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                    model, process_group)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        # assign rank to 0
        model = torch.nn.DataParallel(model).cuda()
        args.rank = 0
    if args.pretrained is not None:
        if args.rank == 0:
            print("=> using pre-trained model '{}'".format(arch_name))
        if args.gpu is None:
            checkpoint = torch.load(args.pretrained, map_location='cpu')
        else:
            checkpoint = torch.load(args.pretrained,
                                    map_location='cuda:{}'.format(args.gpu))
        if args.transfer:
            # Transfer learning: skip fc weights so the new classification
            # head is trained from scratch.
            new_dict = {}
            for k, v in checkpoint['state_dict'].items():
                # TODO: a better approach:
                if k.replace("module.", "").startswith("fc"):
                    continue
                new_dict[k] = v
        else:
            new_dict = checkpoint['state_dict']
        model.load_state_dict(new_dict, strict=False)
        del checkpoint  # dereference seems crucial
        torch.cuda.empty_cache()
    else:
        if args.rank == 0:
            print("=> creating model '{}'".format(arch_name))
    # define loss function (criterion) and optimizer
    train_criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    val_criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    # Data loading code
    val_list = os.path.join(args.datadir, val_list_name)
    val_augmentor = get_augmentor(
        False,
        args.input_size,
        scale_range=args.scale_range,
        mean=mean,
        std=std,
        disable_scaleup=args.disable_scaleup,
        threed_data=args.threed_data,
        is_flow=True if args.modality == 'flow' else False,
        version=args.augmentor_ver)
    video_data_cls = VideoDataSetLMDB if args.use_lmdb else VideoDataSet
    val_dataset = video_data_cls(args.datadir, val_list, args.groups,
                                 args.frames_per_group,
                                 num_clips=args.num_clips,
                                 modality=args.modality,
                                 image_tmpl=image_tmpl,
                                 dense_sampling=args.dense_sampling,
                                 transform=val_augmentor, is_train=False,
                                 test_mode=False,
                                 seperator=filename_seperator,
                                 filter_video=filter_video)
    val_loader = build_dataflow(val_dataset, is_train=False,
                                batch_size=args.batch_size,
                                workers=args.workers,
                                is_distributed=args.distributed)
    log_folder = os.path.join(args.logdir, arch_name)
    if args.rank == 0:
        if not os.path.exists(log_folder):
            os.makedirs(log_folder)
    if args.evaluate:
        # Evaluation-only path: every rank validates; only rank 0 logs.
        val_top1, val_top5, val_losses, val_speed = validate(val_loader,
                                                             model,
                                                             val_criterion,
                                                             gpu_id=args.gpu)
        if args.rank == 0:
            logfile = open(os.path.join(log_folder, 'evaluate_log.log'), 'a')
            flops, params = extract_total_flops_params(model_summary)
            print(
                'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tFlops: {}\tParams: {}'
                .format(args.input_size, val_losses, val_top1, val_top5,
                        val_speed * 1000.0, flops, params), flush=True)
            print(
                'Val@{}: \tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tFlops: {}\tParams: {}'
                .format(args.input_size, val_losses, val_top1, val_top5,
                        val_speed * 1000.0, flops, params), flush=True,
                file=logfile)
        return
    train_list = os.path.join(args.datadir, train_list_name)
    train_augmentor = get_augmentor(
        True,
        args.input_size,
        scale_range=args.scale_range,
        mean=mean,
        std=std,
        disable_scaleup=args.disable_scaleup,
        threed_data=args.threed_data,
        is_flow=True if args.modality == 'flow' else False,
        version=args.augmentor_ver)
    train_dataset = video_data_cls(args.datadir, train_list, args.groups,
                                   args.frames_per_group,
                                   num_clips=args.num_clips,
                                   modality=args.modality,
                                   image_tmpl=image_tmpl,
                                   dense_sampling=args.dense_sampling,
                                   transform=train_augmentor, is_train=True,
                                   test_mode=False,
                                   seperator=filename_seperator,
                                   filter_video=filter_video)
    train_loader = build_dataflow(train_dataset, is_train=True,
                                  batch_size=args.batch_size,
                                  workers=args.workers,
                                  is_distributed=args.distributed)
    sgd_polices = model.parameters()
    optimizer = torch.optim.SGD(sgd_polices, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=args.nesterov)
    if args.lr_scheduler == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, args.lr_steps[0], gamma=0.1)
    elif args.lr_scheduler == 'multisteps':
        scheduler = lr_scheduler.MultiStepLR(optimizer, args.lr_steps, gamma=0.1)
    elif args.lr_scheduler == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=0)
    elif args.lr_scheduler == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True)
    best_top1 = 0.0
    # optionally resume from a checkpoint
    if args.resume:
        if args.rank == 0:
            logfile = open(os.path.join(log_folder, 'log.log'), 'a')
        if os.path.isfile(args.resume):
            if args.rank == 0:
                print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume, map_location='cpu')
            else:
                checkpoint = torch.load(args.resume,
                                        map_location='cuda:{}'.format(
                                            args.gpu))
            args.start_epoch = checkpoint['epoch']
            # TODO: handle distributed version
            best_top1 = checkpoint['best_top1']
            if args.gpu is not None:
                # best_top1 may have been saved as a tensor; move it to the
                # worker's device so later comparisons work.
                if not isinstance(best_top1, float):
                    best_top1 = best_top1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            # Older checkpoints may lack a scheduler state; best-effort load.
            try:
                scheduler.load_state_dict(checkpoint['scheduler'])
            except:
                pass
            if args.rank == 0:
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            del checkpoint  # dereference seems crucial
            torch.cuda.empty_cache()
        else:
            raise ValueError("Checkpoint is not found: {}".format(args.resume))
    else:
        # Rotate an existing log out of the way before overwriting (rank 0).
        if os.path.exists(os.path.join(log_folder, 'log.log')) and args.rank == 0:
            shutil.copyfile(
                os.path.join(log_folder, 'log.log'),
                os.path.join(log_folder,
                             'log.log.{}'.format(int(time.time()))))
        if args.rank == 0:
            logfile = open(os.path.join(log_folder, 'log.log'), 'w')
    if args.rank == 0:
        command = " ".join(sys.argv)
        tensorboard_logger.configure(os.path.join(log_folder))
        print(command, flush=True)
        print(args, flush=True)
        print(model, flush=True)
        print(command, file=logfile, flush=True)
        print(model_summary, flush=True)
        print(args, file=logfile, flush=True)
    if args.resume == '' and args.rank == 0:
        print(model, file=logfile, flush=True)
        print(model_summary, flush=True, file=logfile)
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_top1, train_top5, train_losses, train_speed, speed_data_loader, train_steps = \
            train(train_loader, model, train_criterion, optimizer, epoch + 1,
                  display=args.print_freq,
                  label_smoothing=args.label_smoothing,
                  clip_gradient=args.clip_gradient, gpu_id=args.gpu,
                  rank=args.rank)
        if args.distributed:
            dist.barrier()
        # evaluate on validation set
        val_top1, val_top5, val_losses, val_speed = validate(val_loader,
                                                             model,
                                                             val_criterion,
                                                             gpu_id=args.gpu)
        # update current learning rate
        if args.lr_scheduler == 'plateau':
            scheduler.step(val_losses)
        else:
            scheduler.step(epoch + 1)
        if args.distributed:
            dist.barrier()
        # only logging at rank 0
        if args.rank == 0:
            print(
                'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'
                .format(epoch + 1, args.epochs, train_losses, train_top1,
                        train_top5, train_speed * 1000.0,
                        speed_data_loader * 1000.0), file=logfile, flush=True)
            print(
                'Train: [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch\tData loading: {:.2f} ms/batch'
                .format(epoch + 1, args.epochs, train_losses, train_top1,
                        train_top5, train_speed * 1000.0,
                        speed_data_loader * 1000.0), flush=True)
            print(
                'Val : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'
                .format(epoch + 1, args.epochs, val_losses, val_top1,
                        val_top5, val_speed * 1000.0), file=logfile,
                flush=True)
            print(
                'Val : [{:03d}/{:03d}]\tLoss: {:4.4f}\tTop@1: {:.4f}\tTop@5: {:.4f}\tSpeed: {:.2f} ms/batch'
                .format(epoch + 1, args.epochs, val_losses, val_top1,
                        val_top5, val_speed * 1000.0), flush=True)
            # remember best prec@1 and save checkpoint
            is_best = val_top1 > best_top1
            best_top1 = max(val_top1, best_top1)
            save_dict = {
                'epoch': epoch + 1,
                'arch': arch_name,
                'state_dict': model.state_dict(),
                'best_top1': best_top1,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            }
            save_checkpoint(save_dict, is_best, filepath=log_folder)
            try:
                # get_lr get all lrs for every layer of current epoch, assume the lr for all layers are identical
                lr = scheduler.optimizer.param_groups[0]['lr']
            except Exception as e:
                lr = None
            if lr is not None:
                tensorboard_logger.log_value('learning-rate', lr, epoch + 1)
            tensorboard_logger.log_value('val-top1', val_top1, epoch + 1)
            tensorboard_logger.log_value('val-loss', val_losses, epoch + 1)
            tensorboard_logger.log_value('train-top1', train_top1, epoch + 1)
            tensorboard_logger.log_value('train-loss', train_losses, epoch + 1)
            tensorboard_logger.log_value('best-val-top1', best_top1, epoch + 1)
        if args.distributed:
            dist.barrier()
    if args.rank == 0:
        logfile.close()
def main():
    """CLI entry point: train a GAN that generates 汉字 glyph images.

    Parses arguments, redirects stdout into a timestamped Logger file, builds
    the dataloader/GAN/optimizers/plateau-schedulers, and runs the training
    loop while logging learning rates to TensorBoard and saving the latest
    model each epoch.
    """
    parser = argparse.ArgumentParser(description="Generate 汉字 via generative adversarial network.")
    # Dataset
    parser.add_argument("--size", type=int, default=32, help="Font size.")
    parser.add_argument("--from_unicode", type=int, help="Starting point of the unicode.")
    parser.add_argument("--to_unicode", type=int, help="Ending point of the unicode.")
    parser.add_argument("--font", type=str, required=True, help="Path to the font file.")
    parser.add_argument("--num_workers", type=int, default=4, help="Number of data loading workers.")
    # Optimization
    parser.add_argument("--epochs", type=int, default=100, help="Number of epochs.")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size.")
    parser.add_argument("--gpu_ids", type=str, default='', help="GPUs for running this script.")
    parser.add_argument("--rand_dim", type=int, default=128, help="Dimension of the random vector.")
    parser.add_argument("--num_fakes", type=int, default=16, help="Use num_fakes generated images to train the discriminator.")
    parser.add_argument("--flip_rate", type=float, default=0.8, help="Label flipping rate.")
    parser.add_argument("--g_lr", type=float, default=0.01, help="Learning rate for generator.")
    parser.add_argument("--d_lr", type=float, default=0.01, help="Learning rate for discriminator.")
    parser.add_argument("--factor", type=float, default=0.2, help="Factor by which the learning rate will be reduced.")
    parser.add_argument("--patience", type=int, default=10, help="Number of epochs with no improvement after which learning rate will be reduced.")
    parser.add_argument("--threshold", type=float, default=0.1, help="Threshold for measuring the new optimum, to only focus on significant changes. ")
    # Misc
    parser.add_argument("--log_dir", type=str, default="../run/", help="Where to save the log?")
    parser.add_argument("--log_name", type=str, required=True, help="Name of the log folder.")
    parser.add_argument("--show_freq", type=int, default=64, help="How frequently to show generated images?")
    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
    args = parser.parse_args()
    assert args.show_freq > 0
    assert 0.0 <= args.flip_rate <= 1.0
    # Check before run.
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    log_dir = os.path.join(args.log_dir, args.log_name)
    # Setting up logger; all subsequent print() output goes to the log file
    # because sys.stdout is replaced.
    log_file = datetime.now().strftime("%Y-%m-%d-%H-%M-%S.log")
    sys.stdout = Logger(os.path.join(log_dir, log_file))
    print(args)
    # gpu_ids is a string of single-digit ids (e.g. "01"); each character must
    # be an integer — a comma in the string would fail this check.
    for s in args.gpu_ids:
        try:
            int(s)
        except ValueError as e:
            print("Invalid gpu id:{}".format(s))
            raise ValueError
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(args.gpu_ids)
    if args.gpu_ids:
        if torch.cuda.is_available():
            use_gpu = True
            torch.cuda.manual_seed_all(args.seed)
        else:
            use_gpu = False
    else:
        use_gpu = False
    torch.manual_seed(args.seed)
    dataloader, size = build_dataloader(args.batch_size, args.num_workers,
                                        use_gpu, args.font, args.size,
                                        args.from_unicode, args.to_unicode)
    model = GAN(args.num_fakes, args.rand_dim, size, use_gpu)
    criterion = BCELoss()
    # Separate SGD optimizers and plateau schedulers for D and G; they are
    # passed to train() as (d, g) tuples.
    d_optimizer = torch.optim.SGD(model.discriminator.parameters(),
                                  lr=args.d_lr, momentum=0.9)
    g_optimizer = torch.optim.SGD(model.generator.parameters(),
                                  lr=args.g_lr, momentum=0.9)
    d_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        d_optimizer, mode="min", factor=args.factor, patience=args.patience,
        verbose=True, threshold=args.threshold)
    g_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        g_optimizer, mode="min", factor=args.factor, patience=args.patience,
        verbose=True, threshold=args.threshold)
    optimizer = d_optimizer, g_optimizer
    scheduler = d_scheduler, g_scheduler
    if use_gpu:
        model = model.cuda()
        model = torch.nn.DataParallel(model)
    print("Start training...")
    start = datetime.now()
    with SummaryWriter(log_dir) as writer:
        for epoch in range(args.epochs):
            # Record the current learning rates before this epoch's training.
            for i, param_group in enumerate(d_optimizer.param_groups):
                d_learning_rate = float(param_group["lr"])
                writer.add_scalar("d_lr_group_{0}".format(i),
                                  d_learning_rate, global_step=epoch)
            for i, param_group in enumerate(g_optimizer.param_groups):
                g_learning_rate = float(param_group["lr"])
                writer.add_scalar("g_lr_group_{0}".format(i),
                                  g_learning_rate, global_step=epoch)
            train(model, dataloader, criterion, optimizer, use_gpu, writer,
                  epoch, scheduler, args.num_fakes, args.flip_rate,
                  args.show_freq)
            # Overwrite the whole model (not just weights) every epoch.
            torch.save(model, os.path.join(log_dir, "latest.pth"))
    elapsed_time = str(datetime.now() - start)
    print("Finish training. Total elapsed time %s." % elapsed_time)
# print('Pool Size: {}'.format(pool_size)) print('Epochs: {}'.format(epochs)) print('Seed: {}'.format(seed)) print('=' * 30 + '\n') for epoch in range(epochs): print('=' * 30) print('Starting epoch {} of {}'.format(epoch, epochs)) X_train, y_train = shuffle(X_train, y_train) loss_train = 0.0 loss_test = 0.0 acc_train = 0.0 acc_test = 0.0 for i, (image, label) in enumerate(zip(X_train, y_train)): loss, acc = train(image, label, model, alpha) loss_train += loss acc_train += acc for i, (image, label) in enumerate(zip(X_test, y_test)): _, loss, acc = forward(image, label, model) loss_test += loss acc_test += acc loss_train_seq.append(loss_train / train_examples) loss_test_seq.append(loss_test / test_examples) acc_train_seq.append(acc_train / train_examples) acc_test_seq.append(acc_test / test_examples) print('Test Loss: {}\nTest Accuracy: {} %'.format( loss_test_seq[-1], acc_test_seq[-1] * 100))
def main():
    """CIFAR-10 active-learning experiment driver.

    Two optional phases, controlled by CLI flags:
      * --active-learning: train on the initial labelled set and log
        per-epoch prediction changes on the unlabelled pool.
      * --train-on-updated: sample new examples from the logged prediction
        changes, update the labelled indices, and retrain from scratch.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='CIFAR10 Example')
    parser.add_argument('--root', type=str, metavar='S',
                        help='Path to the root.')
    parser.add_argument('--init-num-labelled', type=int, default=None,
                        metavar='N',
                        help='Initial number of labelled examples.')
    parser.add_argument('--batch-size', type=int, default=100, metavar='N',
                        help='total batch size for training (default: 100)')
    parser.add_argument('--init-epochs', type=int, metavar='N',
                        help='number of epochs to train for active learning.')
    parser.add_argument('--train-on-updated', default=False, type=str2bool,
                        metavar='BOOL',
                        help='Train on updated data? (default: False)')
    parser.add_argument('--active-learning', default=False, type=str2bool,
                        metavar='BOOL',
                        help='Run proposed active learning? (default: False)')
    parser.add_argument('--skip', type=int, default=0, metavar='N',
                        help='Skip the first N epochs when computing the '
                             'accumulated prediction changes.')
    # Help text corrected: the actual default below is 500, not 1000.
    parser.add_argument('--test-batch-size', type=int, default=500,
                        metavar='N',
                        help='input batch size for testing (default: 500)')
    parser.add_argument('--epochs', type=int, metavar='N',
                        help='number of epochs to train.')
    # Help text corrected: the actual default below is 0.1, not 0.01.
    parser.add_argument('--lr', type=float, default=0.1, metavar='LR',
                        help='learning rate (default: 0.1)')
    parser.add_argument('--momentum', type=float, default=0.0, metavar='M',
                        help='SGD momentum (default: 0.0)')
    parser.add_argument('--seed', type=int, metavar='S',
                        help='Seed for random number generator.')
    parser.add_argument('--log-interval', type=int, default=1, metavar='N',
                        help='how many batches to wait before logging '
                             'training status')
    parser.add_argument('--num-workers', type=int, default=1, metavar='N',
                        help='Number of workers for dataloader (default: 1)')
    parser.add_argument('--num-to-sample', type=int, metavar='N',
                        help='Number of unlabelled examples to be sampled')
    parser.add_argument('--validate', default=False, type=str2bool,
                        metavar='BOOL',
                        help='Use validation set instead of test set? '
                             '(default: False)')
    # NOTE(review): 'default_ouput.csv' looks like a typo, but downstream
    # tooling may depend on the exact filename — left unchanged.
    parser.add_argument('--output', default='default_ouput.csv', type=str,
                        metavar='S', help='File name for the output.')
    args = parser.parse_args()

    torch.manual_seed(args.seed)  # set seed for pytorch
    use_cuda = torch.cuda.is_available()
    args.num_classes = 10
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = ({'num_workers': args.num_workers, 'pin_memory': True}
              if use_cuda else {})

    # ---- Phase 1: active-learning sampling via prediction change ---------
    if args.active_learning:
        train_dataset = cifar10.CIFAR10(root=args.root, dataset='train',
                                        init_n_labeled=args.init_num_labelled,
                                        seed=args.seed, download=True,
                                        transform=transforms.Compose(
                                            [transforms.ToTensor()]),
                                        target_transform=None,
                                        indices_name=None)  # initialise indices
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            sampler=SubsetRandomSampler(train_dataset.l_indices),
            **kwargs)
        test_loader = torch.utils.data.DataLoader(
            cifar10.CIFAR10(args.root, 'test', seed=args.seed,
                            transform=transforms.Compose(
                                [transforms.ToTensor()])),
            batch_size=args.test_batch_size,
            shuffle=False,
            **kwargs)

        # Train on the initial labelled set.
        global_step = 0
        model = resnet18(pretrained=False, progress=False,
                         num_classes=args.num_classes).to(device)
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              momentum=args.momentum)
        method = PredictionChange(u_indices=train_dataset.u_indices,
                                  model=model,
                                  dataset=train_dataset,
                                  data_name='cifar10')
        logs = {"train_losses": [], "train_acces": [], "test_acces": [],
                "pred_changes": []}
        logger = Logger(logs)
        for epoch in range(1, args.init_epochs + 1):
            start = time.time()
            global_step, train_loss, train_acc = utils.train(
                args, model, device, train_loader, optimizer, epoch,
                global_step)
            print('Training one epoch took: {:.4f} seconds.\n'.format(
                time.time() - start))
            test_acc, _ = utils.test(model, device, test_loader)
            print('Computing prediction changes...')
            pred_change = method.compute_pred_changes(model)
            logger.append(train_losses=train_loss, train_acces=train_acc,
                          test_acces=test_acc, pred_changes=pred_change)
        # save the logs
        train_dataset.save_logs(logger.logs)

    # ---- Phase 2: training on updated indices ----------------------------
    if args.train_on_updated:
        import os

        # Create Dataset object and load initial indices.
        train_dataset = cifar10.CIFAR10(
            root=args.root, dataset='train',
            init_n_labeled=args.init_num_labelled, seed=args.seed,
            download=False,
            transform=transforms.Compose([transforms.ToTensor()]),
            target_transform=None,
            indices_name="init_indices.npz")  # load initial indices from file
        logs_path = os.path.join(train_dataset.init_folder, 'logs.npz')
        print("Updating indices using log file: {}...".format(logs_path))
        start = time.time()

        # Sampling using the proposed prediction-change method.
        method = PredictionChange(u_indices=train_dataset.u_indices,
                                  dataset=train_dataset,
                                  data_name='CIFAR-10')
        sample = method.select_batch_from_logs(N=args.num_to_sample,
                                               skip=args.skip,
                                               path=logs_path,
                                               key="pred_changes")
        # Update and persist the updated labelled indices.
        filename_updated_indices = "updated_indices_N_{}_skip_{}".format(
            args.num_to_sample, args.skip)
        method.update_indices(dataset=train_dataset, indices=sample,
                              filename=filename_updated_indices)
        print('Active learning sampling took: {:.4f} seconds.\n'.format(
            time.time() - start))

        print("Training on updated labelled training set...")
        train_dataset = cifar10.CIFAR10(
            root=args.root, dataset='train',
            init_n_labeled=args.init_num_labelled, seed=args.seed,
            download=False,
            transform=transforms.Compose([transforms.ToTensor()]),
            target_transform=None,
            indices_name=filename_updated_indices + ".npz")  # updated indices
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            sampler=SubsetRandomSampler(train_dataset.l_indices),
            **kwargs)
        if args.validate:
            test_or_validate = 'validation'
        else:
            test_or_validate = 'test'
        test_loader = torch.utils.data.DataLoader(
            cifar10.CIFAR10(args.root, test_or_validate, seed=args.seed,
                            transform=transforms.Compose(
                                [transforms.ToTensor()])),
            batch_size=args.test_batch_size,
            shuffle=False,
            **kwargs)
        model = resnet18(pretrained=False, progress=False,
                         num_classes=args.num_classes).to(device)
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              momentum=args.momentum)
        global_step = 0
        for epoch in range(1, args.epochs + 1):
            start = time.time()
            global_step, _, _ = utils.train(args, model, device,
                                            train_loader, optimizer, epoch,
                                            global_step)
            print('\nTraining one epoch took: {:.4f} seconds.\n'.format(
                time.time() - start))
            test_acc, _ = utils.test(model, device, test_loader)
        # Append the final test accuracy for this seed to the output CSV.
        with open(args.output, 'a') as write_file:
            writer = csv.writer(write_file)
            writer.writerow([args.seed, test_acc])
#from model import model from utils import utils from data_load import Data_loading from evaluation import evaluation from visualization import visualization import numpy as np if __name__ == "__main__": """ utils input arguments: (option for data set, option for using different model, option for doing different tasks, option for choosing random walk strategy,option of whether to add structure) """ utils = utils(6, 3, 1, 2, 1) utils.config_train_test() utils.config_model() utils.init_walk_prob() utils.train() evl = evaluation(utils, 1) #evl.evaluate(utils) #vis = visualization(utils,evl) #vis.get_2d_rep() #vis.plot_2d()
def main():
    """Train a model with Standalone Center Loss on a small image dataset."""
    parser = argparse.ArgumentParser(description="Standalone Center Loss.")
    # Dataset
    parser.add_argument("--dataset", type=str, default="fashion-mnist",
                        choices=["mnist", "fashion-mnist", "cifar-10"])
    parser.add_argument("--num_workers", type=int, default=4,
                        help="Number of data loading workers.")
    # Optimization
    parser.add_argument("--epochs", type=int, default=100,
                        help="Number of epochs.")
    parser.add_argument("--batch_size", type=int, default=128,
                        help="Batch size.")
    parser.add_argument("--gpu_ids", type=str, default='',
                        help="GPUs for running this script.")
    parser.add_argument("--lr", type=float, default=0.01,
                        help="Learning rate for gradient descent.")
    parser.add_argument("--weight_intra", type=float, default=1.0,
                        help="Weight for intra loss.")
    parser.add_argument('--weight_inter', type=float, default=0.1,
                        help="Weight for inter loss.")
    parser.add_argument("--factor", type=float, default=0.2,
                        help="Factor by which the learning rate will be reduced.")
    parser.add_argument("--patience", type=int, default=10,
                        help="Number of epochs with no improvement after which learning rate will be reduced.")
    parser.add_argument("--threshold", type=float, default=0.1,
                        help="Threshold for measuring the new optimum, to only focus on significant changes. ")
    # Model
    parser.add_argument("--model", type=str, default="resnet",
                        choices=["resnet"])
    parser.add_argument("--feat_dim", type=int, default=128,
                        help="Dimension of the feature.")
    # Misc
    parser.add_argument("--log_dir", type=str, default="./run/",
                        help="Where to save the log?")
    parser.add_argument("--log_name", type=str, required=True,
                        help="Name of the log folder.")
    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
    parser.add_argument("--eval_freq", type=int, default=1,
                        help="How frequently to evaluate the model?")
    parser.add_argument("--vis", action="store_true",
                        help="Whether to visualize the features?")
    args = parser.parse_args()

    # Check before run.
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    log_dir = os.path.join(args.log_dir, args.log_name)

    # Tee stdout into a timestamped, dataset-tagged log file.
    log_file = datetime.now().strftime(
        "%Y-%m-%d-%H-%M-%S_{}.log".format(args.dataset))
    sys.stdout = Logger(os.path.join(log_dir, log_file))
    print(args)

    # Validate the --gpu_ids string: every character must be a digit.
    for s in args.gpu_ids:
        try:
            int(s)
        except ValueError as e:
            print("Invalid gpu id:{}".format(s))
            # Chain the original exception so the bad id is kept in the
            # traceback (the original raised a bare ValueError class).
            raise ValueError("Invalid gpu id: {}".format(s)) from e
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(args.gpu_ids)

    # Use the GPU only when ids were supplied AND CUDA is available.
    if args.gpu_ids and torch.cuda.is_available():
        use_gpu = True
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        use_gpu = False
    torch.manual_seed(args.seed)

    trainloader, testloader, input_shape, classes = load_dataset(
        args.dataset, args.batch_size, use_gpu, args.num_workers)
    model = build_model(args.model, input_shape, args.feat_dim, len(classes))
    criterion = StandaloneCenterLoss(len(classes), feat_dim=args.feat_dim,
                                     use_gpu=use_gpu)
    # The center-loss criterion carries learnable parameters (the centers),
    # so they are optimized jointly with the model.
    optimizer = torch.optim.SGD(
        list(model.parameters()) + list(criterion.parameters()),
        lr=args.lr, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="max", factor=args.factor, patience=args.patience,
        verbose=True, threshold=args.threshold)

    if use_gpu:
        model = model.cuda()
        model = torch.nn.DataParallel(model)

    print("Start training...")
    start = datetime.now()
    with SummaryWriter(log_dir) as writer:
        for epoch in range(args.epochs):
            train(model, trainloader, criterion, args.weight_intra,
                  args.weight_inter, optimizer, use_gpu, writer, epoch,
                  args.epochs, args.vis, args.feat_dim, classes)
            if epoch % args.eval_freq == 0 or epoch == args.epochs - 1:
                # NOTE: `eval` here is the project's evaluation routine,
                # not the Python builtin.
                eval(model, testloader, criterion, scheduler, use_gpu,
                     writer, epoch, args.epochs, args.vis, args.feat_dim,
                     classes)
    elapsed_time = str(datetime.now() - start)
    print("Finish training. Total elapsed time %s." % elapsed_time)
def main(args):
    """Train a WeightedAggregator + SIGN model with PaddlePaddle.

    Loads precomputed neighbor-averaged features and node splits from disk,
    trains for args.num_epochs, and logs train/val accuracy.
    """
    if args.seed is not None:
        paddle.fluid.Program.random_seed = args.seed
        np.random.seed(args.seed)
    if args.gpu < 0:
        device = "cpu"
    else:
        # BUG FIX: the original used a plain string "cuda:{args.gpu}" which
        # was never interpolated; it must be an f-string.
        device = f"cuda:{args.gpu}"

    # Load dataset
    # data = load_data(device, args)
    # g, labels, num_classes, train_nid, val_nid, test_nid = data
    # NOTE(review): "lables.npy" looks like a typo, but the file on disk is
    # presumably named this way — do not "fix" without renaming the file.
    labels = np.load("./data/lables.npy")
    num_classes = np.load("./data/num_classes.npy")
    train_nid = np.load("./data/train_nid.npy")
    val_nid = np.load("./data/val_nid.npy")
    test_nid = np.load("./data/test_nid.npy")
    evaluator = get_evaluator(args.dataset)

    # Preprocess neighbor-averaged features over sampled relation subgraphs.
    rel_subsets = []
    with paddle.no_grad():
        feats = []
        for i in range(args.R + 1):
            # Dataset must be downloaded from the OGB website (or data88697
            # on AiStudio) and preprocessed per the official tutorial.
            feature = np.load(f'../data/data88697/feat{i}.npy')
            feats.append(paddle.to_tensor(feature))
        # feats = preprocess_features(g, rel_subsets, args, device)
        print("Done preprocessing")
    # labels = labels.to(device)
    # Release the graph since we are not going to use it later.
    g = None

    # Set up logging
    logging.basicConfig(format='[%(levelname)s] %(message)s',
                        level=logging.INFO)
    logging.info(str(args))

    _, num_feats, in_feats = feats[0].shape
    logging.info("new input size: {} {}".format(num_feats, in_feats))

    # Create model
    num_hops = args.R + 1  # include self feature hop 0
    model = nn.Sequential(
        WeightedAggregator(num_feats, in_feats, num_hops),
        SIGN(in_feats, args.num_hidden, num_classes, num_hops,
             args.ff_layer, args.dropout, args.input_dropout))

    if len(labels.shape) == 1:
        # single label multi-class
        loss_fcn = nn.NLLLoss()
    else:
        # multi-label multi-class
        loss_fcn = nn.KLDivLoss(reduction='batchmean')

    print('!' * 100)
    optimizer = paddle.optimizer.Adam(parameters=model.parameters(),
                                      weight_decay=args.weight_decay)
    # optimizer = paddle.optimizer.Adam(parameters=model.parameters())

    # Start training. The log file is managed by `with` so it is closed
    # even if training raises (the original leaked the handle on error).
    best_epoch = 0
    best_val = 0
    with open('log.txt', 'w+') as f:
        for epoch in range(1, args.num_epochs + 1):
            start = time.time()
            print(epoch)
            train(model, feats, labels, train_nid, loss_fcn, optimizer,
                  args.batch_size)
            if epoch % args.eval_every == 0:
                with paddle.no_grad():
                    train_res, val_res, test_res = test(
                        model, feats, labels, train_nid, val_nid, test_nid,
                        evaluator, args.eval_batch_size)
                end = time.time()
                val_acc = val_res[0]
                log = "Epoch {}, Times(s): {:.4f}".format(epoch, end - start)
                log += ", Accuracy: Train {:.4f}, Val {:.4f}".format(
                    train_res[0], val_res[0])
                log += f", best_acc:{best_val}"
                logging.info(log)
                print(log, file=f, flush=True)
                if val_acc > best_val:
                    best_val = val_acc
                    best_epoch = epoch
    logging.info("Best Epoch {}, Val {:.4f}".format(best_epoch, best_val))
e = 8. epsilon = e/255. max_iter= int(min(e+4, 1.25*e)) def normalize(img, mean=mean, std=std): img_n = img - mean img_n = img_n / std return img_n adversary = PGDAttack(lambda x: net(x), eps=epsilon, nb_iter=7, ord=np.inf, eps_iter=epsilon/4.) writer = SummaryWriter(comment=tensorboard_comment) for epoch in range(start_epoch+1, nb_epoch+1): if epoch >= args.epoch_adv: train_acc, train_loss = train(epoch, net, train_loader, optimizer, criterion_da, args, adv_training=True, epsilon=args.eps_train/255., alpha=args.alpha_train/255., num_iter=args.num_iter) else: train_acc, train_loss = train(epoch, net, train_loader, optimizer, criterion_class, args, adv_training=False) net.eval() val_acc, val_loss = test(net, val_loader, criterion_class, args) # adv_acc, adv_loss, _, _ = adv_test(net, val_loader, criterion_class, adversary, epsilon, args, store_imgs=False) # writer.add_scalar('adv_acc', adv_acc, epoch) writer.add_scalar('train_acc', train_acc, epoch) writer.add_scalar('train_loss', train_loss, epoch) writer.add_scalar('val_acc', val_acc, epoch) writer.add_scalar('val_loss', val_loss, epoch) save_model(val_acc, net, optimizer, epoch, os.path.join("model", "checkpoints"), filename) if args.sgd: scheduler.step()
def main():
    """Train a lane-detection model that regresses Bezier control points."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, required=True,
                        help="Path to the dataset directory.")
    parser.add_argument("--degree", type=int, default=5,
                        help="Degree of the bezier curves.")
    parser.add_argument("--log_dir", type=str, default="runs",
                        help="Path to save the tf event.")
    parser.add_argument("--log_name", type=str, required=True,
                        help="Name of the experiment.")
    parser.add_argument("--beta", type=float, default=30,
                        help="Loss balancing factor.")
    parser.add_argument("--weight_dir", type=str, default="weights",
                        help="Folder to save the model weights.")
    parser.add_argument("--pretrained_weight", type=str, required=True,
                        help="Path to the pretrained weight.")
    parser.add_argument("--gpu_ids", type=str, default='',
                        help="Specify the GPU ids.")
    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
    parser.add_argument("--batch_size", type=int, default=256,
                        help="Batch size.")
    parser.add_argument("--num_workers", type=int, default=12,
                        help="Number of workers.")
    parser.add_argument("--epochs", type=int, default=100,
                        help="Number of epochs.")
    parser.add_argument("--input_size", type=int, nargs=2, required=True,
                        help="Size of the input image (w, h).")
    parser.add_argument("--max_lane", type=int, default=4,
                        help="Maximum number of lanes.")
    parser.add_argument("--num_points", type=int, default=72,
                        help="Number of points for computing the loss.")
    parser.add_argument("--feat_dim", type=int, default=384,
                        help="The output feature dimension of the backbone.")
    parser.add_argument("--lr", type=float, default=1e-3,
                        help="Learning rate.")
    parser.add_argument("--momentum", type=float, default=0.9,
                        help="Momentum rate.")
    parser.add_argument("--factor", type=float, default=0.5,
                        help="Factor by which the learning rate will be reduced.")
    parser.add_argument("--patience", type=int, default=15,
                        help="Number of epochs with no improvement after which learning rate will be reduced.")
    parser.add_argument("--threshold", type=float, default=1e-2,
                        help="Threshold for measuring the new optimum, to only focus on significant changes. ")
    parser.add_argument("--eval_freq", type=int, default=1,
                        help="Evaluate frequency.")
    args = parser.parse_args()
    print(args)

    # Validate the --gpu_ids string: every character must be a digit.
    for s in args.gpu_ids:
        try:
            int(s)
        except ValueError as e:
            print("Invalid gpu id: {}".format(s))
            # Chain the original error instead of raising the bare class so
            # the offending id is preserved in the traceback.
            raise ValueError("Invalid gpu id: {}".format(s)) from e
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(args.gpu_ids)

    if args.gpu_ids and torch.cuda.is_available():
        use_gpu = True
        cudnn.benchmark = True
        # Seed every visible GPU — the model is wrapped in DataParallel
        # below. (The original used manual_seed, which seeds only the
        # current device.)
        torch.cuda.manual_seed_all(args.seed)
    else:
        use_gpu = False

    # Per-run log and weight folders, keyed by experiment name + timestamp.
    logtime = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    log_dir = os.path.join(args.log_dir, args.log_name, logtime)
    train_log = os.path.join(log_dir, "train")
    val_log = os.path.join(log_dir, "val")
    mkdir(train_log)
    mkdir(val_log)
    weight_dir = os.path.join(args.weight_dir, args.log_name, logtime)
    mkdir(weight_dir)

    train_loader, val_loader, test_loader, num_fc_nodes = build_dataloader(
        args.data, args.batch_size, tuple(args.input_size), args.degree,
        args.num_points, args.max_lane, use_gpu, args.num_workers)
    model = CustomResnet(args.feat_dim, args.pretrained_weight,
                         args.max_lane, num_fc_nodes)
    optimizer = SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=args.factor,
                                  patience=args.patience,
                                  threshold=args.threshold, verbose=True)
    # Combined objective: cross-entropy for lane classification plus a
    # distance loss on sampled Bezier points.
    dsd_loss = DSDRandomLoss(args.degree, args.max_lane, args.num_points)
    xent_loss = torch.nn.CrossEntropyLoss()
    criterion = {"xent": xent_loss, "dsd": dsd_loss}

    if use_gpu:
        model = model.cuda()
        model = torch.nn.DataParallel(model)

    # One `with` statement for both writers instead of two nested blocks.
    with SummaryWriter(log_dir=train_log) as tr_writer, \
            SummaryWriter(log_dir=val_log) as val_writer:
        # Best-so-far validation metrics, dumped to best_result.json.
        js = {"best_epoch": 0, "loss": 1e+12, "cls_loss": 1e+12,
              "dsd_loss": 1e+12, "acc": 0.0, "seed": args.seed}
        for e in range(args.epochs):
            # Log the current learning rate of every parameter group.
            for i, param_group in enumerate(optimizer.param_groups):
                learning_rate = float(param_group['lr'])
                tr_writer.add_scalar("lr of group {}".format(i),
                                     learning_rate, global_step=e)
            train(model, train_loader, optimizer, criterion, args.beta,
                  tr_writer, e, args.degree, use_gpu)
            if e % args.eval_freq == 0 or e == args.epochs - 1:
                val_loss, val_cls_loss, val_dsd_loss, val_acc = evaluate(
                    model, val_loader, criterion, args.beta, scheduler,
                    val_writer, e, args.degree, weight_dir, use_gpu)
                if val_loss < js["loss"]:
                    js["best_epoch"] = e
                    js["loss"] = val_loss
                    js["cls_loss"] = val_cls_loss
                    js["dsd_loss"] = val_dsd_loss
                    js["acc"] = val_acc
                    with open(os.path.join(log_dir, "best_result.json"),
                              'w') as f:
                        json.dump(js, f)