def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        model = custom_models.__dict__[args.arch]([2, 2, 2, 2],
                                                  pooling_type='max',
                                                  in_chns=1,
                                                  num_classes=12,
                                                  inplanes=64)
        # print("=> creating model '{}'".format(args.arch))
        # model = models.__dict__[args.arch]()

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to
            # all available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    args.mean = [0.5, 0.5, 0.5]
    args.std = [0.5, 0.5, 0.5]
    trans_funcs = []

    val_loader = torch.utils.data.DataLoader(get_val_dataset(
        args.data_dir + '/img/', args.data_dir + '/gt/',
        args.data_dir + '/all_imgs.txt', args.test_inds, trans_funcs,
        args.mean, args.std, args.target_size),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    train_dataset = get_train_dataset(args.data_dir + '/img/',
                                      args.data_dir + '/gt/',
                                      args.data_dir + '/all_imgs.txt',
                                      args.test_inds, trans_funcs, args.mean,
                                      args.std, args.target_size)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
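# The entry point that spawns main_worker is not shown in this section. The
# sketch below is an assumption of how such a per-GPU worker is typically
# launched (it mirrors the torchvision ImageNet example, reading world_size,
# gpu, and multiprocessing_distributed from args as main_worker does); it is
# not necessarily the authors' own launcher.
import torch
import torch.multiprocessing as mp


def launch_main(args):
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # One process per GPU; world_size becomes the total number of
        # processes across all nodes.
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(main_worker, nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # Single-process path: call the worker directly on args.gpu.
        main_worker(args.gpu, ngpus_per_node, args)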
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
    model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    train_data = get_train_dataset(args.set, args)

    num_train = len(train_data) // 1
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=8)

    valid_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(
            indices[split:num_train]),
        pin_memory=True, num_workers=8)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    architect = Architect(model, args)

    for epoch in range(args.epochs):
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        genotype = model.genotype()
        logging.info('genotype = %s', genotype)

        # print(F.softmax(model.alphas_normal, dim=-1))
        # print(F.softmax(model.alphas_reduce, dim=-1))

        # training
        train_acc, train_obj = train(train_queue, valid_queue, model,
                                     architect, criterion, optimizer, lr,
                                     epoch)
        logging.info('train_acc %f', train_acc)

        # validation
        if args.epochs - epoch <= 1:
            valid_acc, valid_obj = infer(valid_queue, model, criterion)
            logging.info('valid_acc %f', valid_acc)

        utils.save(model, os.path.join(args.save, 'weights.pt'))
        scheduler.step()

    print('Experiment Dir:', args.save)
            variable_parameters *= dim.value
        print(variable_parameters)
        total_parameters += variable_parameters
    print('total params: ')
    print(total_parameters)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    lin_net.crf_feature_net.overwrite_init(sess)

    summary = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(
        os.path.join(ARGS.logdir_path, 'summary'),
        sess.graph,
    )

    dataset_reader = RandDatasetReader(get_train_dataset(ARGS.hdr_prefix), b)

    for it in range(ARGS.it_num):
        print(it)
        if it == 0 or it % 10000 == 9999:
            print('start save')
            checkpoint_path = os.path.join(ARGS.logdir_path, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=it)
            print('finish save')
        hdr_val, crf_val, invcrf_val, t_val = dataset_reader.read_batch_data()
        _, summary_val = sess.run(
            [train_op, summary], {
                hdr: hdr_val,
                crf: crf_val,
                invcrf: invcrf_val,
                t: t_val,
    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, config.VAL.IMG_PER_GPU, drop_last=False)
    loader = torch.utils.data.DataLoader(subset,
                                         num_workers=config.VAL.NUM_WORKERS,
                                         batch_sampler=batch_sampler)
    return loader


if __name__ == '__main__':
    from easydict import EasyDict as edict
    from exps.baseline.config import config
    from dataset import get_train_dataset, get_val_dataset

    dataset = get_train_dataset(config)
    # dataset = get_val_dataset(config)

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    torch.distributed.init_process_group(backend="nccl", init_method='env://')

    # loader = train_loader(dataset, config)
    loader = val_loader(dataset, config, 0, 2)
    iter_loader = iter(loader)
    if args.local_rank == 0:
        lr, hr = iter_loader.next()
            len(self.text_map), len(self.code_map),
            self.stats['total_same_input_tests'],
            self.stats['total_same_other_tests']))
        if self.args.show_tags:
            for name, value in self.task_types_stats.iteritems():
                print("%s: %d" % (name, value))
            for name, value in self.tags_stats.iteritems():
                print("%s: %d" % (name, value))


def report_stats(args, dataset):
    ds = DatasetStats(args)
    for example in dataset.data:
        ds.update(example)
    ds.display()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Data Statistics')
    parser.add_argument('--dataset', type=str, default='karel')
    parser.add_argument('--dataset_max_size', type=int, default=0)
    parser.add_argument('--dataset_max_code_length', type=int, default=0)
    parser.add_argument('--show-tags', action='store_true', default=False)
    parser.add_argument('--vocab_min_freq', type=int, default=50)
    args, _ = parser.parse_known_args(sys.argv)

    import dataset
    args.batch_size = 1
    train_dataset = dataset.get_train_dataset(args)
    report_stats(args, train_dataset)
def main(args):
    parser = argparse.ArgumentParser(description='Variational AutoEncoders')
    parser.add_argument('data_dir', help='path to training data')
    parser.add_argument('--test_inds', type=int, nargs='+',
                        help='inds test participants')
    parser.add_argument('--test_file', type=str,
                        help='path to a file containing test inds')
    parser.add_argument('--target-size', default=260, type=int)
    parser.add_argument('-j', '--workers', default=4, type=int,
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--pred', type=str, default=None,
                        help='Only prediction')

    model_parser = parser.add_argument_group('Model Parameters')
    model_parser.add_argument('--model', default='vqvae',
                              choices=['vae', 'vqvae', 'resnet'],
                              help='autoencoder variant to use: vae | vqvae')
    model_parser.add_argument(
        '--batch-size', type=int, default=4, metavar='N',
        help='input batch size for training (default: 128)')
    model_parser.add_argument('--hidden', type=int, metavar='N',
                              help='number of hidden channels')
    model_parser.add_argument('-k', '--dict-size', type=int, dest='k',
                              metavar='K',
                              help='number of atoms in dictionary')
    model_parser.add_argument('-kl', '--kl', type=int, dest='kl',
                              default=None,
                              help='length of vectors in embedded space')
    model_parser.add_argument('--lr', type=float, default=None,
                              help='learning rate')
    model_parser.add_argument('--vq_coef', type=float, default=None,
                              help='vq coefficient in loss')
    model_parser.add_argument('--commit_coef', type=float, default=None,
                              help='commitment coefficient in loss')
    model_parser.add_argument('--kl_coef', type=float, default=None,
                              help='kl-divergence coefficient in loss')
    model_parser.add_argument('--gabor_layer', action='store_true',
                              default=False, help='using gabor like layer')
    parser.add_argument('--resume', type=str, default=None,
                        help='The path to resume.')

    training_parser = parser.add_argument_group('Training Parameters')
    training_parser.add_argument(
        '--dataset', default='custom',
        choices=['mnist', 'cifar10', 'imagenet', 'coco', 'custom'],
        help='dataset to use: mnist | cifar10 | imagenet | coco | custom')
    training_parser.add_argument(
        '--dataset_dir_name', default='',
        help='name of the dir containing the dataset if dataset == custom')
    training_parser.add_argument('--data-dir', default='/media/ssd/Datasets',
                                 help='directory containing the dataset')
    training_parser.add_argument(
        '--epochs', type=int, default=20, metavar='N',
        help='number of epochs to train (default: 10)')
    training_parser.add_argument('--max-epoch-samples', type=int,
                                 default=50000,
                                 help='max num of samples per epoch')
    training_parser.add_argument('--no-cuda', action='store_true',
                                 default=False, help='enables CUDA training')
    training_parser.add_argument('--seed', type=int, default=1, metavar='S',
                                 help='random seed (default: 1)')
    training_parser.add_argument('--gpus', default='0',
                                 help='gpus used for training - e.g 0,1,3')

    logging_parser = parser.add_argument_group('Logging Parameters')
    logging_parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    logging_parser.add_argument('--results-dir', metavar='RESULTS_DIR',
                                default='./results', help='results dir')
    logging_parser.add_argument('--save-name', default='',
                                help='saved folder')
    logging_parser.add_argument('--data-format', default='json',
                                help='in which format to save the data')
    model_parser.add_argument('--backbone', type=str, default=None, nargs='+',
                              help='details of backbone')
    args = parser.parse_args(args)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    lr = args.lr or default_hyperparams[args.dataset]['lr']
    k = args.k or default_hyperparams[args.dataset]['k']
    hidden = args.hidden or default_hyperparams[args.dataset]['hidden']
    num_channels = dataset_n_channels[args.dataset]

    save_path = ex_util.setup_logging_from_args(args)
    writer = SummaryWriter(save_path)

    # if test file is specified use it for selecting test train sets
    if args.test_file is not None:
        args.test_inds = args.test_file
    args.inv_func = None

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
        args.gpus = [int(i) for i in args.gpus.split(',')]
        torch.cuda.set_device(args.gpus[0])
        cudnn.benchmark = True
        torch.cuda.manual_seed(args.seed)

    if args.model == 'resnet':
        backbone = {
            'arch_name': args.backbone[0],
            'layer_name': args.backbone[1]
        }
        if len(args.backbone) > 2:
            backbone['weights_path'] = args.backbone[2]
        model = models[args.dataset][args.model](hidden, k=k, kl=args.kl,
                                                 num_channels=num_channels,
                                                 gabor_layer=args.gabor_layer,
                                                 backbone=backbone)
    else:
        model = models[args.dataset][args.model](hidden, k=k, kl=args.kl,
                                                 num_channels=num_channels,
                                                 gabor_layer=args.gabor_layer)
    if args.resume is not None:
        weights = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(weights)
    if args.cuda:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, int(args.epochs / 3), 0.5)

    # NOTE: right now there's no additional transformation function
    trans_funcs = []
    # normalisation
    args.mean = [0.5, 0.5, 0.5]
    args.std = [0.5, 0.5, 0.5]

    in_chns = 1
    if args.model == 'resnet':
        in_chns = 3
    val_dataset = get_val_dataset(args.data_dir + '/img/',
                                  args.data_dir + '/gt/',
                                  args.data_dir + '/all_imgs.txt',
                                  args.test_inds, trans_funcs, args.mean,
                                  args.std, args.target_size, chns=in_chns)
    # NOTE: shuffle is False
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True, sampler=None)

    if args.pred is not None:
        weights = torch.load(args.pred, map_location='cpu')
        model.load_state_dict(weights)
        model.cuda()
        predict_net(model, val_loader, save_path, args)
        return

    train_dataset = get_train_dataset(args.data_dir + '/img/',
                                      args.data_dir + '/gt/',
                                      args.data_dir + '/all_imgs.txt',
                                      args.test_inds, trans_funcs, args.mean,
                                      args.std, args.target_size,
                                      chns=in_chns)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True, sampler=None)

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console)

    for epoch in range(1, args.epochs + 1):
        train_losses = train(epoch, model, train_loader, optimizer, args.cuda,
                             args.log_interval, save_path, args, writer)
        test_losses = test_net(epoch, model, val_loader, args.cuda, save_path,
                               args, writer)
        ex_util.save_checkpoint(model, epoch, save_path)

        for k in train_losses.keys():
            name = k.replace('_train', '')
            train_name = k
            test_name = k.replace('train', 'test')
            writer.add_scalars(name, {
                'train': train_losses[train_name],
                'test': test_losses[test_name],
            })
        scheduler.step()
def main(opts):
    """Main function for the training pipeline

    :opts: command-line arguments
    :returns: None

    """
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)
    os.makedirs(os.path.join(log_dir, opts.run_name))

    pprint(vars(opts))
    with open(os.path.join(log_dir, opts.run_name, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)
    random.seed(opts.seed)

    ##########################################################################
    # Define all the necessary variables for model training and evaluation   #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    if opts.train_mode == 'combined':
        train_dataset = get_train_dataset(opts.data_root, opts, opts.folder1,
                                          opts.folder2, opts.folder3)
    elif opts.train_mode == 'oversampling':
        train_dataset = get_train_dataset_by_oversampling(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
    elif opts.train_mode == 'pretrain_and_finetune':
        train_dataset, finetune_dataset = get_pretrain_and_finetune_datast(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
        finetune_loader = torch.utils.data.DataLoader(
            finetune_dataset, batch_size=opts.batch_size,
            num_workers=opts.num_workers, drop_last=False, shuffle=True)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=opts.num_workers,
                                               drop_last=False,
                                               shuffle=True)

    val_dataset = get_val_dataset(os.path.join('data', 'val'), opts)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.eval_batch_size,
                                             shuffle=False,
                                             num_workers=opts.num_workers,
                                             drop_last=False)

    test_dataset = get_test_dataset(os.path.join('data', 'test'), opts)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=opts.eval_batch_size,
                                              shuffle=False,
                                              num_workers=opts.num_workers,
                                              drop_last=False)

    assert train_dataset.class_to_idx == val_dataset.class_to_idx == \
        test_dataset.class_to_idx, "Mapping not correct"

    model = get_model(opts)
    opts.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if torch.cuda.device_count() > 1 and not opts.no_data_parallel:
        model = nn.DataParallel(model)
    model = model.to(opts.device)

    optimizer = optim.RMSprop(model.parameters(), lr=opts.lr, alpha=0.9,
                              weight_decay=1e-5, momentum=0.9)
    scheduler = get_lr_scheduler(optimizer, opts)

    best_val_loss = float('inf')
    best_val_accu = float(0)
    best_val_rec = float(0)
    best_val_prec = float(0)
    best_val_f1 = float(0)
    best_val_auc = float(0)

    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(opts.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        # The actual training and validation step for each epoch   #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, optimizer,
                                               opts)
        if epoch == opts.finetune_epoch and \
                opts.train_mode == 'pretrain_and_finetune':
            train_loader = finetune_loader
            optimizer = optim.RMSprop(model.parameters(), lr=opts.lr,
                                      alpha=0.9, weight_decay=1e-5,
                                      momentum=0.9)
            scheduler = torch.optim.lr_scheduler.StepLR(
                optimizer, step_size=opts.step_size_finetuning,
                gamma=opts.gamma)

        # Run the validation set
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, opts)

        ##############################
        # Write to summary writer    #
        ##############################
        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        train_rec, val_rec = train_metric['recalls'], val_metric['recalls']
        train_prec, val_prec = train_metric['precisions'], val_metric[
            'precisions']
        train_f1, val_f1 = train_metric['f1'], val_metric['f1']
        train_auc, val_auc = train_metric['auc'], val_metric['auc']

        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_acc, epoch)
        writer.add_scalar('Precision/Train', train_prec, epoch)
        writer.add_scalar('Recall/Train', train_rec, epoch)
        writer.add_scalar('F1/Train', train_f1, epoch)
        writer.add_scalar('AUC/Train', train_auc, epoch)

        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_acc, epoch)
        writer.add_scalar('Precision/Val', val_prec, epoch)
        writer.add_scalar('Recall/Val', val_rec, epoch)
        writer.add_scalar('F1/Val', val_f1, epoch)
        writer.add_scalar('AUC/Val', val_auc, epoch)

        ##############################
        # Adjust the learning rate   #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler in ['step', 'cosine']:
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(epoch, opts.epochs, train_loss, val_loss, delta,
                             train_metric, val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(opts.save_model):
                torch.save(
                    model.state_dict(),
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if val_rec > best_val_rec:
            best_val_rec = val_rec

        if val_prec > best_val_prec:
            best_val_prec = val_prec

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            print(f'The best validation F1-score is now {best_val_f1}')
            print(f'The validation accuracy and AUC are now '
                  f'{val_acc} and {val_auc}')

        if val_auc > best_val_auc:
            best_val_auc = val_auc

        if iteration_change_loss == opts.patience and opts.early_stopping:
            print(('Early stopping after {0} iterations without the decrease '
                   + 'of the val loss').format(iteration_change_loss))
            break

    t_end_training = time.time()
    print(f'training took {t_end_training - t_start_training}s')
    print(f'Best validation accuracy: {best_val_accu}')
    print(f'Best validation loss: {best_val_loss}')
    print(f'Best validation precision: {best_val_prec}')
    print(f'Best validation recall: {best_val_rec}')
    print(f'Best validation f1: {best_val_f1}')
    print(f'Best validation AUC: {best_val_auc}')

    with torch.no_grad():
        if opts.train_mode in ['combined', 'oversampling']:
            model.load_state_dict(
                torch.load(
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth')))
        test_loss, test_metric = evaluate_model(model, test_loader, opts)
        print(f'The best test F1: {test_metric["f1"]}')
        print(f'The best test auc: {test_metric["auc"]}')
        print(f'The best test accuracy: {test_metric["accuracy"]}')
def main(opts):
    """Main function for the training pipeline

    :opts: command-line arguments
    :returns: None

    """
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)

    ##########################################################################
    # Define all the necessary variables for model training and evaluation   #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    train_dataset = get_train_dataset(root=os.path.join('data', 'train'))
    weights = make_weights_for_balanced_classes(train_dataset.imgs,
                                                len(train_dataset.classes))
    weights = torch.DoubleTensor(weights)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights, len(weights))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=6,
                                               drop_last=False,
                                               sampler=sampler)

    val_dataset = get_val_dataset(root=os.path.join('data', 'val'))
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.batch_size,
                                             shuffle=False,
                                             num_workers=6,
                                             drop_last=False)

    assert train_dataset.class_to_idx == val_dataset.class_to_idx, \
        "Mapping not correct"

    model = load_baseline(n_classes=2)
    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=opts.lr, weight_decay=0.1)

    if opts.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=3, factor=.3, threshold=1e-4, verbose=True)
    elif opts.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                                    gamma=opts.gamma)

    best_val_loss = float('inf')
    best_val_accu = float(0)

    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(opts.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        # The actual training and validation step for each epoch   #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, epoch,
                                               opts.epochs, optimizer, writer,
                                               current_lr, opts.log_every)
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, epoch,
                                                  opts.epochs, writer,
                                                  current_lr)

        ##############################
        # Write to summary writer    #
        ##############################
        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Train', train_metric['precisions'],
                          epoch)
        writer.add_scalar('Recall/Train', train_metric['recalls'], epoch)
        writer.add_scalar('F1/Train', train_metric['f1'], epoch)
        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Val', val_metric['precisions'], epoch)
        writer.add_scalar('Recall/Val', val_metric['recalls'], epoch)
        writer.add_scalar('F1/Val', val_metric['f1'], epoch)

        ##############################
        # Adjust the learning rate   #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(train_loss, val_loss, delta, train_metric,
                             val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        # file_name = ('val_acc_{}_train_acc_{}_epoch_{}.pth'.
        #              format(train_acc, val_acc, epoch))
        # torch.save(model, os.path.join(model_dir, opts.run_name, file_name))

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(opts.save_model):
                torch.save(model,
                           os.path.join(model_dir, opts.run_name, 'best.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if iteration_change_loss == opts.patience and opts.early_stopping:
            print(('Early stopping after {0} iterations without the decrease '
                   + 'of the val loss').format(iteration_change_loss))
            break

    t_end_training = time.time()
    print('training took {}s'.format(t_end_training - t_start_training))
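# Both training loops above call get_lr(optimizer) without defining it in this
# section. The helper below is a sketch of what such a function typically
# looks like (an assumption, not necessarily the authors' implementation): it
# reads the learning rate from the optimizer's first parameter group.
def get_lr(optimizer):
    """Return the learning rate of the first parameter group."""
    for param_group in optimizer.param_groups:
        return param_group['lr']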
weight_decay = 1e-3
norm = None
num_epochs = 350
cude_if = True

## Tensorboard
writer = None
# writer = SummaryWriter(log_dir='board_view/simple_endecoder/whole_data')

## Stage 2 Letter Data Set
persons = read_rawdata_to_person(data_path, all_data, millisec=1000,
                                 resample=resample)
dataset = data_split(persons, alpha=alpha)
trainset = get_train_dataset(dataset)
testset = get_test_dataset(dataset)

### write own dataloader
trainloader = t.utils.data.DataLoader(trainset, batch_size=batch_size,
                                      shuffle=True)
testloader = t.utils.data.DataLoader(testset, batch_size=batch_size,
                                     shuffle=True)

## Model Setting
model = Cnn_LstmNet(10, 32)
# model = ResNet_lstm(BasicBlock, [2, 2, 1, 1])

## Loss function and Optimizer
criterion = t.nn.CrossEntropyLoss()
def main():
    if device != 'cuda':
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    # setup criterion, model
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    label_criterion = nn.NLLLoss().to(device)
    domain_criterion = nn.NLLLoss().to(device)
    # model = NetworkFE(args.init_channels, NUM_CLASSES, args.layers, criterion)
    model = DANN(args.init_channels, NUM_CLASSES, args.layers,
                 label_criterion, domain_criterion)
    model = model.to(device)
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    # TODO: setup right optimizer
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optimizer = torch.optim.Adam(
    #     model.parameters(),
    #     args.learning_rate)

    src_train_data = get_train_dataset(args.src_set, args)
    tgt_train_data = get_train_dataset(args.tgt_set, args)

    num_train = min(len(src_train_data), len(tgt_train_data)) // 1
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))

    # DataLoader for src,tgt training data
    src_train_queue = torch.utils.data.DataLoader(
        src_train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=8)

    tgt_train_queue = torch.utils.data.DataLoader(
        tgt_train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=8)

    # DataLoader for src,tgt validation data
    src_valid_queue = torch.utils.data.DataLoader(
        src_train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(
            indices[split:num_train]),
        pin_memory=True, num_workers=8)

    tgt_valid_queue = torch.utils.data.DataLoader(
        tgt_train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(
            indices[split:num_train]),
        pin_memory=True, num_workers=8)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    # Use architect for domain adaptation
    architect = ArchitectDA(model, args)

    # import pdb; pdb.set_trace()

    # main loop
    for epoch in range(args.epochs):
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        genotype = model.genotype()
        logging.info('genotype = %s', genotype)

        # train step
        train_acc, train_obj = train(src_train_queue, tgt_train_queue,
                                     src_valid_queue, tgt_valid_queue, model,
                                     architect, label_criterion,
                                     domain_criterion, optimizer, lr, epoch)
        logging.info('train_acc %f', train_acc)

        # validation only on last epoch
        # if args.epochs-epoch<=1:
        src_val_acc, tgt_val_acc, valid_obj = infer(src_valid_queue,
                                                    tgt_valid_queue, model,
                                                    label_criterion,
                                                    domain_criterion)
        logging.info('src_val_acc %f tgt_val_acc %f', src_val_acc,
                     tgt_val_acc)

        # save model
        utils.save(model, os.path.join(args.save, 'weights.pt'))
        scheduler.step()

    print('Experiment Dir:', args.save)