def __init__(self, use_cuda, load_model, model_folder, train_directory,
             validation_directory, builder, loss_fn, args, multi_gpu=True):
    """Set up the trainer: logging, model, data pipelines, optimizer, scheduler.

    Args:
        use_cuda: when True, pin host memory for the validation loader.
        load_model: stored flag (not read in this constructor).
        model_folder: root directory; logs are written under ``<model_folder>/logs``.
        train_directory: dataset location handed to ``builder`` for training.
        validation_directory: dataset location handed to ``builder`` for validation.
        builder: callable producing a sample-set builder
            (called as ``builder(n_views, directory, IMAGE_SIZE, args, toRot=True, sample_size=SAMPLE_SIZE)``).
        loss_fn: training loss, stored for later use.
        args: namespace; ``log_file`` and ``n_views`` are read here.
        multi_gpu: when True, wrap the model in DataParallel over all visible GPUs.
    """
    self.use_cuda = use_cuda
    self.load_model = load_model
    self.model_folder = model_folder
    self.validation_directory = validation_directory
    self.train_directory = train_directory
    self.args = args
    self.builder = builder
    self.loss_fn = loss_fn
    self.logdir = join(model_folder, 'logs')
    self.writer = SummaryWriter(self.logdir)
    self.logger = Logger(self.args.log_file)
    self.itr = 0
    # Create Model
    self.model = self.create_model()
    if multi_gpu:
        self.model = torch.nn.DataParallel(self.model,
                                           device_ids=range(torch.cuda.device_count()))
    # Build validation set
    validation_builder = builder(self.args.n_views, validation_directory,
                                 IMAGE_SIZE, self.args, toRot=True,
                                 sample_size=SAMPLE_SIZE)
    validation_set = [validation_builder.build_set() for i in range(VAL_SEQS)]
    validation_set = ConcatDataset(validation_set)
    self.len_validation_set = len(validation_set)
    del validation_builder  # free the builder once the set is materialised
    self.validation_loader = DataLoader(
        validation_set,
        batch_size=8,
        shuffle=False,
        pin_memory=self.use_cuda,
    )
    self.validation_calls = 0
    # Build Training Set
    self.triplet_builder = builder(self.args.n_views,
                                   train_directory, IMAGE_SIZE, self.args,
                                   toRot=True, sample_size=SAMPLE_SIZE)
    # size-1 queue: the producer process blocks until the trainer consumes a set
    self.training_queue = multiprocessing.Queue(1)
    dataset_builder_process = multiprocessing.Process(
        target=self.build_set,
        args=(self.training_queue, self.triplet_builder, self.logger),
        daemon=True)
    dataset_builder_process.start()
    # Get Logger
    # Model specific setup
    # self.optimizer = optim.SGD(self.model.parameters(), lr=self.args.lr_start, momentum=0.9)
    self.optimizer = optim.Adam(self.model.parameters(), lr=0.001,
                                betas=(0.9, 0.999), eps=1e-08)
    # This will diminish the learning rate at the milestones ///// 0.1, 0.01, 0.001 if not using automized scheduler
    self.learning_rate_scheduler = lr_scheduler.ReduceLROnPlateau(
        self.optimizer, 'min')
# Loop over epochs. lr = args.lr best_val_loss = [] stored_loss = 100000000 # At any point you can hit Ctrl + C to break out of training early. try: optimizer = None # Ensure the optimizer is optimizing params, which includes both the model's weights as well as the criterion's weight (i.e. Adaptive Softmax) if args.optimizer == 'sgd': optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay) if args.optimizer == 'adam': optimizer = torch.optim.Adam(params, lr=args.lr, betas=(0, 0.999), eps=1e-9, weight_decay=args.wdecay) scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', 0.5, patience=2, threshold=0) for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() train() if 't0' in optimizer.param_groups[0]: tmp = {} for prm in model.parameters(): tmp[prm] = prm.data.clone() prm.data = optimizer.state[prm]['ax'].clone() val_loss2 = evaluate(val_data, eval_batch_size) print('-' * 89) print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format( epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2))) print('-' * 89)
elif model_name == 'densenet': model.classifier = torch.nn.Linear(2208, 3) if torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) model.to(device) return model model = get_model("resnet18") criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1) # prepare_dataset() train_transforms = transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize(mean=mean, std=std) ]) augment_transforms = transforms.Compose([ transforms.Resize((224, 224)), torchvision.transforms.RandomHorizontalFlip(p=1), torchvision.transforms.RandomRotation(20, resample=Image.BILINEAR), transforms.ToTensor(), transforms.Normalize(mean=mean, std=std) ])
def get_scheduler(optimizer, opt):
    """Return a learning-rate scheduler for ``optimizer`` selected by ``opt.lr_policy``.

    Supported policies: 'lambda' (linear decay), 'step'/'step2' (StepLR with
    gamma 0.5/0.1), 'plateau'/'plateau2' (ReduceLROnPlateau with factor
    0.1/0.2), and 'step_warmstart'/'step_warmstart2' (5-epoch warm start then
    fixed step-downs).

    Args:
        optimizer: the optimizer to schedule.
        opt: options namespace; reads ``lr_policy`` plus the policy-specific
            fields used below (``epoch_count``/``niter``/``niter_decay`` for
            'lambda', ``lr_decay_iters`` for the step policies).

    Returns:
        A configured ``torch.optim.lr_scheduler`` instance.

    Raises:
        NotImplementedError: for an unknown ``opt.lr_policy``.
    """
    print('opt.lr_policy = [{}]'.format(opt.lr_policy))
    if opt.lr_policy == 'lambda':
        # Full LR for (niter - epoch_count) epochs, then linear decay to 0
        # over niter_decay epochs.
        def lambda_rule(epoch):
            lr_l = 1.0 - max(0, epoch + 1 + opt.epoch_count - opt.niter) / float(opt.niter_decay + 1)
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.5)
    elif opt.lr_policy == 'step2':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
    elif opt.lr_policy == 'plateau':
        print('schedular=plateau')
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
                                                   threshold=0.01, patience=5)
    elif opt.lr_policy == 'plateau2':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2,
                                                   threshold=0.01, patience=5)
    elif opt.lr_policy == 'step_warmstart':
        def lambda_rule(epoch):
            # 5-epoch warm start at 0.1x, full LR until 100, then 0.1x / 0.01x.
            if epoch < 5:
                lr_l = 0.1
            elif 5 <= epoch < 100:
                lr_l = 1
            elif 100 <= epoch < 200:
                lr_l = 0.1
            elif 200 <= epoch:
                lr_l = 0.01
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step_warmstart2':
        def lambda_rule(epoch):
            # Same warm start, with the step-downs at 50 / 100 instead.
            if epoch < 5:
                lr_l = 0.1
            elif 5 <= epoch < 50:
                lr_l = 1
            elif 50 <= epoch < 100:
                lr_l = 0.1
            elif 100 <= epoch:
                lr_l = 0.01
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    else:
        # BUG FIX: the original *returned* a NotImplementedError instance
        # (with printf-style args that were never interpolated) instead of
        # raising it, so callers silently received an exception object.
        raise NotImplementedError(
            'learning rate policy [%s] is not implemented' % opt.lr_policy)
    return scheduler
def train(train_iter, dev_iter, test_iter, model, args):
    """Train a text classifier, periodically evaluating and checkpointing.

    Args:
        train_iter / dev_iter / test_iter: torchtext-style batch iterators
            (batches expose ``.text``, ``.label``, ``.batch_size``).
        model: the network to train.
        args: namespace with optimizer selection flags (``Adam``/``SGD``/
            ``Adadelta``), ``lr``, ``init_weight_decay``, ``epochs``,
            ``log_interval``, ``test_interval``, ``save_interval``,
            ``save_dir``, ``cuda``, ``init_clip_max_norm``.

    Returns:
        The number of checkpoints saved (``model_count``).
    """
    if args.cuda:
        model.cuda()
        # torch.cuda.seed()
        torch.cuda.manual_seed(hyperparams.seed_num)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-8)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.init_weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,momentum=)
    # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)
    # NOTE(review): if none of the three flags is set, ``optimizer`` below is
    # unbound and the first use raises NameError — confirm callers always set one.
    if args.Adam is True:
        print("Adam Training......")
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.init_weight_decay)
    elif args.SGD is True:
        print("SGD Training.......")
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    weight_decay=args.init_weight_decay,
                                    momentum=args.momentum_value)
    elif args.Adadelta is True:
        print("Adadelta Training.......")
        optimizer = torch.optim.Adadelta(model.parameters(), lr=args.lr,
                                         weight_decay=args.init_weight_decay)
    # lambda1 = lambda epoch: epoch // 30
    # lambda2 = lambda epoch: 0.99 ** epoch
    # print("lambda1 {} lambda2 {} ".format(lambda1, lambda2))
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=[lambda2])
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)
    # NOTE(review): this scheduler is created but never stepped in this function.
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    steps = 0
    epoch_step = 0
    model_count = 0
    model.train()
    for epoch in range(1, args.epochs+1):
        # (user-facing message is Chinese: "iteration round {} of {} total")
        print("\n## 第{} 轮迭代,共计迭代 {} 次 !##\n".format(epoch, args.epochs))
        # scheduler.step()
        # print("now lr is {} \n".format(scheduler.get_lr()))
        print("now lr is {} \n".format(optimizer.param_groups[0].get("lr")))
        for batch in train_iter:
            feature, target = batch.text, batch.label
            feature.data.t_(), target.data.sub_(1)  # batch first, index align
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()
            optimizer.zero_grad()
            model.zero_grad()
            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            loss.backward()
            if args.init_clip_max_norm is not None:
                # NOTE(review): ``utils.clip_grad_norm`` and ``loss.data[0]``
                # below are pre-PyTorch-0.4 APIs (clip_grad_norm_ / loss.item()
                # in modern torch) — left untouched to preserve behavior.
                utils.clip_grad_norm(model.parameters(),
                                     max_norm=args.init_clip_max_norm)
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                train_size = len(train_iter.dataset)
                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
                accuracy = float(corrects)/batch.batch_size * 100.0
                sys.stdout.write(
                    '\rBatch[{}/{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(
                        steps, train_size, loss.data[0], accuracy, corrects,
                        batch.batch_size))
            if steps % args.test_interval == 0:
                eval(dev_iter, model, args)
            if steps % args.save_interval == 0:
                # Save the full model, reload it, and score it on the test set.
                if not os.path.isdir(args.save_dir):
                    os.makedirs(args.save_dir)
                save_prefix = os.path.join(args.save_dir, 'snapshot')
                save_path = '{}_steps{}.pt'.format(save_prefix, steps)
                torch.save(model, save_path)
                print("\n", save_path, end=" ")
                test_model = torch.load(save_path)
                model_count += 1
                test_eval(test_iter, test_model, save_path, args, model_count)
    return model_count
if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") #dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs model = nn.DataParallel(model) model.load_state_dict(torch.load('plantclef_imagenet_true_6.pth')) num_ftrs = model.module.fc.in_features model.fc = nn.Linear(num_ftrs, len(classes)) model = model.to(device) criterion = nn.CrossEntropyLoss() # Observe that all parameters are being optimized optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum) # Decay LR by a factor of 0.1 every 7 epochs exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=5) if resume: if os.path.isfile(resume): print("=> loading checkpoint '{}'".format(resume)) checkpoint = torch.load(resume) start_epoch = checkpoint['epoch'] best_acc = checkpoint['best_acc'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) train_acc = checkpoint['train_acc'] best_prec1 = checkpoint['best_prec1'] best_prec5 = checkpoint['best_prec5'] lr = checkpoint['lr'] top1_acc = checkpoint['top1_acc'] top5_acc = checkpoint['top5_acc'] train_losses = checkpoint['train_losses']
) proba_t = np.zeros((len(test_data), NUM_CLASSES)) folds = 5 train_data.stratifiedKFold(folds) for fold in range(folds): #划分训练集和验证集 并返回验证集数据 model = Model(num_classes=NUM_CLASSES) save_dir = os.path.join(SAVE_DIR, "flod_{}".format(fold)) agent = Agent(model=model, device_info=DEVICE_INFO, save_dir=save_dir) earlyStopping = None LOSS = {"celoss": CELoss()} OPTIM = Adam(model.parameters(), lr=0.001, weight_decay=0.001) reduceLR = lr_scheduler.ReduceLROnPlateau(OPTIM, mode="max", factor=0.5, patience=8, verbose=True) agent.compile(loss_dict=LOSS, optimizer=OPTIM, metrics=METRICS) agent.summary() valid_X, valid_Y = train_data.get_valid_data(fold) valid_Y = one_hot(valid_Y, NUM_CLASSES) valid_data = [(valid_X[i], valid_Y[i]) for i in range(valid_X.shape[0])] train_generator = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=0) agent.fit_generator(train_generator, epochs=EPOCH, validation_data=valid_data,
def _get_scheduler(self, optimizer, config): return lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, factor=config['factor'], patience=config['patience'])
def train(args):
    """Run a full training experiment: set up logging, data, model, optimizer,
    then train for ``args.epochs`` epochs, evaluating/checkpointing every 5.

    Args:
        args: parsed CLI namespace; reads ``debug``, ``verbosity``, ``gpu``,
            ``load``, ``lr``, ``weight_decay``, ``epochs``, ``randomize_nodes``.

    Returns:
        The trained model.
    """
    global DEBUG
    DEBUG = args.debug
    # get timestamp for model id
    dt = datetime.datetime.now()
    timestamp = '{}-{}/{:02d}-{:02d}-{:02d}'.format(dt.strftime("%b"), dt.day,
                                                    dt.hour, dt.minute, dt.second)
    model_dir = os.path.join(EXP_DIR, timestamp)
    os.makedirs(model_dir)
    # configure logging
    logging.basicConfig(filename=os.path.join(model_dir, 'log.txt'), level=logging.INFO,
                        format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    if args.verbosity >= 1:
        # also mirror DEBUG-level log records to stdout
        root = logging.getLogger()
        root.setLevel(logging.DEBUG)
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
        ch.setFormatter(formatter)
        root.addHandler(ch)
    # set device (if using CUDA)
    seed = 12345
    if torch.cuda.is_available():
        torch.cuda.device(args.gpu)
        torch.cuda.manual_seed(seed)
    else:
        torch.manual_seed(seed)
    # write the args to outfile
    for k, v in sorted(vars(args).items()):
        logging.info('{} : {}\n'.format(k, v))
    # load data
    training_set, validation_set = load_data(args)
    logging.info('Loaded data: {} training examples, {} validation examples\n'.format(
        len(training_set), len(validation_set)))
    # get config
    experiment_config = get_experiment_config(args, training_set, validation_set)
    # initialize model
    if args.load is None:
        logging.info('Initializing model...\n')
        model = experiment_config.model_generator(experiment_config.model_config)
    else:
        # resume from a previously saved checkpoint directory
        logging.info('Loading model from {}\n'.format(args.load))
        model = torch.load(os.path.join(EXP_DIR, args.load, 'model.ckpt'))
    if torch.cuda.is_available():
        training_set.cuda()
        validation_set.cuda()
        model.cuda()
    logging.info(model)
    logging.info('Number of trainable parameters: {}\n'.format(model.number_of_parameters()))
    logging.info('Training loss: {}\n'.format(experiment_config.loss_fn))
    # optimizer
    lr = args.lr
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=args.weight_decay)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               verbose=True, factor=0.5, patience=3,
                                               min_lr=lr/32)
    logging.info(optimizer)
    logging.info(scheduler)
    # Start Training
    for epoch in range(1, args.epochs + 1):
        if args.randomize_nodes:
            training_set.randomize_nodes()
        train_results = train_one_epoch(model, training_set, experiment_config.loss_fn,
                                        optimizer, experiment_config.monitors, args.debug)
        logging.info(results_str(epoch, train_results, 'train'))
        if epoch % 5 == 0:
            # Evaluate, checkpoint and step the plateau scheduler every 5
            # epochs.  NOTE(review): grouping reconstructed from mangled
            # formatting — ``results`` is only defined inside this branch, so
            # ``scheduler.step(results['loss'])`` must live here too.
            results = evaluate_one_epoch(model, validation_set,
                                         experiment_config.loss_fn,
                                         experiment_config.monitors)
            logging.info(results_str(epoch, results, 'eval'))
            torch.save(model, os.path.join(model_dir, 'model.ckpt'))
            logging.info("Saved model to {}\n".format(os.path.join(model_dir, 'model.ckpt')))
            logging.info("Training: processed {:.1f} graphs per second".format(
                len(training_set) / train_results['time']))
            # Capture the scheduler's verbose stdout output into the log file.
            with Capturing() as output:
                scheduler.step(results['loss'])
            if len(output) > 0:
                logging.info(output[0])
    return model
def get_optimizer(args, net):
    """Build the optimizer and LR scheduler selected by ``args``.

    Args:
        args: namespace providing ``optim`` ('sgd'/'adam'), ``lr``,
            ``weight_decay``, ``momentum``, ``lr_schedule`` and the
            schedule-specific fields used below (``step_size``/``gamma``,
            ``milestones``/``last_epoch``, ``num_steps``/``poly_exp``,
            ``T_max``/``min_lr``).
        net: model whose parameters are optimized.

    Returns:
        ``(optimizer, scheduler)`` tuple.

    Raises:
        NotImplementedError: for an unknown optimizer or LR schedule.
    """
    def poly_schd(epoch):
        # Polynomial decay: (1 - t/T) ** poly_exp.
        return math.pow(1 - epoch / args.num_steps, args.poly_exp)

    param_groups = net.parameters()
    if args.optim.lower() == "sgd":
        optimizer = optim.SGD(param_groups,
                              lr=args.lr,
                              weight_decay=args.weight_decay,
                              momentum=args.momentum,
                              nesterov=False)
        logger.info(
            f"[*] Using The SGD Optimizer with lr {args.lr} and weight decay {args.weight_decay} "
            f"and momentum {args.momentum}.")
    elif args.optim.lower() == "adam":
        optimizer = optim.Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay)
        logger.info(
            f"[*] Using The Adam Optimizer with lr {args.lr} and weight decay {args.weight_decay}"
        )
    else:
        raise NotImplementedError
    if args.lr_schedule == "step":
        # step_size 30 gamma 0.2
        scheduler = lr_scheduler.StepLR(optimizer, step_size=args.step_size,
                                        gamma=args.gamma, last_epoch=-1)
        logger.info(
            f"[*] Using `Step` LR Scheduler with step size {args.step_size} and gamma {args.gamma}"
        )
    elif args.lr_schedule == "multi_step":
        if isinstance(args.milestones, str):
            # NOTE(review): eval() on a config string executes arbitrary code
            # if args ever come from untrusted input; ast.literal_eval would
            # be safer for plain list literals — confirm before changing.
            args.milestones = eval(args.milestones)
        scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=args.milestones,
                                             gamma=args.gamma, last_epoch=args.last_epoch)
        logger.info(
            f"[*] Using `Multi Step` LR Scheduler with milestones {args.milestones} and gamma {args.gamma}"
        )
    elif args.lr_schedule == "reduce_lr_on_plateau":
        patience, threshold = 8, 0.0005
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.1,
                                                   patience=patience, threshold=threshold,
                                                   threshold_mode="rel", cooldown=0, min_lr=0)
        # BUG FIX: log the ``patience`` variable instead of a hard-coded 8, so
        # the message stays truthful if the value above is ever changed.
        logger.info(
            f"[*] Using `Reduce Lr On Plateau` LR Scheduler with patience {patience} and threshold {threshold}"
        )
    elif args.lr_schedule == "poly":
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=poly_schd)
        logger.info(f"[*] Using `Poly` LR Scheduler with poly {args.poly_exp}")
    elif args.lr_schedule == "CosineAnnealingLR":
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.T_max,
                                                   eta_min=args.min_lr)
        logger.info(
            f"[*] Using `CosineAnnealingLR` LR Scheduler with T_max: {args.T_max}, eta_min: {args.min_lr}"
        )
    else:
        raise NotImplementedError
    return optimizer, scheduler
def train():
    """Train AlexNet with patience-based early stopping on validation accuracy.

    Keeps a second network (``alex_net_optimal``) holding the best weights
    seen so far; training stops after ``p`` consecutive epochs without a
    validation-accuracy improvement of at least ``min_delta``.

    Uses module-level ``trainloader``/``valloader`` and the module's
    ``test`` helper.

    Returns:
        The best-performing model (``alex_net_optimal``).
    """
    alex_net = AlexNet()
    alex_net = alex_net.cuda()
    # snapshot network that always holds the best weights so far
    alex_net_optimal = AlexNet()
    alex_net_optimal = alex_net_optimal.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(alex_net.parameters(), lr=0.01, weight_decay=0.0005, momentum=0.9)
    scheduler = ls.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)
    # stopping criteria parameters
    wait = 0
    best_acc = 0.0
    min_delta = 1e-3
    p = 10  # patience: epochs without improvement before stopping
    # ``epoch`` is the no-improvement counter (reset on improvement),
    # ``j`` counts total passes over the training data.
    epoch = 0
    j = 0
    train_error = []
    iter_train = []
    while epoch < p:
        j = j + 1
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs
            inputs, labels = data
            # wrap them in Variable
            inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = alex_net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # BUG FIX: loss.data[0] raises on 0-dim tensors (PyTorch >= 0.5);
            # .item() is the supported scalar accessor.
            running_loss += loss.item()
        train_error.append(100 - test(trainloader, alex_net))
        iter_train.append(j)
        # validation pass
        val_loss = 0.0
        correct = 0
        total = 0
        for i, data in enumerate(valloader, 0):
            val_input, val_label = data
            val_input, val_label = Variable(val_input.cuda()), val_label.cuda()
            val_output = alex_net(val_input)
            loss = criterion(val_output, Variable(val_label))
            val_loss += loss.item()  # BUG FIX: was loss.data[0] (see above)
            _, predicted = torch.max(val_output.data, 1)
            total += val_label.size(0)
            # BUG FIX: .item() makes ``correct`` a plain Python int so the
            # accuracy below is computed with exact true division.
            correct += (predicted == val_label).sum().item()
        val_acc = 100 * (correct / total)
        print(
            'Accuracy of the network on the validation set: %.5f %% and validation loss: %.3f'
            % (val_acc, val_loss))
        # plateau scheduler tracks validation *error*
        scheduler.step(100 - val_acc)
        if (val_acc - best_acc) > min_delta:
            best_acc = val_acc
            epoch = 0  # reset the patience counter on improvement
            alex_net_optimal.load_state_dict(alex_net.state_dict())
            # alex_net_optimal = copy.deepcopy(alex_net)
        else:
            epoch = epoch + 1
    # print('Finished Training')
    # BUG FIX: ``print train_error`` was Python-2 print-statement syntax,
    # a SyntaxError under Python 3.
    print(train_error)
    return alex_net_optimal
def main():
    """Main entry point: parse CLI options, build or load the dataset,
    construct the Nixae model, and launch training with Adam +
    ReduceLROnPlateau.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-data', type=str, default='')
    parser.add_argument('-epoch', type=int, default=100)
    parser.add_argument('-batch_size', type=int, default=128)
    parser.add_argument('-dropout', type=float, default=0.5)
    parser.add_argument('-d_model', type=int, default=200)
    parser.add_argument('-brn', type=int, default=3)
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='model/')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    parser.add_argument('--grained', type=int, default=8)  ##### class
    parser.add_argument('-train_src', default='data/udp_any_data_8.txt')
    parser.add_argument('-save_data', default='')
    parser.add_argument('-max_word_seq_len', type=int, default=128)
    parser.add_argument('-min_word_count', type=int, default=0)
    parser.add_argument('-keep_case', action='store_true')
    parser.add_argument('-share_vocab', action='store_true')
    parser.add_argument('-vocab', default=None)
    parser.add_argument('-fold_num', type=int, default=0)  #############fold
    parser.add_argument('-CUDA_VISIBLE_DEVICES', type=str, default='0')
    parser.add_argument('-lr', type=float, default=1e-3)
    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    # opt.cuda = opt.no_cuda
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.CUDA_VISIBLE_DEVICES
    os.makedirs(opt.save_model, exist_ok=True)
    #========= Processing Dataset =========#
    if (opt.data == ''):
        # no preprocessed dataset given: build it from the raw source file
        datamanager = DataManager(opt.train_src, opt.grained, opt.max_word_seq_len,
                                  opt.keep_case, opt.fold_num, opt.min_word_count,
                                  opt.save_data)
        data = datamanager.getdata()
    # #========= Loading Dataset =========#
    else:
        data = torch.load(opt.data)
    print('now seq len:', opt.max_word_seq_len)
    training_data, validation_data, testing_data = prepare_dataloaders(data, opt)
    #========= Preparing Model =========#
    print(opt)
    device = torch.device('cuda' if opt.cuda else 'cpu')
    nixae = Nixae(opt.max_word_seq_len,
                  brn=opt.brn,
                  label=opt.grained,
                  d_model=opt.d_model,
                  dropout=opt.dropout).to(device)
    learningrate = opt.lr
    # optimize only trainable parameters
    optimizer = optim.Adam(filter(lambda x: x.requires_grad, nixae.parameters()),
                           lr=learningrate, weight_decay=0.0003)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1,
                                               patience=20, verbose=True, min_lr=1e-5)
    print(nixae)
    train(nixae, training_data, validation_data, testing_data, optimizer, device, opt,
          scheduler)
def train():
    """Train a fusion head on top of two frozen video streams (MARS RGB + Flow).

    Builds train/val loaders from command-line options, loads two pre-trained
    backbones whose parameters are frozen, and optimizes only ``model_fusion``
    with SGD; the LR is reduced on a plateau of the validation loss.
    """
    opt = parse_opts()
    print(opt)
    opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
    torch.manual_seed(opt.manual_seed)
    print("Preprocessing train data ...")
    # dataset class is looked up by name in module globals, e.g. ``<dataset>_test``
    train_data = globals()['{}_test'.format(opt.dataset)](split=opt.split, train=1, opt=opt)
    print("Length of train data = ", len(train_data))
    print("Preprocessing validation data ...")
    val_data = globals()['{}_test'.format(opt.dataset)](split=opt.split, train=2, opt=opt)
    print("Length of validation data = ", len(val_data))
    if opt.modality=='RGB':
        opt.input_channels = 3
    elif opt.modality=='Flow':
        opt.input_channels = 2
    print("Preparing datatloaders ...")
    train_dataloader = DataLoader(train_data, batch_size = opt.batch_size, shuffle=True,
                                  num_workers = opt.n_workers, pin_memory = True,
                                  drop_last=True)
    val_dataloader = DataLoader(val_data, batch_size = opt.batch_size, shuffle=True,
                                num_workers = opt.n_workers, pin_memory = True,
                                drop_last=True)
    print("Length of train datatloader = ",len(train_dataloader))
    print("Length of validation datatloader = ",len(val_dataloader))
    log_path = os.path.join(opt.result_path, opt.dataset)
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    result_path = "{}/{}/".format(opt.result_path, opt.dataset)
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    if opt.log == 1:
        # CSV-style epoch loggers; filenames encode the full hyperparameter set
        epoch_logger = Logger_MARS(os.path.join(log_path,
            'Fusion_{}_{}_train_batch{}_sample{}_clip{}_lr{}_nesterov{}_manualseed{}_model{}{}_ftbeginidx{}_alpha{}.log'
            .format(opt.dataset, opt.split, opt.batch_size, opt.sample_size,
                    opt.sample_duration, opt.learning_rate, opt.nesterov,
                    opt.manual_seed, opt.model, opt.model_depth,
                    opt.ft_begin_index, opt.MARS_alpha))
            , ['epoch', 'loss', 'acc', 'lr'], opt.MARS_resume_path, opt.begin_epoch)
        val_logger = Logger_MARS(os.path.join(log_path,
            'Fusion_{}_{}_val_batch{}_sample{}_clip{}_lr{}_nesterov{}_manualseed{}_model{}{}_ftbeginidx{}_alpha{}.log'
            .format(opt.dataset, opt.split, opt.batch_size, opt.sample_size,
                    opt.sample_duration, opt.learning_rate, opt.nesterov,
                    opt.manual_seed, opt.model, opt.model_depth,
                    opt.ft_begin_index, opt.MARS_alpha))
            , ['epoch', 'loss', 'acc'], opt.MARS_resume_path, opt.begin_epoch)
    if opt.nesterov:
        dampening = 0
    else:
        dampening = opt.dampening
    # define the model
    print("Loading models... ", opt.model, opt.model_depth)
    model1, parameters1 = generate_model(opt)
    # if testing RGB+Flow streams change input channels
    opt.input_channels = 2
    model2, parameters2 = generate_model(opt)
    model_fusion = new_fusion_model(opt.n_finetune_classes)
    model_fusion = model_fusion.cuda()
    model_fusion = nn.DataParallel(model_fusion)
    if opt.resume_path1:
        print('Loading MARS model {}'.format(opt.resume_path1))
        checkpoint = torch.load(opt.resume_path1)
        assert opt.arch == checkpoint['arch']
        model1.load_state_dict(checkpoint['state_dict'])
    if opt.resume_path2:
        print('Loading Flow model {}'.format(opt.resume_path2))
        checkpoint = torch.load(opt.resume_path2)
        assert opt.arch == checkpoint['arch']
        model2.load_state_dict(checkpoint['state_dict'])
    if opt.resume_path3:
        print('Loading Fusion model {}'.format(opt.resume_path3))
        checkpoint = torch.load(opt.resume_path3)
        assert opt.arch == checkpoint['arch']
        # NOTE(review): this loads the *fusion* checkpoint into ``model2``;
        # ``model_fusion.load_state_dict`` looks intended — confirm upstream.
        model2.load_state_dict(checkpoint['state_dict'])
    model1.eval()
    model2.eval()
    model_fusion.train()
    # freeze both backbone streams; only the fusion head receives gradients
    for p in model1.parameters():
        # if p.requires_grad:
        # print("Need to freeze the parameters")
        p.requires_grad = False
    for p in model2.parameters():
        # if p.requires_grad:
        # print("Need to freeze the parameters..")
        p.requires_grad = False
    print("Initializing the optimizer ...")
    print("lr = {} \t momentum = {} \t dampening = {} \t weight_decay = {}, \t nesterov = {}"
          .format(opt.learning_rate, opt.momentum, dampening, opt.weight_decay,
                  opt.nesterov))
    print("LR patience = ", opt.lr_patience)
    optimizer = optim.SGD(
        model_fusion.parameters(),
        lr=opt.learning_rate,
        momentum=opt.momentum,
        dampening=dampening,
        weight_decay=opt.weight_decay,
        nesterov=opt.nesterov)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=opt.lr_patience)
    criterion = nn.CrossEntropyLoss().cuda()
    print('run')
    for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
        # ---- training pass ----
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        accuracies = AverageMeter()
        weights=AverageMeter()
        end_time = time.time()
        for i, (inputs, targets) in enumerate(train_dataloader):
            data_time.update(time.time() - end_time)
            # first 3 channels are the RGB (MARS) stream, the rest optical flow
            inputs_MARS = inputs[:, 0:3, :, :, :]
            inputs_Flow = inputs[:, 3:, :, :, :]
            targets = targets.cuda(non_blocking=True)
            inputs_MARS = Variable(inputs_MARS)
            inputs_Flow = Variable(inputs_Flow)
            targets = Variable(targets)
            outputs_MARS = model1(inputs_MARS)
            outputs_Flow = model2(inputs_Flow)
            # detach(): no gradient flows back into the frozen backbones
            weight,outputs_var =model_fusion(outputs_MARS.detach(),outputs_Flow.detach())
            loss=criterion(outputs_var,targets)
            acc = calculate_accuracy(outputs_var, targets)
            losses.update(loss.data, inputs.size(0))
            accuracies.update(acc, inputs.size(0))
            weights.update(weight[0][0].data, inputs.size(0))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            batch_time.update(time.time() - end_time)
            end_time = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc {acc.val:.3f} ({acc.avg:.3f})\t'
                  'Weight {weight.val:.3f} ({weight.avg:.3f})'.format(
                      epoch, i + 1, len(train_dataloader), batch_time=batch_time,
                      data_time=data_time, loss=losses, acc=accuracies,
                      weight=weights))
        if opt.log == 1:
            epoch_logger.log({
                'epoch': epoch,
                'loss': losses.avg,
                'acc': accuracies.avg,
                'lr': optimizer.param_groups[0]['lr']
            })
        if epoch % opt.checkpoint == 0:
            # NOTE(review): both branches build the identical path; the
            # pretrain_path check appears vestigial.
            if opt.pretrain_path != '':
                save_file_path = os.path.join(log_path,
                    'Fusion_{}_{}_train_batch{}_sample{}_clip{}_lr{}_nesterov{}_manualseed{}_model{}{}_ftbeginidx{}_alpha{}_{}.pth'
                    .format(opt.dataset, opt.split, opt.batch_size, opt.sample_size,
                            opt.sample_duration, opt.learning_rate, opt.nesterov,
                            opt.manual_seed, opt.model, opt.model_depth,
                            opt.ft_begin_index, opt.MARS_alpha, epoch))
            else:
                save_file_path = os.path.join(log_path,
                    'Fusion_{}_{}_train_batch{}_sample{}_clip{}_lr{}_nesterov{}_manualseed{}_model{}{}_ftbeginidx{}_alpha{}_{}.pth'
                    .format(opt.dataset, opt.split, opt.batch_size, opt.sample_size,
                            opt.sample_duration, opt.learning_rate, opt.nesterov,
                            opt.manual_seed, opt.model, opt.model_depth,
                            opt.ft_begin_index, opt.MARS_alpha, epoch))
            states = {
                'epoch': epoch + 1,
                'arch': opt.arch,
                'state_dict': model_fusion.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(states, save_file_path)
        # ---- validation pass ----
        model_fusion.eval()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        accuracies = AverageMeter()
        end_time = time.time()
        with torch.no_grad():
            for i, (inputs, targets) in enumerate(val_dataloader):
                data_time.update(time.time() - end_time)
                inputs_MARS = inputs[:, 0:3, :, :, :]
                inputs_Flow = inputs[:, 3:, :, :, :]
                targets = targets.cuda(non_blocking=True)
                inputs_MARS = Variable(inputs_MARS)
                inputs_Flow=Variable(inputs_Flow)
                targets = Variable(targets)
                outputs_MARS = model1(inputs_MARS)
                outputs_Flow = model2(inputs_Flow)
                _,outputs_var=model_fusion(outputs_MARS,outputs_Flow)
                loss = criterion(outputs_var, targets)
                acc = calculate_accuracy(outputs_var, targets)
                losses.update(loss.data, inputs.size(0))
                accuracies.update(acc, inputs.size(0))
                batch_time.update(time.time() - end_time)
                end_time = time.time()
                print('Val_Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                          epoch, i + 1, len(val_dataloader), batch_time=batch_time,
                          data_time=data_time, loss=losses, acc=accuracies))
        if opt.log == 1:
            val_logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg})
        # reduce LR when the validation loss plateaus
        scheduler.step(losses.avg)
def train(paths_dict, model, transformation, criterion, device, save_path, opt):
    """Train a segmentation network with full source-domain labels, target
    scribbles, and CRF regularisation, with cross-domain (DA) pairwise loss
    during warm-up.

    Args:
        paths_dict: nested dict {'source'|'target'}{'training'|'validation'}
            of subject path lists.
        model: network returning (outputs, features); trained in place.
        transformation: nested dict of torchio-style transforms per
            domain/phase.
        criterion: supervised segmentation loss (used for source labels and
            scribble loss).
        device: torch device for model and batches.
        save_path: format string with one '{}' slot ('best'/'warmup') for
            checkpoint files.
        opt: options namespace (model_dir, learning_rate, alpha, beta,
            beta_da, weight_crf, warmup, ...).

    Side effects: writes/updates '<opt.model_dir>/log.csv' and checkpoint
    files; resumes automatically if the CSV already exists.
    Relies on module-level globals: batch_size, weight_decay, patience_lr,
    samples_per_volume, nb_voxels, val_eval_criterion_alpha, nb_patience,
    DATA, MODALITIES_SOURCE, MODALITIES_TARGET, scribble_loss, onehot.
    """
    since = time.time()
    dataloaders = dict()
    # Build infinite train/val loaders for both domains with the
    # normalization/augmentation transforms supplied per domain.
    for domain in ['source', 'target']:
        subjects_domain_train = ImagesDataset(
            paths_dict[domain]['training'],
            transform=transformation['training'][domain])
        subjects_domain_val = ImagesDataset(
            paths_dict[domain]['validation'],
            transform=transformation['validation'][domain])
        # Number of workers
        workers = 10
        batch_loader_domain_train = infinite_iterable(
            DataLoader(subjects_domain_train, batch_size=batch_size))
        batch_loader_domain_val = infinite_iterable(
            DataLoader(subjects_domain_val, batch_size=batch_size))
        dataloaders_domain = dict()
        dataloaders_domain['training'] = batch_loader_domain_train
        dataloaders_domain['validation'] = batch_loader_domain_val
        dataloaders[domain] = dataloaders_domain
    # Training state is persisted to CSV so interrupted runs can resume.
    df_path = os.path.join(opt.model_dir, 'log.csv')
    if os.path.isfile(df_path):  # the training already started: resume
        df = pd.read_csv(df_path, index_col=False)
        epoch = df.iloc[-1]['epoch']
        best_epoch = df.iloc[-1]['best_epoch']
        val_eval_criterion_MA = df.iloc[-1]['MA']
        best_val_eval_criterion_MA = df.iloc[-1]['best_MA']
        initial_lr = df.iloc[-1]['lr']
        model.load_state_dict(torch.load(save_path.format('best')))
    else:  # training from scratch
        df = pd.DataFrame(
            columns=['epoch', 'best_epoch', 'MA', 'best_MA', 'lr'])
        val_eval_criterion_MA = None
        best_epoch = 0
        epoch = 0
        initial_lr = opt.learning_rate
    model = model.to(device)
    # Optimisation policy: Adam + plateau LR decay on the validation MA.
    optimizer = torch.optim.Adam(model.parameters(), initial_lr,
                                 weight_decay=weight_decay, amsgrad=True)
    lr_s = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2,
                                          patience=patience_lr, verbose=True,
                                          threshold=1e-3, threshold_mode="abs")
    # Loop parameters: one dummy index per batch so each phase iterates the
    # expected number of batches over the infinite loaders.
    continue_training = True
    ind_batch_train = np.arange(
        0, samples_per_volume * len(paths_dict['source']['training']),
        batch_size)
    ind_batch_val = np.arange(
        0,
        samples_per_volume * max(len(paths_dict['source']['validation']),
                                 len(paths_dict['target']['validation'])),
        batch_size)
    ind_batch = dict()
    ind_batch['training'] = ind_batch_train
    ind_batch['validation'] = ind_batch_val
    # Loss initialisation: within-scan CRF (target) and cross-domain CRF (DA).
    crf_l = CRFLoss(alpha=opt.alpha, beta=opt.beta, is_da=False)
    crf_l_da = CRFLoss(alpha=0, beta=opt.beta_da, is_da=True)
    while continue_training:
        epoch += 1
        print('-' * 10)
        # NOTE(review): prints 'Epoch N/' with no total — the while-loop has
        # no fixed epoch count (early stopping decides).
        print('Epoch {}/'.format(epoch))
        for param_group in optimizer.param_groups:
            print("Current learning rate is: {}".format(param_group['lr']))
        # Each epoch has a training and validation phase
        for phase in ['training', 'validation']:
            print(phase)
            if phase == 'training':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode
            running_loss = 0.0
            running_loss_target = 0.0
            running_loss_source = 0.0
            epoch_samples = 0
            # Iterate over data
            for _ in tqdm(ind_batch[phase]):
                # Next source batch (fully labelled).
                batch_source = next(dataloaders['source'][phase])
                # NOTE(review): torch.cuda.IntTensor cast requires CUDA.
                labels_source = batch_source['label'][DATA].to(device).type(
                    torch.cuda.IntTensor)
                inputs_source = torch.cat(
                    [batch_source[k][DATA] for k in MODALITIES_SOURCE],
                    1).to(device)
                # Next target batch (scribble-annotated only).
                batch_target = next(dataloaders['target'][phase])
                scribbles_target = batch_target['scribble'][DATA].to(device)
                inputs_target = torch.cat(
                    [batch_target[k][DATA] for k in MODALITIES_TARGET],
                    1).to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # Track gradient history only during training.
                with torch.set_grad_enabled(phase == 'training'):
                    # Source and target are forwarded as one concatenated
                    # batch, then split back along dim 0.
                    outputs, features = model(
                        torch.cat([inputs_source, inputs_target], 0), 'source')
                    outputs_source, features_source = outputs[:batch_size, ...], features[:batch_size, ...]
                    outputs_target, features_target = outputs[batch_size:, ...], features[batch_size:, ...]
                    # Loss Source with full Labels
                    loss_source = criterion(outputs_source, labels_source)
                    # Loss Target on Scribbles
                    loss_target = scribble_loss(outputs_target, scribbles_target, criterion)
                    # Within-scan regularisation (target only, training only).
                    if (opt.beta > 0 or opt.alpha > 0) and phase == 'training':
                        reg_target = opt.weight_crf / nb_voxels[
                            'target'] * crf_l(inputs_target, outputs_target)
                    else:
                        reg_target = 0.0
                    # Pairwise-scan (DA) regularisation — only active while
                    # epoch < opt.warmup, on a random pair of feature channels.
                    if opt.beta_da > 0 and phase == 'training' and opt.warmup > epoch:
                        index = torch.LongTensor(2).random_(
                            0, features_source.shape[1])
                        features_crf = [
                            features_source[:, index, ...],
                            features_target[:, index, ...]
                        ]
                        features_crf = torch.cat(features_crf, 0).detach().cuda()
                        # Source uses one-hot ground truth; target uses the
                        # current softmax prediction.
                        prob = [
                            onehot(labels_source, outputs_source.shape),
                            torch.nn.Softmax(1)(outputs_target)
                        ]
                        prob = torch.cat(prob, 0)
                        reg_da = opt.weight_crf / nb_voxels[
                            'target'] * crf_l_da(I=features_crf, U=prob)
                    else:
                        reg_da = 0.0
                    # Regularisers are included in the loss only in training.
                    if phase == 'training':
                        loss = loss_source + loss_target + reg_target + reg_da
                    else:
                        loss = loss_source + loss_target
                    # backward + optimize only if in training phase
                    if phase == 'training':
                        loss.backward()
                        optimizer.step()
                # statistics
                epoch_samples += 1
                running_loss += loss.item()
                running_loss_source += loss_source.item()
                running_loss_target += loss_target.item()
            epoch_loss = running_loss / epoch_samples
            epoch_loss_source = running_loss_source / epoch_samples
            epoch_loss_target = running_loss_target / epoch_samples
            print('{} Loss Seg Source: {:.4f}'.format(phase, epoch_loss_source))
            print('{} Loss Seg Target: {:.4f}'.format(phase, epoch_loss_target))
            if phase == 'validation':
                # Exponential moving average of the validation loss drives
                # both LR scheduling and early stopping.
                if val_eval_criterion_MA is None:  # first iteration
                    val_eval_criterion_MA = epoch_loss
                    best_val_eval_criterion_MA = val_eval_criterion_MA
                else:  # update moving-average criterion
                    val_eval_criterion_MA = val_eval_criterion_alpha * val_eval_criterion_MA + (
                        1 - val_eval_criterion_alpha) * epoch_loss
                # NOTE(review): param_group here is the loop variable left
                # over from the LR print loop above (last group's lr).
                df = df.append(
                    {
                        'epoch': epoch,
                        'best_epoch': best_epoch,
                        'MA': val_eval_criterion_MA,
                        'best_MA': best_val_eval_criterion_MA,
                        'lr': param_group['lr']
                    },
                    ignore_index=True)
                df.to_csv(df_path, index=False)
                lr_s.step(val_eval_criterion_MA)
                if val_eval_criterion_MA < best_val_eval_criterion_MA:
                    best_val_eval_criterion_MA = val_eval_criterion_MA
                    best_epoch = epoch
                    torch.save(model.state_dict(), save_path.format('best'))
                else:
                    # Early stop after nb_patience epochs without improvement.
                    if epoch - best_epoch > nb_patience:
                        continue_training = False
                if epoch == opt.warmup:
                    torch.save(model.state_dict(), save_path.format('warmup'))
    time_elapsed = time.time() - since
    print('Training completed in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best epoch is {}'.format(best_epoch))
def train_reinforcement(grammar=True,
                        model=None,
                        EPOCHS=None,
                        BATCH_SIZE=None,
                        lr=2e-4,
                        main_dataset=None,
                        new_datasets=None,
                        plot_ignore_initial=0,
                        save_file=None,
                        plot_prefix='',
                        dashboard='main',
                        preload_weights=False):
    """Fine-tune `model` with reinforcement data sampled from `new_datasets`.

    Args:
        grammar: passed to get_settings() to pick the grammar-based config.
        model: model exposing .load(path) and .parameters(); trained in place.
        EPOCHS / BATCH_SIZE: optional overrides for the settings defaults.
        lr: Adam learning rate.
        main_dataset: unused here (kept for interface compatibility).
        new_datasets: iterable of datasets wrapped by SamplingWrapper.
        plot_ignore_initial, plot_prefix, dashboard: plotting/monitoring opts.
        save_file: checkpoint filename under '<root>/pretrained/' (or None).
        preload_weights: if True, best-effort load of existing weights.

    Returns:
        The fitter generator produced by fit().
    """
    root_location = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    root_location = root_location + '/../'
    if save_file is not None:
        save_path = root_location + 'pretrained/' + save_file
    else:
        save_path = None
    molecules = True  # checking for validity only makes sense for molecules
    settings = get_settings(molecules=molecules, grammar=grammar)
    # TODO: separate settings for this?
    if EPOCHS is not None:
        settings['EPOCHS'] = EPOCHS
    if BATCH_SIZE is not None:
        settings['BATCH_SIZE'] = BATCH_SIZE
    else:
        # BUG FIX: BATCH_SIZE is used below to build the loaders; previously a
        # None default crashed inside get_train_valid_loaders. Fall back to
        # the configured default.
        BATCH_SIZE = settings['BATCH_SIZE']
    if preload_weights:
        try:
            model.load(save_path)
        # Best-effort preload kept, but no longer a bare `except:` that would
        # also swallow KeyboardInterrupt/SystemExit.
        except Exception:
            pass
    # Only optimise parameters that require gradients.
    nice_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(nice_params, lr=lr)
    # Build one train/valid loader pair per dataset and combine them.
    train_l = []
    valid_l = []
    for ds in new_datasets:
        train_loader, valid_loader = SamplingWrapper(ds)\
            .get_train_valid_loaders(BATCH_SIZE,
                                     valid_batch_size=1 + int(BATCH_SIZE / 5),
                                     dataset_name=['actions', 'seq_len', 'valid', 'sample_seq_ind'],
                                     window=1000)
        train_l.append(train_loader)
        valid_l.append(valid_loader)
    train_gen = CombinedLoader(train_l, num_batches=90)
    valid_gen = CombinedLoader(valid_l, num_batches=10)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               factor=0.2,
                                               patience=3,
                                               min_lr=0.0001,
                                               eps=1e-08)
    loss_obj = ReinforcementLoss()
    fitter = fit(train_gen=train_gen,
                 valid_gen=valid_gen,
                 model=model,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 epochs=settings['EPOCHS'],
                 loss_fn=loss_obj,
                 save_path=save_path,
                 save_always=True,
                 dashboard_name=dashboard,
                 plot_ignore_initial=plot_ignore_initial,
                 plot_prefix=plot_prefix,
                 loss_display_cap=200)
    return fitter
def __init__(self, model, train_loader, val_loader, optimizer,
             log_dir="./cache/logs/", log_level=logging.INFO,
             checkpoint_dir="./cache/model_cache/", echo=False,
             device="cuda:0", use_tensorboard=False, use_amp=False,
             seed=12321, n_gpus=1, patience=20):
    """Initialise the training bot.

    Args:
        model: the network to train (moved to `device`, wrapped in
            DataParallel when n_gpus > 1).
        train_loader / val_loader: data loaders for training / validation.
        optimizer: pre-built optimizer; its first param group's lr is cached.
        log_dir / checkpoint_dir: created if missing.
        log_level: level for the root logger setup below.
        echo / use_tensorboard: logging toggles (stored, used elsewhere).
        device: torch device string.
        use_amp: enable NVIDIA apex O1 mixed precision (requires apex).
        seed: RNG seed, applied via self.set_seed().
        n_gpus: number of GPUs; >1 enables DataParallel.
        patience: early-stopping patience; plateau LR patience is half this.

    Raises:
        ImportError: if use_amp is True but apex is not installed.
    """
    super(BaseBot, self).__init__()
    self.criterion = torch.nn.CrossEntropyLoss()
    self.model = model
    self.train_loader = train_loader
    self.val_loader = val_loader
    self.patience = patience
    self.optimizer = optimizer
    # Cache the initial learning rate of the first param group.
    self.lr = self.optimizer.param_groups[0]['lr']
    self.log_dir = log_dir
    self.log_level = log_level
    self.checkpoint_dir = checkpoint_dir
    self.checkpoint_path = os.path.join(self.checkpoint_dir, "checkpoint.pt")
    self.echo = echo
    self.device = device
    self.use_tensorboard = use_tensorboard
    self.use_amp = use_amp
    self.seed = seed
    self.n_gpus = n_gpus
    self.step = 0
    self.gradient_accumulation_steps = 1
    self.clip_grad = 0
    self.batch_dim = 0
    self.y_task = 2
    ###########################################################
    # Reduce LR on plateau with half the early-stopping patience.
    self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                    factor=0.1,
                                                    patience=int(self.patience / 2),
                                                    verbose=True)
    ###########################################################
    # Ensure log/checkpoint directories exist. exist_ok=True makes this
    # idempotent; OSError still fires (and is reported) if the path exists
    # as a file or cannot be created — same observable outcome as before,
    # without a bare `except:` or a racy exists()-then-create check.
    for path in [self.log_dir, self.checkpoint_dir]:
        try:
            os.makedirs(path, exist_ok=True)
        except OSError:
            print(f"make {path} failed!")
    ###########################################################
    if self.use_amp:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        self.model, self.optimizer = amp.initialize(self.model,
                                                    self.optimizer,
                                                    opt_level="O1")
    if self.n_gpus > 1:
        self.model = torch.nn.DataParallel(self.model)
    self.model.to(self.device)
    ###########################################################
    # Plain root logger (the project Logger wrapper is intentionally
    # bypassed here).
    self.logger = logging.getLogger()
    self.logger.setLevel(logging.INFO)
    self.logger.info("SEED: %s", self.seed)
    ###########################################################
    self.count_model_parameters()
    ###########################################################
    self.set_seed(self.seed)
def train_network():
    """Train an RNN on the configured dataset, validating each epoch, logging
    to TensorBoard, and saving the best (lowest dev-loss) model.

    Reads all configuration from the module-level `args`; uses the
    module-level `device`. Returns the total wall-clock time of the run.
    """
    print('')
    print('')
    # Start measuring time - to evaluate performance of the training function
    start = timeit.default_timer()
    # Set seeds
    set_seed(args)
    # Make folders if not yet exist
    try:
        os.makedirs('save')
    except FileExistsError:
        pass
    # Save relevant arguments from args and set hardcoded arguments
    lr = args.lr  # learning rate
    batch_size = args.batch_size  # Mini-batch size
    num_epochs = args.num_epochs  # Number of epochs to train the network
    seq_len = args.seq_len
    # Network architecture:
    rnn_name = args.rnn_name
    inputs_list = args.inputs_list
    outputs_list = args.outputs_list
    load_rnn = args.load_rnn  # If specified this is the name of pretrained RNN which should be loaded
    path_save = args.path_save
    # Create rnn instance and update lists of input, outputs and its name
    # (if pretrained net loaded)
    net, rnn_name, inputs_list, outputs_list \
        = create_rnn_instance(rnn_name, inputs_list, outputs_list, load_rnn, path_save, device)
    # Create log for this RNN and determine its full name
    rnn_full_name = create_log_file(rnn_name, inputs_list, outputs_list, path_save)
    net.rnn_full_name = rnn_full_name
    ########################################################
    # Create Dataset
    ########################################################
    train_dfs, _ = load_data(args, args.train_file_name)
    # Normalisation statistics are computed on the training data only and
    # applied to both splits.
    normalization_info = calculate_normalization_info(train_dfs, args.path_save, rnn_full_name)
    test_dfs, time_axes_dev = load_data(args, args.val_file_name)
    train_dfs_norm = normalize_df(train_dfs, normalization_info)
    test_dfs_norm = normalize_df(test_dfs, normalization_info)
    del train_dfs, test_dfs  # free the unnormalised frames
    train_set = Dataset(train_dfs_norm, args)
    dev_set = Dataset(test_dfs_norm, args, time_axes=time_axes_dev)
    print('Number of samples in training set: {}'.format(train_set.number_of_samples))
    print('The training sets sizes are: {}'.format(train_set.df_lengths))
    print('Number of samples in validation set: {}'.format(dev_set.number_of_samples))
    print('')
    # Baseline plot before any training.
    plot_results(net=net, args=args,
                 dataset=dev_set,
                 seq_len=1024,
                 comment='This is the network at the beginning of the training',
                 inputs_list=inputs_list, outputs_list=outputs_list,
                 save=True,
                 closed_loop_enabled=True)
    # Create PyTorch dataloaders for train and dev set
    train_generator = data.DataLoader(dataset=train_set, batch_size=batch_size,
                                      shuffle=True, num_workers=args.num_workers)
    dev_generator = data.DataLoader(dataset=dev_set, batch_size=512,
                                    shuffle=False, num_workers=args.num_workers)
    # Print parameter count
    print_parameter_count(net)  # Seems not to function well
    # Select Optimizer
    optimizer = optim.Adam(net.parameters(), amsgrad=True, lr=lr)
    # TODO: Verify if scheduler is working. Try tweaking parameters of below
    # scheduler and try cyclic lr scheduler
    # scheduler = lr_scheduler.CyclicLR(optimizer, base_lr=lr, max_lr=0.1)
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.5)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1, verbose=True)
    # Select Loss Function
    criterion = nn.MSELoss()  # Mean square error loss function
    '''
    Init Tensorboard
    '''
    comment = f' batch_size={batch_size} lr={lr} seq_len={seq_len}'
    tb = SummaryWriter(comment=comment)
    ########################################################
    # Training
    ########################################################
    print("Starting training...")
    print('')
    time.sleep(0.001)
    # Create dictionary to store training history
    # NOTE(review): dict_history['time'] and ['test_loss'] are created but
    # never appended to below.
    dict_history = {}
    dict_history['epoch'] = []
    dict_history['time'] = []
    dict_history['lr'] = []
    dict_history['train_loss'] = []
    dict_history['dev_loss'] = []
    dict_history['dev_gain'] = []
    dict_history['test_loss'] = []
    dev_gain = 1
    # The epoch_saved variable will indicate from which epoch is the last RNN
    # model, which was good enough to be saved
    epoch_saved = -1
    for epoch in range(num_epochs):
        ###########################################################################################################
        # Training - Iterate batches
        ###########################################################################################################
        # Set RNN in training mode
        net = net.train()
        # Define variables accumulating training loss and counting training batches
        train_loss = 0
        train_batches = 0
        # Iterate training over available batches
        # tqdm() is just a function which displays the progress bar
        # Otherwise the line below is the same as "for batch, labels in train_generator:"
        for batch, labels in tqdm(train_generator):  # Iterate through batches
            # Reset the network (internal states of hidden layers and output
            # history, not the weights!)
            net.reset()
            # Further modifying the input and output form to fit RNN requirements
            # If GPU available we send tensors to GPU (cuda)
            if torch.cuda.is_available():
                batch = batch.float().cuda().transpose(0, 1)
                labels = labels.float().cuda()
            else:
                batch = batch.float().transpose(0, 1)
                labels = labels.float()
            # # Reset memory of gradients
            # optimizer.zero_grad()
            # Warm-up (open loop prediction) to settle the internal state of
            # RNN hidden layers before the scored prediction window.
            net(rnn_input=batch[:args.warm_up_len, :, :])
            # Reset memory of gradients (after warm-up, so warm-up does not
            # contribute gradients)
            optimizer.zero_grad()
            # Forward propagation - These are the results from which we
            # calculate the update to RNN weights
            # GRU Input size must be (seq_len, batch, input_size)
            net(rnn_input=batch[args.warm_up_len:, :, :])
            out = net.return_outputs_history()
            # Get loss over the post-warm-up window only.
            loss = criterion(out[:, args.warm_up_len:, :],
                             labels[:, args.warm_up_len:, :])
            # Backward propagation
            loss.backward()
            # Gradient clipping - prevent gradient from exploding
            torch.nn.utils.clip_grad_norm_(net.parameters(), 100)
            # Update parameters
            optimizer.step()
            # scheduler.step()
            # Update variables for loss calculation
            batch_loss = loss.detach()
            train_loss += batch_loss  # Accumulate loss
            train_batches += 1  # Accumulate count so we can calculate mean later
        ###########################################################################################################
        # Validation - Iterate batches
        ###########################################################################################################
        # Set the network in evaluation mode
        net = net.eval()
        # Define variables accumulating evaluation loss and counting evaluation batches
        dev_loss = 0
        dev_batches = 0
        for (batch, labels) in tqdm(dev_generator):
            # Reset the network (internal states of hidden layers and output
            # history, not the weights!)
            net.reset()
            # Further modifying the input and output form to fit RNN requirements
            # If GPU available we send tensors to GPU (cuda)
            if torch.cuda.is_available():
                batch = batch.float().cuda().transpose(0, 1)
                labels = labels.float().cuda()
            else:
                batch = batch.float().transpose(0, 1)
                labels = labels.float()
            # Warm-up (open loop prediction) to settle the internal state of
            # RNN hidden layers
            net(rnn_input=batch)
            out = net.return_outputs_history()
            # Get loss
            # For evaluation we always calculate loss over the whole maximal
            # prediction period. This allows us to compare RNN models from
            # different epochs.
            loss = criterion(out[:, args.warm_up_len: args.seq_len],
                             labels[:, args.warm_up_len: args.seq_len])
            # Update variables for loss calculation
            batch_loss = loss.detach()
            dev_loss += batch_loss  # Accumulate loss
            dev_batches += 1  # Accumulate count so we can calculate mean later
        # Reset the network (internal states of hidden layers and output
        # history, not the weights!)
        net.reset()
        # Get current learning rate
        # TODO(Fixed. It does changes now): I think now the learning rate do
        # not change during training, or it is not a right way to get this info.
        for param_group in optimizer.param_groups:
            lr_curr = param_group['lr']
        # NOTE(review): steps the plateau scheduler on the *summed* dev loss,
        # not the mean; monotone in the mean for a fixed batch count, but
        # confirm this is intended.
        scheduler.step(dev_loss)
        '''
        Add data for tensorboard
        TODO : Add network graph and I/O to tensorboard
        '''
        # tb.add_graph(net)
        tb.add_scalar('Train Loss', train_loss / train_batches, epoch)
        tb.add_scalar('Dev Loss', dev_loss / dev_batches, epoch)
        # Add the first sample of batch to tensorboard. Prediction is
        # represented by dotted line.
        # TODO: Concatenate such graphs. But they are not continuous.
        # for i in range(labels.shape[2]):
        #     time_label = np.arange(0, labels.shape[1], 1)
        #     time_out = np.arange(0, out.shape[1], 1)
        #     true_data = labels[1, :, i]
        #     predicted_data = out[1, :, i]
        #     fig_tb = plt.figure(5)
        #     plt.plot(time_label, true_data.detach().cpu())
        #     plt.plot(time_out, predicted_data.detach().cpu(), linestyle='dashed')
        #     tb.add_figure(tag=str(a.outputs_list[i]), figure=fig_tb, global_step=epoch)
        for name, param in net.named_parameters():
            tb.add_histogram(name, param, epoch)
            tb.add_histogram(f'{name}.grad', param.grad, epoch)
        # NOTE(review): closing the SummaryWriter inside the epoch loop means
        # it is closed after the first epoch — confirm whether this should be
        # after the loop instead.
        tb.close()
        # Write the summary information about the training for the just
        # completed epoch to a dictionary
        dict_history['epoch'].append(epoch)
        dict_history['lr'].append(lr_curr)
        # Losses are normalised per batch and per scored time step.
        dict_history['train_loss'].append(
            train_loss.detach().cpu().numpy() / train_batches / (args.seq_len - args.warm_up_len))
        dict_history['dev_loss'].append(
            dev_loss.detach().cpu().numpy() / dev_batches / (args.seq_len - args.warm_up_len))
        # Get relative loss gain for network evaluation
        if epoch >= 1:
            dev_gain = (dict_history['dev_loss'][epoch - 1] - dict_history['dev_loss'][epoch]) / \
                dict_history['dev_loss'][epoch - 1]
        dict_history['dev_gain'].append(dev_gain)
        # Print the summary information about the training for the just
        # completed epoch
        print('\nEpoch: %3d of %3d | '
              'LR: %1.5f | '
              'Train-L: %6.4f | '
              'Val-L: %6.4f | '
              'Val-Gain: %3.2f |' % (dict_history['epoch'][epoch],
                                     num_epochs - 1,
                                     dict_history['lr'][epoch],
                                     dict_history['train_loss'][epoch],
                                     dict_history['dev_loss'][epoch],
                                     dict_history['dev_gain'][epoch] * 100))
        print('')
        # Save the best model with the lowest dev loss
        # Always save the model from epoch 0
        # TODO: this is a bug: you should only save the model from epoch 0 if
        # there is no pretrained network
        if epoch == 0:
            min_dev_loss = dev_loss
        # If current loss smaller or equal than minimal till now achieved
        # loss, save the current RNN model and save its loss as minimal ever
        # achieved
        if dev_loss <= min_dev_loss:
            epoch_saved = epoch
            min_dev_loss = dev_loss
            torch.save(net.state_dict(), args.path_save + rnn_full_name + '.pt',
                       _use_new_zipfile_serialization=False)
            print('>>> saving best model from epoch {}'.format(epoch))
            print('')
            plot_string = 'This is the network after {} training epoch'.format(epoch + 1)
            plot_results(net=net, args=args,
                         dataset=dev_set,
                         seq_len=1024,
                         comment=plot_string,
                         inputs_list=inputs_list, outputs_list=outputs_list,
                         save=True,
                         closed_loop_enabled=True)
        else:
            print('>>> We keep model from epoch {}'.format(epoch_saved))
            print('')
        # Evaluate the performance of the current network
        # by checking its predictions on a randomly generated CartPole experiment
        # open_loop_prediction_experiment(net, a, val_file)
    # When finished the training print the final message
    print("Training Completed...                                               ")
    print(" ")
    # Calculate the total time it took to run the function
    stop = timeit.default_timer()
    total_time = stop - start
    # Return the total time it took to run the function
    return total_time
def main():
    """Train MultiModalNet for one fold: build datasets/loaders, run the
    train/val epoch loop with checkpointing on best acc/loss/f1, then reload
    the best-loss checkpoint and evaluate on the test set.

    Uses module-level `config`, `device`, `log`, and the `train`/`evaluate`/
    `evaluation`/`save_checkpoint` helpers.
    """
    fold = 0
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep + str(fold)):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists("./logs/"):
        os.mkdir("./logs/")
    # 4.2 get model (image backbone, visit backbone, dropout)
    model = MultiModalNet("se_resnext50_32x4d", "dpn26", 0.5)  # se_resnext101_32x4d
    # 4.3 optim & criterion
    optimizer = optim.SGD(model.parameters(), lr=config.lr, momentum=0.9, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss().to(device)
    start_epoch = 0
    best_acc = 0
    best_loss = np.inf
    best_f1 = 0
    best_results = [0, np.inf, 0]  # [best_acc, best_loss, best_f1]
    val_metrics = [0, np.inf, 0]
    resume = False
    if resume:
        # Resume from the best-loss checkpoint of fold 0.
        checkpoint_path = r'./checkpoints/best_models/multimodal_fold_0_model_best_loss.pth.tar'
        if not os.path.isfile(checkpoint_path):
            raise RuntimeError("=> no checkpoint found at '{}'".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path, map_location=device)
        best_acc = checkpoint['best_acc']
        best_loss = checkpoint['best_loss']
        best_f1 = checkpoint['best_f1']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
    muti_gpu = False
    if torch.cuda.device_count() > 1 and muti_gpu == True:
        model = nn.DataParallel(model)
    model.to(device)
    all_files = pd.read_csv("/data/BaiDuBigData19-URFC/data/train_oversampling.csv")
    test_files = pd.read_csv("/data/BaiDuBigData19-URFC/data/test.csv")
    train_data_list, val_data_list = train_test_split(all_files, test_size=0.1, random_state=2050)
    # load dataset (num_workers is limited by shared memory in Docker!)
    train_gen = MultiModalDataset(train_data_list, config.train_data, config.train_vis, mode="train")
    train_loader = DataLoader(train_gen, batch_size=config.batch_size, shuffle=True, pin_memory=True, num_workers=16)
    val_gen = MultiModalDataset(val_data_list, config.train_data, config.train_vis, augument=False, mode="train")
    val_loader = DataLoader(val_gen, batch_size=config.batch_size, shuffle=False, pin_memory=True, num_workers=16)
    test_gen = MultiModalDataset(test_files, config.test_data, config.test_vis, augument=False, mode="test")
    test_loader = DataLoader(test_gen, 1, shuffle=False, pin_memory=True, num_workers=16)
    # Default mode is "min", so we monitor the validation loss.
    # (Use mode="max" if monitoring best_acc instead of best_loss.)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)
    start = timer()
    # train
    for epoch in range(0, config.epochs):
        # train
        train_metrics = train(train_loader, model, criterion, optimizer, epoch, val_metrics, best_results, start)
        # val
        val_metrics = evaluate(val_loader, model, criterion, epoch, train_metrics, best_results, start)
        # BUG FIX: previously scheduler.step(epoch) was called at the top of
        # the loop. ReduceLROnPlateau.step() expects the monitored *metric*,
        # not the epoch index — passing the (monotonically increasing) epoch
        # made the scheduler decay the LR on a fixed cadence regardless of
        # progress. Step on the validation loss after each evaluation.
        scheduler.step(val_metrics[1])
        # check results
        is_best_acc = val_metrics[0] > best_results[0]
        best_results[0] = max(val_metrics[0], best_results[0])
        is_best_loss = val_metrics[1] < best_results[1]
        best_results[1] = min(val_metrics[1], best_results[1])
        is_best_f1 = val_metrics[2] > best_results[2]
        best_results[2] = max(val_metrics[2], best_results[2])
        # save model
        save_checkpoint({
            "epoch": epoch + 1,
            "model_name": config.model_name,
            "state_dict": model.state_dict(),
            "best_acc": best_results[0],
            "best_loss": best_results[1],
            "optimizer": optimizer.state_dict(),
            "fold": fold,
            "best_f1": best_results[2],
        }, is_best_acc, is_best_loss, is_best_f1, fold)
        # print logs
        print('\r', end='', flush=True)
        log.write('%s %5.1f %6.1f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s %s %s | %s' % (
            "best", epoch, epoch,
            train_metrics[0], train_metrics[1], train_metrics[2],
            val_metrics[0], val_metrics[1], val_metrics[2],
            str(best_results[0])[:8], str(best_results[1])[:8], str(best_results[2])[:8],
            time_to_str((timer() - start), 'min'))
        )
        log.write("\n")
        time.sleep(0.01)
    # Reload the best-loss checkpoint and run the final test-set evaluation.
    best_model = torch.load("%s/%s_fold_%s_model_best_loss.pth.tar" % (config.best_models, config.model_name, str(fold)))
    model.load_state_dict(best_model["state_dict"])
    evaluation(test_loader, model, fold)
def do_test():
    """Train the Basic_DRC super-resolution model on BSDS200 tiles and, every
    10 epochs, evaluate on BSDS100 — logging loss/PSNR/SSIM curves and a
    sample (label / input / output) image triplet to `out_base`.
    """
    device = "cuda:0"  # set device
    sr_destination_base = 'Z:/SuperResolution/Labeled_Tiled_Datasets_Fix/BSDS200\Scale_3/'
    test_base_200 = 'Z:/SuperResolution/Labeled_Tiled_Datasets_Fix/BSDS100\Scale_3/'
    out_base = 'Z:\SuperResolution\Outputs\DRCNN_Baisc\\'
    checkFolder(out_base)
    # training online for a whole day
    batch_size = 32
    epochs = 500
    momentum = 0.9
    decay = 0.0001
    workers = 4
    sr_dataset = ImageLabelDataset(sr_destination_base, transform=transforms.ToTensor(), resize=False)
    sr_dataloader = DataLoader(sr_dataset, batch_size=batch_size, shuffle=True,
                               num_workers=workers, drop_last=True)
    test_dataset_200 = ImageLabelDataset(test_base_200, transform=transforms.ToTensor(), resize=False)
    test_dataloader_200 = DataLoader(test_dataset_200, batch_size=batch_size, shuffle=False,
                                     num_workers=workers, drop_last=True)
    model = Basic_DRC(n_recursions=8, n_channels=3)
    model = model.to(device)
    mse = nn.MSELoss()
    # SGD optimizer where each layer has their own weights
    opt = torch.optim.SGD(params=[
        {
            'params': model.parameters(),
            'lr': 0.01
        },
    ], momentum=momentum, weight_decay=decay)
    # NOTE(review): min_lr=10e-6 is 1e-5; if 1e-6 was intended, adjust.
    sched = lr_scheduler.ReduceLROnPlateau(opt, 'min', factor=0.001, patience=5, min_lr=10e-6)
    avg_loss = 0
    avg_test_loss = 0
    train_loss_list = []
    test_loss_list = []
    test_psnr_list = []
    test_ssim_list = []
    # Sample images captured during testing for visual inspection; None until
    # a test pass has produced them.
    t_o = t_y = t_x = None
    for e in range(epochs):
        # Hoisted out of the batch loop — was re-set on every batch.
        model.train()
        for i, sample in tqdm(enumerate(sr_dataloader, 0), total=len(sr_dataloader)):
            x = sample['input'].to(device)
            y = sample['label'].to(device)
            opt.zero_grad()
            out = model(x)
            loss = mse(out, y).to(device)
            avg_loss += loss.item()
            loss.backward()
            opt.step()
        epoch_train_loss = avg_loss / len(sr_dataloader)
        train_loss_list.append(epoch_train_loss)
        print("Train Loss: " + str(epoch_train_loss))
        avg_loss = 0
        avg_psnr = 0
        avg_ssim = 0
        force_test = False
        if e % 10 == 0 or force_test:
            with torch.no_grad():
                print("Testing Epoch: " + str(e))
                # Hoisted out of the loop; no opt.zero_grad() needed under
                # torch.no_grad() (no gradients are accumulated here).
                model.eval()
                for i, sample in tqdm(enumerate(test_dataloader_200, 0),
                                      total=len(test_dataloader_200)):
                    x = sample['input'].to(device)
                    y = sample['label'].to(device)
                    out = model(x)
                    test_loss = mse(out, y).to(device)
                    if out.dtype != y.dtype:
                        print("Dtype mixmatch")
                    if out.shape != y.shape:
                        print("shape mismatch")
                    avg_test_loss += test_loss.item()
                    avg_ssim += ssim(y.permute(0, 2, 3, 1).detach().cpu().numpy(),
                                     out.permute(0, 2, 3, 1).detach().cpu().numpy(),
                                     multichannel=True)
                    avg_psnr += psnr(y.detach().cpu().numpy(),
                                     out.detach().cpu().numpy())
                    if i == 50:
                        t_o = out[0].permute(1, 2, 0).detach().cpu().numpy()
                        t_y = y[0].permute(1, 2, 0).detach().cpu().numpy()
                        t_x = x[0].permute(1, 2, 0).detach().cpu().numpy()
                epoch_test_loss = avg_test_loss / len(test_dataloader_200)
                avg_ssim /= len(test_dataloader_200)
                avg_psnr /= len(test_dataloader_200)
                # BUG FIX: the scheduler used to be stepped once per *batch*
                # with the raw batch loss. ReduceLROnPlateau expects one step
                # per evaluation with the monitored metric; step on the mean
                # test loss instead.
                sched.step(epoch_test_loss)
                test_loss_list.append(epoch_test_loss)
                test_psnr_list.append(avg_psnr)
                test_ssim_list.append(avg_ssim)
                print("Test Loss: " + str(epoch_test_loss))
                print("Avg SSIM: " + str(avg_ssim))
                print("Avg PSNR: " + str(avg_psnr))
                avg_test_loss = 0
            # Save the captured sample triplet, guarded in case the test set
            # has fewer than 51 batches (t_* would otherwise be unbound).
            if t_y is not None:
                fig, ax = plt.subplots(3)
                ax[0].imshow(t_y)
                ax[1].imshow(t_x)
                ax[2].imshow(t_o)
                nb_out = len(os.listdir(out_base))
                fig.savefig(out_base + str(nb_out) + '.png', dpi=800)
            # Refresh the metric-curve figure after each test pass
            # (overwrites the same file, giving a live-updating plot).
            fig_l, ax_l = plt.subplots(4)
            ax_l[0].plot(train_loss_list, color='blue')
            ax_l[0].set_title("Train Loss")
            ax_l[1].plot(test_loss_list, color='red')
            ax_l[1].set_title("Test Loss")
            ax_l[2].plot(test_psnr_list)
            ax_l[2].set_title("Test Avg PSNR")
            ax_l[3].plot(test_ssim_list)
            ax_l[3].set_title("Test Avg SSIM")
            fig_l.tight_layout()
            fig_l.savefig(out_base + "test_metrics" + '.png', dpi=800)
def train(clean_dir, adv_dir, attack_type):
    '''
    Train a VAE to reconstruct clean images from adversarially attacked ones,
    saving the best (lowest val total-loss) weights and the loss history.

    clean_dir: (str) path of the root folder of all clean images
    adv_dir: (str) path to the root folder of all attacked images
    attack_type: (str) type of attack name
    '''
    # Ignore all warnings
    import warnings
    warnings.filterwarnings("ignore")
    # Setup model hyper-params
    z_size = 2048
    hidden_dim = 64
    drop_p = 0.5
    image_size = 224
    channel_num = 3
    is_res = True
    # Set up training hyper-params
    lr = 1e-3
    weight_decay = 1e-5
    batch_size = 64
    num_epochs = 50
    beta = 1  # weight of the KL term in the VAE objective
    visual_interval = 2  # save reconstruction samples every N epochs
    best_loss = math.inf
    loss_record = {'train': {'total_loss': [], 'rec_loss': [], 'kl_loss': []},
                   'val': {'total_loss': [], 'rec_loss': [], 'kl_loss': []}}
    dataset = {x: ImageDataset(clean_dir, adv_dir, attack_type, x) for x in ['train', 'val']}
    dataset_sizes = {x: len(dataset[x]) for x in ['train', 'val']}
    print('Dataset size: train {}, val {}'.format(dataset_sizes['train'], dataset_sizes['val']))
    dataloaders = {'train': DataLoader(dataset['train'], batch_size=batch_size, shuffle=True, num_workers=0),
                   'val': DataLoader(dataset['val'], batch_size=batch_size, shuffle=False, num_workers=0)}
    # Initialize VAE model, optimizer and scheduler
    model = VAE(image_size, channel_num, hidden_dim, z_size, is_res, drop_p).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5, threshold=1e-7)
    # Training
    print('Start training on {}...'.format(device))
    since = time.time()
    counter = 0
    # Robustness: ensure best weights exist even if (unexpectedly) no
    # validation epoch improves on best_loss.
    best_model_wts = copy.deepcopy(model.state_dict())
    for epoch in range(num_epochs):
        print('\nEpoch {}/{}, lr: {}, wd: {}'.format(epoch + 1, num_epochs,
                                                     optimizer.param_groups[0]['lr'], weight_decay))
        print('-' * 30)
        # Early stop: quit after 5 epochs once the LR has collapsed.
        if optimizer.param_groups[0]['lr'] < 1e-6:
            counter += 1
            if counter >= 5:
                break
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            # Initial running losses
            running_total_loss = 0.0
            running_rec_loss = 0.0
            running_kl_loss = 0.0
            for inputs, targets in tqdm(dataloaders[phase],
                                        desc='{} iterations'.format(phase), leave=False):
                inputs = inputs.to(device)
                targets = targets.to(device)
                # forward-prop; gradients tracked only in the training phase
                with torch.set_grad_enabled(phase == 'train'):
                    (mean, logvar), reconstructed = model(inputs)
                    rec_loss = model.reconstruction_loss(reconstructed, targets)
                    kl_loss = model.kl_divergence_loss(mean, logvar)
                    total_loss = rec_loss + beta * kl_loss
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        # zero the parameter gradients
                        optimizer.zero_grad()
                        # backward-prop
                        total_loss.backward()
                        optimizer.step()
                # accumulate batch losses weighted by the batch size
                running_kl_loss += kl_loss.item() * inputs.size(0)
                running_rec_loss += rec_loss.item() * inputs.size(0)
                running_total_loss += total_loss.item() * inputs.size(0)
            # Compute epoch losses
            epoch_kl_loss = running_kl_loss / dataset_sizes[phase]
            epoch_rec_loss = running_rec_loss / dataset_sizes[phase]
            epoch_total_loss = running_total_loss / dataset_sizes[phase]
            # Update loss records
            loss_record[phase]['total_loss'].append(epoch_total_loss)
            loss_record[phase]['rec_loss'].append(epoch_rec_loss)
            loss_record[phase]['kl_loss'].append(epoch_kl_loss)
            # Output training/val results
            print('{} Loss: total: {:.4f}, rec_loss: {:.4f}, kl_loss: {:.4f}'
                  .format(phase, epoch_total_loss, epoch_rec_loss, epoch_kl_loss))
            # Save sample reconstructions periodically (val phase only).
            if (epoch + 1) % visual_interval == 0 and epoch > 0 and phase == 'val':
                rndIdx = random.randint(0, inputs.size(0) - 1)
                print('Save reconstructed images, random index={} in the last batch'.format(rndIdx))
                visualResults(inputs[rndIdx], reconstructed[rndIdx], targets[rndIdx], epoch + 1)
            # Step the LR scheduler on the validation total loss.
            if phase == 'val':
                scheduler.step(epoch_total_loss)
            # Keep a copy of the best model so far.
            if phase == 'val' and epoch_total_loss < best_loss:
                best_loss = epoch_total_loss
                best_model_wts = copy.deepcopy(model.state_dict())
    # End of training: report timing and persist artefacts.
    time_elapsed = time.time() - since
    print('\nTraining complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val loss: {}'.format(best_loss))
    # Save the best weights and loss records
    save_path = './trained_weights/'
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    weight_fname = 'vae_{}_zdim{}_hdim{}_e{}_lr{}.torch'.format(
        attack_type, z_size, hidden_dim, num_epochs, str(lr).split('.')[-1])
    s_path = os.path.join(save_path, weight_fname)
    torch.save(best_model_wts, s_path)
    print('Best weight save to:', s_path)
    save_path = './trained_records/'
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    record_fname = 'vae_{}_zdim{}_hdim{}_e{}_lr{}.pkl'.format(
        attack_type, z_size, hidden_dim, num_epochs, str(lr).split('.')[-1])
    s_path = os.path.join(save_path, record_fname)
    # BUG FIX: this used to save best_model_wts a second time — the loss
    # history (loss_record) was built throughout training but never written.
    # Save the records here, matching the message below.
    torch.save(loss_record, s_path)
    print('Training records save to:', s_path)
def main(args):
    """Train or evaluate a ResNet-based embedding model on DeepFashion2.

    When ``args.phase == 'train'``: fit the embedding network with an online
    hardest-negative triplet loss plus an auxiliary 2-way (user vs. shop)
    domain classification head.
    Otherwise: embed the validation split, report domain-classification
    confusion counts and top-k retrieval accuracy, and dump the embedding
    matrix / file list to ``args.save_dir``.
    """
    model_path = args.model_path
    save_dir = args.save_dir
    vec_dim = 128

    # At test time only the validation split is needed.
    data_type = ['validation'] if args.phase == 'test' else ['train', 'validation']
    img_list, base_path, item_dict = read_data("DeepFashion2", bbox_gt=True,
                                               type_list=data_type)

    # clf2_num=2: auxiliary two-class (user/shop) domain classifier head.
    model = ResNetbasedNet(vec_dim=vec_dim, max_pool=True,
                           load_path=model_path, clf2_num=2)

    domain_adap = args.domain_adap
    adv_train = args.adv_train
    is_cud = torch.cuda.is_available()
    device = torch.device("cuda" if is_cud else "cpu")
    if is_cud:
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
    model.to(device)

    kwargs = {'num_workers': 8, 'pin_memory': True} if is_cud else {}

    if args.phase == 'train':
        train_dataset = DeepFashionDataset(img_list['train'], root=base_path,
                                           augment=True)
        # Batches balanced over classes and photo source (user/shop).
        train_batch_sampler = BalancedBatchSampler(train_dataset.labels,
                                                   train_dataset.source,
                                                   n_classes=64, n_samples=4)
        online_train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_sampler=train_batch_sampler, **kwargs)

        test_dataset = DeepFashionDataset(img_list['validation'], root=base_path)
        test_batch_sampler = BalancedBatchSampler(test_dataset.labels,
                                                  test_dataset.source,
                                                  n_classes=64, n_samples=4)
        online_test_loader = torch.utils.data.DataLoader(
            test_dataset, batch_sampler=test_batch_sampler, **kwargs)

        margin = 0.2
        loss_fn = OnlineTripletLoss(margin,
                                    HardestNegativeTripletSelector(margin),
                                    domain_adap)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=5e-4)
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode="max",
            patience=4,
            threshold=1,
            cooldown=2,
            min_lr=1e-5 / (10 * 2),
        )
        n_epochs = 300
        log_interval = 200

        fit(online_train_loader, online_test_loader, model, loss_fn, optimizer,
            scheduler, n_epochs, is_cud, log_interval, save_dir,
            metrics=[AverageNonzeroTripletsMetric()], start_epoch=200,
            criterion=criterion, domain_adap=domain_adap, adv_train=adv_train)
    else:
        with torch.no_grad():
            model.eval()
            test_dataset = DeepFashionDataset(img_list['validation'], root=base_path)
            test_loader = torch.utils.data.DataLoader(
                test_dataset, batch_size=256, shuffle=False, num_workers=4)

            embedding_mtx = torch.zeros((len(test_dataset), vec_dim))
            labels = np.zeros(len(test_dataset))
            top_k = 500
            idx_ = 0
            start_time = time.time()
            # Confusion counts, in order:
            # [pred_user&real_user, pred_user&real_shop,
            #  pred_shop&real_user, pred_shop&real_shop]
            cf_mtx = np.zeros(4, dtype=float)

            for idx, (data, target, _, source) in enumerate(test_loader):
                # BUGFIX: was data.cuda(), which crashes on CPU-only hosts
                # even though `device` was computed above.
                emb_vecs = model(data.to(device))
                embedding_mtx[idx_:idx_ + len(data)] = emb_vecs[0]
                predict = torch.argmax(emb_vecs[1], dim=1).cpu().numpy()
                real = source.cpu().numpy()
                cf_mtx[0] += np.sum((predict == 0) & (real == 0))
                cf_mtx[1] += np.sum((predict == 0) & (real == 1))
                cf_mtx[2] += np.sum((predict == 1) & (real == 0))
                cf_mtx[3] += np.sum((predict == 1) & (real == 1))
                labels[idx_:idx_ + len(data)] = np.asarray(target)
                idx_ += len(data)
                if idx % 20 == 0:
                    print('processing {}/{}... elapsed time {}s'.format(
                        idx + 1, len(test_loader), time.time() - start_time))

            print('Total: {}, Domain Classification Acc: {:.5f}'.format(
                np.sum(cf_mtx), (cf_mtx[0] + cf_mtx[3]) / np.sum(cf_mtx)))
            print('Recall User Photo: {:.5f}'.format(cf_mtx[0] / (cf_mtx[0] + cf_mtx[2])))
            print('Recall Shop Photo: {:.5f}'.format(cf_mtx[3] / (cf_mtx[1] + cf_mtx[3])))

            np.save(os.path.join(save_dir, 'emb_mtx.npy'), embedding_mtx)
            with open(os.path.join(save_dir, 'file_info.txt'), 'w') as f:
                for i in range(len(test_dataset)):
                    f.write('{},{},{},{}\n'.format(img_list['validation'][i][0],
                                                   test_dataset[i][1],
                                                   test_dataset[i][2],
                                                   test_dataset[i][3]))
            print('save files!')

            # Pairwise distances between all embeddings; rank neighbours per row.
            distance_mtx = pdist(embedding_mtx)
            sorted_idx = torch.argsort(distance_mtx, dim=1).cpu().numpy()
            result_arr = np.zeros((sorted_idx.shape[0], top_k))
            for idx in range(sorted_idx.shape[0]):
                # Drop self-match, keep the top_k nearest neighbours.
                result_arr[idx] = sorted_idx[idx][sorted_idx[idx] != idx][:top_k]
                # BUGFIX: np.int was deprecated in NumPy 1.20 and removed in
                # 1.24 — use the builtin int.
                result_arr[idx] = labels[result_arr[idx].astype(int)] == labels[idx]
                if idx % 1000 == 0:
                    print(idx)

            for k in [1, 5, 10, 20, 100, 200, 500]:
                # Hit if any of the k nearest neighbours shares the query label.
                topk_accuracy = np.sum(
                    np.sum(result_arr[:, :k], axis=1) > 0) / result_arr.shape[0]
                print('Top-{} Accuracy: {:.5f}'.format(k, topk_accuracy))
def main():
    """Train a 21-class semantic-segmentation model.

    Iteration-based training with gradient accumulation (``args.iter_size``),
    optional multi-scale outputs (``args.msc``), per-epoch validation,
    ReduceLROnPlateau scheduling on validation mIoU, TensorBoard logging and
    best/periodic checkpointing. Supports resuming from a checkpoint and
    multi-GPU via DataParallel.
    """
    args = parse_command()
    print(args)

    # If a GPU id is given, restrict training to that single GPU.
    if args.gpu:
        print('Single GPU Mode.')
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    best_result = Result()
    best_result.set_to_worst()

    # Set random seeds for reproducibility.
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)
    np.random.seed(args.manual_seed)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # Scale the batch size with the number of GPUs.
        args.batch_size = args.batch_size * torch.cuda.device_count()
    else:
        print("Let's use GPU ", torch.cuda.current_device())

    train_loader, val_loader = create_loader(args)

    if args.resume:
        assert os.path.isfile(args.resume), \
            "=> no checkpoint found at '{}'".format(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        optimizer = checkpoint['optimizer']
        # The checkpoint stores the whole model object (works around
        # 'out of memory' when rebuilding and loading a state dict).
        model = checkpoint['model']
        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))
        # Release checkpoint memory immediately.
        del checkpoint
        torch.cuda.empty_cache()
    else:
        print("=> creating Model")
        model = get_models(args)
        print("=> model created.")
        start_epoch = 0

        # Backbone parameters train at args.lr, head parameters at 10x.
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]

        optimizer = torch.optim.SGD(train_params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

        # DataParallel works whether one or several GPUs are present.
        model = nn.DataParallel(model).cuda()

    # Reduce the learning rate when the monitored metric plateaus.
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               patience=args.lr_patience)

    # Loss function.
    criterion = criteria._CrossEntropyLoss2d(size_average=True, batch_average=True)

    # Create output directory and record the run configuration once.
    output_directory = utils.get_output_directory(args)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    best_txt = os.path.join(output_directory, 'best.txt')
    config_txt = os.path.join(output_directory, 'config.txt')

    if not os.path.exists(config_txt):
        with open(config_txt, 'w') as txtfile:
            args_ = vars(args)
            args_str = ''
            for k, v in args_.items():
                args_str = args_str + str(k) + ':' + str(v) + ',\t\n'
            txtfile.write(args_str)

    # One TensorBoard log directory per run, named by timestamp and host.
    log_path = os.path.join(
        output_directory, 'logs',
        datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    logger = SummaryWriter(log_path)

    start_iter = len(train_loader) * start_epoch + 1
    max_iter = len(train_loader) * (args.epochs - start_epoch + 1) + 1
    iter_save = len(train_loader)  # validate/checkpoint once per epoch

    # Train.
    model.train()
    if args.freeze:
        model.module.freeze_backbone_bn()
    output_directory = utils.get_output_directory(args, check=True)
    average_meter = AverageMeter()
    train_meter = AverageMeter()

    for it in tqdm(range(start_iter, max_iter + 1), total=max_iter,
                   leave=False, dynamic_ncols=True):
        optimizer.zero_grad()
        loss = 0
        data_time = 0
        gpu_time = 0

        # Accumulate gradients over args.iter_size mini-batches.
        for _ in range(args.iter_size):
            end = time.time()
            try:
                samples = next(loader_iter)
            # BUGFIX: a bare `except:` also swallowed KeyboardInterrupt etc.
            # NameError: first iteration (loader_iter not yet bound);
            # StopIteration: loader exhausted -> start a new pass.
            except (NameError, StopIteration):
                loader_iter = iter(train_loader)
                samples = next(loader_iter)

            input = samples['image'].cuda()
            target = samples['label'].cuda()
            torch.cuda.synchronize()
            data_time_ = time.time()
            data_time += data_time_ - end

            with torch.autograd.detect_anomaly():
                preds = model(input)
                iter_loss = 0
                if args.msc:
                    # Multi-scale: one loss term per {100%, 75%, 50%, Max} logits.
                    for pred in preds:
                        target_ = utils.resize_labels(
                            target, shape=(pred.size()[-2], pred.size()[-1]))
                        iter_loss += criterion(pred, target_)
                else:
                    pred = preds
                    target_ = utils.resize_labels(
                        target, shape=(pred.size()[-2], pred.size()[-1]))
                    iter_loss += criterion(pred, target_)

                # Normalize so the accumulated gradient matches a full batch.
                iter_loss /= args.iter_size
                iter_loss.backward()
                loss += float(iter_loss)

            gpu_time += time.time() - data_time_
            torch.cuda.synchronize()

        # Update weights with the accumulated gradients.
        optimizer.step()

        # Measure accuracy on the last accumulated mini-batch, record loss.
        result = Result()
        pred = F.softmax(pred, dim=1)
        result.evaluate(pred.data.cpu().numpy(), target.data.cpu().numpy(),
                        n_class=21)
        average_meter.update(result, gpu_time, data_time, input.size(0))
        train_meter.update(result, gpu_time, data_time, input.size(0))

        if it % args.print_freq == 0:
            print('=> output: {}'.format(output_directory))
            print('Train Iter: [{0}/{1}]\t'
                  't_Data={data_time:.3f}({average.data_time:.3f}) '
                  't_GPU={gpu_time:.3f}({average.gpu_time:.3f})\n\t'
                  'Loss={Loss:.5f} '
                  'MeanAcc={result.mean_acc:.3f}({average.mean_acc:.3f}) '
                  'MIOU={result.mean_iou:.3f}({average.mean_iou:.3f}) '.format(
                      it, max_iter, data_time=data_time, gpu_time=gpu_time,
                      Loss=loss, result=result, average=average_meter.average()))
            logger.add_scalar('Train/Loss', loss, it)
            # BUGFIX: the acc/iou values were swapped between the two tags.
            logger.add_scalar('Train/mean_acc', result.mean_acc, it)
            logger.add_scalar('Train/mean_iou', result.mean_iou, it)

        if it % iter_save == 0:
            epoch = it // iter_save
            # BUGFIX: the validation result was bound to a typo name
            # ('resu1t'), so the scheduler and best-model logic below used
            # the stale *training* result instead of the validation one.
            result, img_merge = validate(args, val_loader, model,
                                         epoch=epoch, logger=logger)

            # Reduce learning rate when validation mIoU stops improving.
            scheduler.step(result.mean_iou)

            # Log the (possibly reduced) learning rate of each param group.
            for i, param_group in enumerate(optimizer.param_groups):
                old_lr = float(param_group['lr'])
                logger.add_scalar('Lr/lr_' + str(i), old_lr, it)

            # Visualize the train-vs-val gap.
            train_avg = train_meter.average()
            logger.add_scalars(
                'TrainVal/mean_acc',
                {'train_mean_acc': train_avg.mean_acc,
                 'test_mean_acc': result.mean_acc}, epoch)
            logger.add_scalars(
                'TrainVal/mean_iou',
                {'train_mean_iou': train_avg.mean_iou,
                 'test_mean_iou': result.mean_iou}, epoch)
            train_meter.reset()

            # Remember the best validation mIoU and save a checkpoint.
            # BUGFIX: higher mIoU is better; the original compared with '<'.
            is_best = result.mean_iou > best_result.mean_iou
            if is_best:
                best_result = result
                with open(best_txt, 'w') as txtfile:
                    # BUGFIX: the two literals were concatenated without a
                    # separator ("...{:.3f}t_gpu=...").
                    txtfile.write("epoch={}, mean_iou={:.3f}, mean_acc={:.3f}, "
                                  "t_gpu={:.4f}".format(
                                      epoch, result.mean_iou, result.mean_acc,
                                      result.gpu_time))
                if img_merge is not None:
                    img_filename = output_directory + '/comparison_best.png'
                    utils.save_image(img_merge, img_filename)

            # Save a checkpoint every epoch.
            utils.save_checkpoint({
                'args': args,
                'epoch': epoch,
                'model': model,
                'best_result': best_result,
                'optimizer': optimizer,
            }, is_best, it, output_directory)

            # Back to train mode.
            model.train()
            if args.freeze:
                model.module.freeze_backbone_bn()

    logger.close()
nn.BatchNorm1d(64), nn.ReLU(), nn.Linear(64, 10), ) def forward(self,x): out = self.layer(x) out = out.view(batch_size, -1) out = self.fc_layer(out) return out #머신 러닝 과정 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = CNN().to(device) loss_func = nn.CrossEntropyLoss().to(device) optimizer = optim.Adam(model.parameters(), lr=learning_rate) scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,threshold=0.1, patience=1, mode='min') for i in range(1, num_epoch+1): for _,[image,label] in enumerate(train_loader): x = image.to(device) y_= label.to(device) optimizer.zero_grad() output = model.forward(x) loss = loss_func(output, y_) loss.backward() optimizer.step() scheduler.step(loss) print('Epoch: {}, Loss: {}, LR: {}'.format(i, loss.item(), scheduler.optimizer.state_dict()['param_groups'][0]['lr'])) #정확도 계산
def __init__(self, argpath, mode='cn'):
    """Build a TRADE dialogue-state-tracking model and load trained weights.

    The hidden size and decoder name are parsed out of `argpath` itself
    (the checkpoint directory name encodes HDD/BSZ/DR hyper-parameters,
    e.g. ".../TRADE-.../HDD400BSZ32DR0.2...").
    """
    super(TRADE, self).__init__()
    self.init_session()
    self.crosswoz_root = os.path.dirname(os.path.abspath(__file__))
    # Fetch pretrained model/data archives if not present locally.
    self.download_model()
    self.download_data()

    # Parse hyper-parameters encoded in the checkpoint path.
    directory = argpath.split("/")
    HDD = directory[2].split('HDD')[1].split('BSZ')[0]   # hidden size
    decoder = directory[1].split('-')[0]                 # decoder name
    # NOTE(review): BSZ is computed but never used below — presumably kept
    # for parity with the training script; `args` here is a module-level
    # global, not a parameter. Verify.
    BSZ = int(args['batch']) if args['batch'] else int(directory[2].split('BSZ')[1].split('DR')[0])
    args["decoder"] = decoder

    # Prepare the CrossWOZ DST data; returns datasets, vocab langs,
    # slot lists, the gating dictionary and the training vocab size.
    train, dev, test, test_special, lang, SLOTS_LIST, gating_dict, max_word = prepare_data_seq_cn(False, 'dst', False, batch_size=4)
    self.slot_list = SLOTS_LIST
    self.test_set = test

    hidden_size = int(HDD)
    lang = lang
    path = argpath
    # NOTE(review): lr is hard-coded to 0, so the Adam optimizer built below
    # will not update any weights — this instance looks inference-only.
    # Confirm before reusing for training.
    lr = 0
    task = 'dst'
    dropout = 0
    slots = SLOTS_LIST
    gating_dict = gating_dict
    nb_train_vocab = max_word

    self.mode = mode
    self.name = "TRADE"
    self.task = task
    self.hidden_size = hidden_size
    self.lang = lang[0]       # word-level vocabulary
    self.mem_lang = lang[1]   # memory/value vocabulary
    self.lr = lr
    self.dropout = dropout
    self.slots = slots[0]
    self.slot_temp = slots[2]
    self.gating_dict = gating_dict
    self.nb_gate = len(gating_dict)
    # NOTE(review): 'cross_entorpy' is a typo, but it is a public attribute —
    # kept as-is for compatibility with any external users.
    self.cross_entorpy = nn.CrossEntropyLoss()

    # Encoder/decoder pair; the decoder (pointer-generator) shares the
    # encoder's embedding table.
    self.encoder = EncoderRNN(self.lang.n_words, hidden_size, self.dropout, mode=mode)
    self.decoder = Generator(self.lang, self.encoder.embedding, self.lang.n_words, hidden_size, self.dropout, self.slots, self.nb_gate)

    model_root = os.path.dirname(os.path.abspath(__file__))
    if path:
        # Resolve the checkpoint path relative to this module's directory.
        path = os.path.join(model_root, path)
        # if USE_CUDA:
        #     print("MODEL {} LOADED".format(str(path)))
        #     trained_encoder = torch.load(str(path) + '/enc.th')
        #     trained_decoder = torch.load(str(path) + '/dec.th')
        # else:
        #     print("MODEL {} LOADED".format(str(path)))
        #     trained_encoder = torch.load(str(path) + '/enc.th', lambda storage, loc: storage)
        #     trained_decoder = torch.load(str(path) + '/dec.th', lambda storage, loc: storage)
        self.encoder.load_state_dict(torch.load(str(path) + '/enc.pr'))
        self.decoder.load_state_dict(torch.load(str(path) + '/dec.pr'))

    # Initialize optimizer and LR scheduler (maximizing a validation metric).
    self.optimizer = optim.Adam(self.parameters(), lr=lr)
    self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, mode='max', factor=0.5, patience=1, min_lr=0.0001, verbose=True)

    self.reset()
    if USE_CUDA:
        self.encoder.cuda()
        self.decoder.cuda()
def train(args):
    """Train a sentence-pair (Quora) classifier.

    Builds vocabularies over all splits ('open-vocabulary' mode), loads a
    model configuration from ``<save_dir>/config.yml``, then runs a
    batch-level training loop with either cosine-annealing (optionally with
    warm restarts) or ReduceLROnPlateau-on-accuracy scheduling, periodic
    validation and best-model checkpointing.
    """
    device = torch.device(args.device)
    text_field = TextField()
    label_field = LabelField()
    train_dataset, valid_dataset, test_dataset = load_data(
        root='data', text_field=text_field, label_field=label_field)
    # Our model will be run in 'open-vocabulary' mode.
    text_field.build_vocab(train_dataset, valid_dataset, test_dataset)
    label_field.build_vocab(train_dataset)
    text_field.vocab.load_vectors(args.word_vector)
    # Trim training data to make them shorter than the max length.
    trim_dataset(train_dataset, max_length=args.max_length)
    train_loader, valid_loader, test_loader = data.Iterator.splits(
        datasets=(train_dataset, valid_dataset, test_dataset),
        batch_size=args.batch_size, device=device)

    config_path = os.path.join(args.save_dir, 'config.yml')
    with open(config_path, 'r') as f:
        # BUGFIX: yaml.load(f) without a Loader is unsafe and raises a
        # TypeError under PyYAML >= 6; safe_load is the correct call for a
        # plain config file.
        config = yaml.safe_load(f)
    model = QuoraModel(num_words=len(text_field.vocab),
                       num_classes=len(label_field.vocab),
                       **config['model'])
    model.word_embedding.weight.data.set_(text_field.vocab.vectors)
    # Freeze (or not) the pretrained word vectors.
    model.word_embedding.weight.requires_grad = args.tune_word_embeddings
    print(model)
    model.to(device)

    num_params = sum(p.numel() for p in model.parameters())
    num_intrinsic_params = num_params - model.word_embedding.weight.numel()
    logger.info(f'* # of params: {num_params}')
    logger.info(f'  - Intrinsic: {num_intrinsic_params}')
    logger.info(f'  - Word embedding: {num_params - num_intrinsic_params}')

    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable_params)
    # Warm restarts only make sense with cosine annealing.
    assert not args.warm_restart or args.cosine_lr
    if args.cosine_lr:
        if not args.warm_restart:
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer=optimizer, T_max=len(train_loader) * args.max_epoch)
        else:
            scheduler = lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
                                                       T_max=len(train_loader))
    else:
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                   mode='max', factor=0.5,
                                                   patience=10, verbose=True)
    criterion = nn.CrossEntropyLoss()

    def run_iter(batch):
        """Run one batch; backprop only when the model is in train mode."""
        pre_text, pre_length = batch.text1
        hyp_text, hyp_length = batch.text2
        label = batch.label
        logit = model(pre_inputs=pre_text, pre_length=pre_length,
                      hyp_inputs=hyp_text, hyp_length=hyp_length)
        clf_loss = criterion(input=logit, target=label)
        pred = logit.max(1)[1]
        accuracy = torch.eq(pred, label).float().mean()
        if model.training:
            # Optional L2 regularization over the trainable parameters.
            if args.l2_weight > 0:
                l2_norm = sum(p.pow(2).sum() for p in trainable_params).sqrt()
            else:
                l2_norm = 0
            loss = clf_loss + args.l2_weight * l2_norm
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(trainable_params, max_norm=5)
            optimizer.step()
        return clf_loss.item(), accuracy.item()

    def validate(loader):
        """Average classification loss/accuracy over a whole loader."""
        model.eval()
        clf_loss_sum = accuracy_sum = 0
        num_valid_data = len(loader.dataset)
        with torch.no_grad():
            for valid_batch in loader:
                clf_loss, accuracy = run_iter(valid_batch)
                clf_loss_sum += clf_loss * valid_batch.batch_size
                accuracy_sum += accuracy * valid_batch.batch_size
        clf_loss = clf_loss_sum / num_valid_data
        accuracy = accuracy_sum / num_valid_data
        return clf_loss, accuracy

    train_summary_writer = SummaryWriter(
        os.path.join(args.save_dir, 'log', 'train'))
    valid_summary_writer = SummaryWriter(
        os.path.join(args.save_dir, 'log', 'valid'))

    validate_every = len(train_loader) // args.verbosity
    best_valid_accuracy = 0
    global_step = 0
    logger.info('Training starts!')

    for train_batch in train_loader:
        if not model.training:
            model.train()
        train_clf_loss, train_accuracy = run_iter(train_batch)
        global_step += 1

        # Per-batch LR stepping for the cosine schedules; plateau scheduling
        # happens at validation time below.
        if args.cosine_lr:
            if not args.warm_restart:
                scheduler.step()
            else:
                if scheduler.last_epoch == scheduler.T_max:
                    # Double the period and restart the annealing cycle.
                    scheduler.T_max = scheduler.T_max * 2
                    scheduler.step(0)
                    logger.info('Warm-restarted the learning rate!')
                else:
                    scheduler.step()

        train_summary_writer.add_scalar(tag='clf_loss',
                                        scalar_value=train_clf_loss,
                                        global_step=global_step)
        train_summary_writer.add_scalar(tag='accuracy',
                                        scalar_value=train_accuracy,
                                        global_step=global_step)

        if global_step % validate_every == 0:
            progress = train_loader.iterations / len(train_loader)
            logger.info(f'* Epoch {progress:.2f}')
            logger.info(f'  - lr = {optimizer.param_groups[0]["lr"]:.6f}')
            logger.info(f'  - Validation starts')
            valid_clf_loss, valid_accuracy = validate(valid_loader)
            _, test_accuracy = validate(test_loader)
            if not args.cosine_lr:
                # Plateau scheduler monitors validation accuracy (mode='max').
                scheduler.step(valid_accuracy)
            valid_summary_writer.add_scalar(tag='clf_loss',
                                            scalar_value=valid_clf_loss,
                                            global_step=global_step)
            valid_summary_writer.add_scalar(tag='accuracy',
                                            scalar_value=valid_accuracy,
                                            global_step=global_step)
            valid_summary_writer.add_scalar(
                tag='lr', scalar_value=optimizer.param_groups[0]['lr'],
                global_step=global_step)
            logger.info(f'  - Valid clf loss: {valid_clf_loss:.5f}')
            logger.info(f'  - Valid accuracy: {valid_accuracy:.5f}')
            logger.info(f'  - Test accuracy: {test_accuracy:.5f}')
            if valid_accuracy > best_valid_accuracy:
                best_valid_accuracy = valid_accuracy
                model_filename = (f'best-{progress:.2f}'
                                  f'-{valid_clf_loss:.5f}'
                                  f'-{valid_accuracy:.5f}.pt')
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                logger.info(f'  - Saved the new best model to: {model_path}')
            elif args.save_every_epoch and global_step % (validate_every * 10) == 0:
                model_filename = (f'model-{progress:.2f}'
                                  f'-{valid_clf_loss:.5f}'
                                  f'-{valid_accuracy:.5f}.pt')
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                logger.info(f'  - Saved the new model to: {model_path}')
        if train_loader.epoch > args.max_epoch:
            break
def main():
    """Train a two-branch multimodal classifier and run inference on the
    test split with the best (lowest-val-loss) checkpoint.

    Tracks three metrics (accuracy, loss, F1) and saves per-metric best
    checkpoints via ``save_checkpoint``.
    """
    fold = 0

    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.makedirs(config.submit)
    if not os.path.exists(config.weights + config.model_name + os.sep + str(fold)):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold))
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists("./logs/"):
        os.mkdir("./logs/")

    # 4.2 get model
    model = MultiModalNet("se_resnext50_32x4d", "dpn26", 0.5)

    # 4.3 optimizer & criterion
    # Cross-entropy loss for the classification task.
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=config.lr,
                           betas=(0.9, 0.999), weight_decay=1e-4)

    start_epoch = 0
    best_acc = 0
    best_loss = np.inf
    best_f1 = 0
    best_results = [0, np.inf, 0]   # [best_acc, best_loss, best_f1]
    val_metrics = [0, np.inf, 0]
    resume = False
    if resume:
        checkpoint = torch.load(
            r'./checkpoints/best_models/seresnext101_dpn107_defrog_multimodal_fold_0_model_best_loss.pth.tar'
        )
        best_acc = checkpoint['best_acc']
        best_loss = checkpoint['best_loss']
        best_f1 = checkpoint['best_f1']
        start_epoch = checkpoint['epoch']
        # BUGFIX: the original restored the bookkeeping metrics but never
        # the model weights, so "resume" silently restarted from scratch.
        model.load_state_dict(checkpoint['state_dict'])

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    all_files = pd.read_csv("./train.csv")
    test_files = pd.read_csv("./test.csv")
    train_data_list, val_data_list = train_test_split(all_files,
                                                      test_size=0.1,
                                                      random_state=2050)

    # load dataset (num_workers is limited by shared memory in Docker!)
    train_gen = MultiModalDataset(train_data_list, config.train_data,
                                  config.train_vis, mode="train")
    train_loader = DataLoader(train_gen, batch_size=config.batch_size,
                              shuffle=True, pin_memory=True, num_workers=1)

    val_gen = MultiModalDataset(val_data_list, config.train_data,
                                config.train_vis, augument=False, mode="train")
    val_loader = DataLoader(val_gen, batch_size=config.batch_size,
                            shuffle=False, pin_memory=True, num_workers=1)

    test_gen = MultiModalDataset(test_files, config.test_data, config.test_vis,
                                 augument=False, mode="test")
    test_loader = DataLoader(test_gen, 1, shuffle=False, pin_memory=True,
                             num_workers=1)

    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)

    start = timer()

    # train
    for epoch in range(0, config.epochs):
        # train
        train_metrics = train(train_loader, model, criterion, optimizer,
                              epoch, val_metrics, best_results, start)
        # val
        val_metrics = evaluate(val_loader, model, criterion, epoch,
                               train_metrics, best_results, start)
        # BUGFIX: the original called scheduler.step(epoch) at the top of the
        # loop; ReduceLROnPlateau.step expects the monitored *metric*, and a
        # monotonically increasing epoch number causes spurious LR decay.
        # Step on the validation loss instead, after evaluation.
        scheduler.step(val_metrics[1])

        # check results
        is_best_acc = val_metrics[0] > best_results[0]
        best_results[0] = max(val_metrics[0], best_results[0])
        is_best_loss = val_metrics[1] < best_results[1]
        best_results[1] = min(val_metrics[1], best_results[1])
        is_best_f1 = val_metrics[2] > best_results[2]
        best_results[2] = max(val_metrics[2], best_results[2])

        # save model
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "best_acc": best_results[0],
                "best_loss": best_results[1],
                "optimizer": optimizer.state_dict(),
                "fold": fold,
                "best_f1": best_results[2],
            }, is_best_acc, is_best_loss, is_best_f1, fold)

        # print logs
        print('\r', end='', flush=True)
        log.write('%s %5.1f %6.1f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s %s %s | %s' % (
            "best", epoch, epoch,
            train_metrics[0], train_metrics[1], train_metrics[2],
            val_metrics[0], val_metrics[1], val_metrics[2],
            str(best_results[0])[:8], str(best_results[1])[:8], str(best_results[2])[:8],
            time_to_str((timer() - start), 'min')))
        log.write("\n")
        time.sleep(0.01)

    # Reload the lowest-validation-loss checkpoint and run test inference.
    best_model = torch.load("%s/%s_fold_%s_model_best_loss.pth.tar" %
                            (config.best_models, config.model_name, str(fold)))
    model.load_state_dict(best_model["state_dict"])
    test(test_loader, model, fold)
# ## Load a model # In[19]: model_conv = models.inception_v3(pretrained=True) model_conv.aux_logits = False num_ftrs = model_conv.fc.in_features model_conv.fc = nn.Linear(num_ftrs, 3) model_conv = model_conv.to(device) criterion = nn.CrossEntropyLoss() # In[20]: optimizer_ft = optim.Adam(model_conv.parameters(), lr=0.001) exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_ft, patience=5, verbose=True) # ## Run the model # In[21]: model_conv, train_loss, train_acc, val_loss, val_acc, true_train_labels, predicted_train_labels = train_model( model_conv, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=10) # In[24]: plt.hist(train_labels['level3']) # In[25]:
def main(args):
    """Train an EAST text detector with a domain-adaptation loss.

    Trains on a labelled source set plus an unlabelled target set, validates
    each epoch, steps ReduceLROnPlateau on the validation loss, and saves
    both the best model and periodic checkpoints. Supports resuming
    (model + scheduler + best loss + epoch) from ``args.resume``.
    """
    source_train_set = custom_dataset(args.train_data_path, args.train_gt_path)
    target_train_set = custom_dataset(args.target_data_path, args.target_gt_path)
    valid_train_set = valid_dataset(args.val_data_path, args.val_gt_path)

    source_train_loader = data.DataLoader(source_train_set,
                                          batch_size=args.batch_size,
                                          shuffle=True,
                                          num_workers=args.num_workers,
                                          drop_last=True)
    target_train_loader = data.DataLoader(target_train_set,
                                          batch_size=args.batch_size,
                                          shuffle=True,
                                          num_workers=args.num_workers,
                                          drop_last=True)
    valid_loader = data.DataLoader(valid_train_set,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   num_workers=args.num_workers,
                                   drop_last=False)

    criterion = Loss().to(device)
    # domain loss
    loss_domain = torch.nn.CrossEntropyLoss().to(device)

    best_loss = 1000
    best_num = 0
    model = EAST()
    if args.pretrained_model_path:
        model.load_state_dict(torch.load(args.pretrained_model_path))

    # BUGFIX: the original set current_epoch_num inside the resume branch and
    # then unconditionally reset it to 0 afterwards, so resuming always
    # restarted at epoch 0; it also re-loaded the checkpoint file a second
    # time just for the scheduler. Initialize first, load the checkpoint
    # once, and reuse it for the scheduler below.
    current_epoch_num = 0
    checkpoint = None
    if args.resume:
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])
        best_loss = checkpoint['best_loss']
        current_epoch_num = checkpoint['epoch']

    data_parallel = False
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        data_parallel = True
    model.to(device)

    total_epoch = args.epochs
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # Decay LR when the validation loss plateaus.
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                               factor=0.1, patience=6,
                                               threshold=args.lr / 100)
    if checkpoint is not None:
        scheduler.load_state_dict(checkpoint['scheduler'])

    for epoch in range(current_epoch_num, total_epoch):
        # Log the current learning rate.
        writer.add_scalar('epoch/lr', get_learning_rate(optimizer), epoch)

        train(source_train_loader, target_train_loader, model, criterion,
              loss_domain, optimizer, epoch)
        val_loss = eval(model, valid_loader, criterion, loss_domain, epoch)
        scheduler.step(val_loss)

        if val_loss < best_loss:
            best_num = epoch + 1
            best_loss = val_loss
            best_model_wts = copy.deepcopy(
                model.module.state_dict() if data_parallel else model.state_dict())
            # save best model
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': best_model_wts,
                    'best_loss': best_loss,
                    'scheduler': scheduler.state_dict(),
                }, os.path.join(save_folder, "model_epoch_best.pth"))
            log.write('best model num:{}, best loss is {:.8f}'.format(
                best_num, best_loss))
            log.write('\n')

        # Periodic checkpoint every save_interval epochs.
        if (epoch + 1) % int(args.save_interval) == 0:
            state_dict = (model.module.state_dict()
                          if data_parallel else model.state_dict())
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': state_dict,
                    'best_loss': best_loss,
                    'scheduler': scheduler.state_dict(),
                }, os.path.join(save_folder,
                                'model_epoch_{}.pth'.format(epoch + 1)))
            log.write('save model')
            log.write('\n')

        log.write('=' * 50)
        log.write('\n')
def main_worker():
    """Train the FasterRCNN relationship-detection model.

    Seeds all RNGs, builds train/val dataloaders, splits trainable
    parameters into backbone vs predicate-branch (bias vs non-bias)
    groups with their own lr/weight-decay, then runs an iteration-based
    training loop with periodic validation, checkpointing, and
    TensorBoard logging.
    """
    # Reproducibility: fix every RNG the pipeline touches.
    seed = 1
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    opt = parse_opts()

    train_data = get_training_data(cfg)
    val_data = get_validation_data(cfg)
    train_loader = DataLoader(train_data, num_workers=opt.num_workers,
                              collate_fn=collater, batch_size=opt.batch_size,
                              shuffle=True)
    # NOTE(review): the validation loader also shuffles — harmless for
    # averaged losses, but confirm it is intentional.
    val_loader = DataLoader(val_data, num_workers=opt.num_workers,
                            collate_fn=collater, batch_size=opt.batch_size,
                            shuffle=True)
    print(f"Training dataset size : {len(train_loader.dataset)}")
    print(f"Validation dataset size : {len(val_loader.dataset)}")

    dataiterator = iter(train_loader)

    faster_rcnn = FasterRCNN()

    # loading model from a ckpt
    if opt.weight_path:
        load_from_ckpt(opt, faster_rcnn)
    faster_rcnn.to(cfg.DEVICE)

    # Command-line overrides of the config defaults.
    if opt.lr is not None:
        cfg.TRAIN.LEARNING_RATE = opt.lr
    lr = cfg.TRAIN.LEARNING_RATE
    print(f"Learning rate : {lr}")

    if opt.weight_decay is not None:
        cfg.TRAIN.WEIGHT_DECAY = opt.weight_decay
    print(f"Weight Decay : {cfg.TRAIN.WEIGHT_DECAY}")

    ### Optimizer ###
    # Split trainable params into backbone (fpn / box_head /
    # box_predictor / rpn) vs predicate-branch, and bias vs non-bias,
    # so biases can get a doubled lr and an optional weight-decay
    # exemption.
    backbone_bias_params = []
    backbone_bias_param_names = []
    prd_branch_bias_params = []
    prd_branch_bias_param_names = []
    backbone_nonbias_params = []
    backbone_nonbias_param_names = []
    prd_branch_nonbias_params = []
    prd_branch_nonbias_param_names = []
    for key, value in dict(faster_rcnn.named_parameters()).items():
        if value.requires_grad:
            if 'fpn' in key or 'box_head' in key or 'box_predictor' in key or 'rpn' in key:
                if 'bias' in key:
                    backbone_bias_params.append(value)
                    backbone_bias_param_names.append(key)
                else:
                    backbone_nonbias_params.append(value)
                    backbone_nonbias_param_names.append(key)
            else:
                if 'bias' in key:
                    prd_branch_bias_params.append(value)
                    prd_branch_bias_param_names.append(key)
                else:
                    prd_branch_nonbias_params.append(value)
                    prd_branch_nonbias_param_names.append(key)

    params = [
        {
            'params': backbone_nonbias_params,
            'lr': cfg.TRAIN.LEARNING_RATE,
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY
        },
        {
            'params': backbone_bias_params,
            'lr': cfg.TRAIN.LEARNING_RATE * (cfg.TRAIN.DOUBLE_BIAS + 1),
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY if cfg.TRAIN.BIAS_DECAY else 0
        },
        {
            'params': prd_branch_nonbias_params,
            'lr': cfg.TRAIN.LEARNING_RATE,
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY
        },
        {
            'params': prd_branch_bias_params,
            'lr': cfg.TRAIN.LEARNING_RATE * (cfg.TRAIN.DOUBLE_BIAS + 1),
            'weight_decay': cfg.TRAIN.WEIGHT_DECAY if cfg.TRAIN.BIAS_DECAY else 0
        },
    ]

    if cfg.TRAIN.TYPE == "ADAM":
        optimizer = torch.optim.Adam(params)
    elif cfg.TRAIN.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.TRAIN.MOMENTUM)
    else:
        # Bug fix: an unrecognized TYPE previously left `optimizer` unbound
        # and crashed later with a NameError; fail fast instead.
        raise ValueError(f"Unsupported cfg.TRAIN.TYPE: {cfg.TRAIN.TYPE!r}")

    # scheduler
    if opt.scheduler == "plateau":
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
    elif opt.scheduler == "multi_step":
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             milestones=[83631, 111508])
    elif opt.scheduler == "step_lr":
        scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1,
                                        last_epoch=-1)
    else:
        # Bug fix: same unbound-name hazard as the optimizer above.
        raise ValueError(f"Unsupported scheduler: {opt.scheduler!r}")

    if opt.weight_path:
        opt.begin_iter = load_train_utils(opt, optimizer, scheduler)

    # lr of non-backbone parameters, for command line outputs.
    lr = optimizer.param_groups[0]['lr']

    summary_writer = Metrics(log_dir='tf_logs')
    losses_sbj = AverageMeter('Sbj loss: ', ':.2f')
    losses_obj = AverageMeter('Obj loss: ', ':.2f')
    losses_rel = AverageMeter('Rel loss: ', ':.2f')
    losses_total = AverageMeter('Total loss: ', ':.2f')
    progress = ProgressMeter(
        [losses_sbj, losses_obj, losses_rel, losses_total], prefix='Train: ')

    faster_rcnn.train()
    for step in range(opt.begin_iter, opt.max_iter):
        # Cycle the training iterator indefinitely (iteration-based loop,
        # not epoch-based).
        try:
            input_data = next(dataiterator)
        except StopIteration:
            dataiterator = iter(train_loader)
            input_data = next(dataiterator)

        images, targets = input_data
        _, metrics = faster_rcnn(images, targets)
        final_loss = metrics["loss_objectness"] + metrics["loss_rpn_box_reg"] + \
            metrics["loss_classifier"] + metrics["loss_box_reg"] + \
            metrics["loss_sbj"] + metrics["loss_obj"] + metrics["loss_rlp"]

        optimizer.zero_grad()
        final_loss.backward()
        optimizer.step()

        losses_sbj.update(metrics["loss_sbj"].item(), len(images))
        losses_obj.update(metrics["loss_obj"].item(), len(images))
        losses_rel.update(metrics["loss_rlp"].item(), len(images))
        losses_total.update(final_loss.item(), len(images))

        # Plateau scheduler steps on the validation loss (below);
        # the others step every iteration.
        if opt.scheduler != "plateau":
            scheduler.step()

        if step % 10 == 0:
            progress.display(step)

        # Periodic validation + checkpoint.
        if step % 2500 == 0:
            train_losses = {
                'total_loss': losses_total.avg,
                'sbj_loss': losses_sbj.avg,
                'obj_loss': losses_obj.avg,
                'rel_loss': losses_rel.avg,
            }
            val_losses = val_epoch(faster_rcnn, val_loader)
            if opt.scheduler == "plateau":
                scheduler.step(val_losses['total_loss'])
            lr = optimizer.param_groups[0]['lr']

            save_model(faster_rcnn, optimizer, scheduler, step)

            # write summary
            summary_writer.log_metrics(train_losses, val_losses, step, lr)
            print(
                f"* Average training loss : {train_losses['total_loss']:.3f}")
            print(
                f"* Average validation loss : {val_losses['total_loss']:.3f}")

            losses_sbj.reset()
            losses_obj.reset()
            losses_rel.reset()
            losses_total.reset()
            # Restore training mode (presumably val_epoch switched the
            # model to eval mode — confirm).
            faster_rcnn.train()
def train(ref_only_df, cons_train_df, cons_val_df, label_encoder,
          torch_transform, labelcol, batch_size, _, args, n_epochs,
          results_dir=None, add_perspective=False
          ):
    """Run multihead training with hard-negative mining.

    Builds the dataloaders, instantiates the model, optimizer and
    plateau LR scheduler from ``args``, assembles the weighted
    multihead loss, and delegates the epoch loop to
    ``hneg_train_model``.

    Returns:
        Tuple of (trained model, best validation metrics).
    """
    loaders = create_dataloaders(
        ref_only_df, cons_train_df, cons_val_df, label_encoder,
        torch_transform, labelcol, batch_size,
        add_perspective=add_perspective)

    model, device = init_mod_dev(args, label_encoder)

    # Optimizer requested on the command line; Adam is the fallback.
    if args.optimizer == 'momentum':
        # NOTE(review): despite the name, no momentum value is passed
        # to SGD here — confirm this is intentional.
        optimizer = optim.SGD(list(model.parameters()), lr=args.init_lr)
    elif args.optimizer == 'adamdelta':
        optimizer = optim.Adadelta(model.parameters(), lr=args.init_lr)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.init_lr)

    # Reduces the LR on plateaus.
    plateau_scheduler = lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=args.lr_factor, patience=args.lr_patience,
        verbose=True)

    if results_dir is None:
        results_dir = os.path.join(args.results_dir, build_logid_string(args))

    print("Starting multihead training")
    loss_weights = {
        'ce': args.ce_w,
        'arcface': args.arcface_w,
        'contrastive': args.contrastive_w,
        'triplet': args.triplet_w,
        'focal': args.focal_w,
    }
    print("loss_weights", loss_weights)

    criterion = MultiheadLoss(
        len(label_encoder.classes_),
        args.metric_margin,
        HardNegativePairSelector(),
        args.metric_margin,
        RandomNegativeTripletSelector(args.metric_margin),
        use_cosine=args.metric_evaluator_type == 'cosine',
        weights=loss_weights,
        focal_gamma=args.focal_gamma,
        use_side_labels=args.train_with_side_labels)

    # Monitor validation with the metric evaluator whenever any
    # metric-learning head carries weight; otherwise use the logits.
    metric_heads = ('triplet', 'contrastive', 'arcface')
    uses_metric_head = any(loss_weights[h] > 0 for h in metric_heads)
    val_evaluator = 'metric' if uses_metric_head else 'logit'
    print(f'Will use {val_evaluator} evaluator for validation')

    model, best_val_metrics = hneg_train_model(
        model, optimizer, plateau_scheduler, device, loaders,
        results_dir, label_encoder, criterion,
        num_epochs=n_epochs,
        earlystop_patience=3 * (args.lr_patience + 1),
        simul_sidepairs=args.metric_simul_sidepairs_eval,
        sidepairs_agg=args.sidepairs_agg,
        train_with_side_labels=args.train_with_side_labels,
        metric_evaluator_type=args.metric_evaluator_type,
        val_evaluator=val_evaluator)

    return model, best_val_metrics