def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', default=50, type=int, help='epoch number')
    parser.add_argument('-b', '--batch_size', default=64, type=int, help='mini-batch size')
    parser.add_argument('--lr', '--learning_rate', default=1e-4, type=float, help='initial learning rate')
    parser.add_argument('-c', '--continue', dest='continue_path', type=str, required=False)
    parser.add_argument('--state_dict', default=None, type=str, required=False,
                        help='state_dict when doing full training')
    parser.add_argument('--exp_name', default=config.exp_name, type=str, required=False)
    parser.add_argument('--drop_rate', default=0, type=float, required=False)
    parser.add_argument('--local', action='store_true', help='train local branch')
    args = parser.parse_args()
    print(args)

    config.exp_name = args.exp_name
    config.make_dir()
    save_args(args, config.log_dir)

    # get network
    if args.state_dict is not None:
        state_dict = torch.load(args.state_dict)
        net = fusenet()
        net.load_state_dict(state_dict)
        net.set_fcweights()
    else:
        global_branch_state = torch.load(GLOBAL_BRANCH_DIR)
        local_branch_state = torch.load(LOCAL_BRANCH_DIR)
        net = fusenet(global_branch_state, local_branch_state)
    net.to(config.device)
    sess = Session(config, net=net)

    # get dataloaders
    train_loader = get_dataloaders('train', batch_size=args.batch_size, shuffle=True)
    valid_loader = get_dataloaders('valid', batch_size=args.batch_size, shuffle=False)

    if args.continue_path and os.path.exists(args.continue_path):
        sess.load_checkpoint(args.continue_path)

    # start session
    clock = sess.clock
    tb_writer = sess.tb_writer
    sess.save_checkpoint('start.pth.tar')

    # set criterion, optimizer and scheduler
    criterion = nn.BCELoss().cuda()
    # unwrap DataParallel if the session wrapped the net, so both branches below work
    net_module = sess.net.module if hasattr(sess.net, 'module') else sess.net
    if args.local:
        # train the local branch only
        optimizer = optim.Adam(net_module.local_branch.parameters(), args.lr)
    else:
        # train the final fc layer only
        optimizer = optim.Adam(net_module.classifier.parameters(), args.lr)
    scheduler = ReduceLROnPlateau(optimizer, 'max', factor=0.1, patience=10, verbose=True)

    # start training
    for e in range(args.epochs):
        train_out = train_model(train_loader, sess.net, criterion, optimizer, clock.epoch)
        valid_out = valid_model(valid_loader, sess.net, criterion, optimizer, clock.epoch)

        tb_writer.add_scalars('loss', {'train': train_out['epoch_loss'],
                                       'valid': valid_out['epoch_loss']}, clock.epoch)
        tb_writer.add_scalars('acc', {'train': train_out['epoch_acc'],
                                      'valid': valid_out['epoch_acc']}, clock.epoch)
        tb_writer.add_scalar('auc', valid_out['epoch_auc'], clock.epoch)
        tb_writer.add_scalar('learning_rate', optimizer.param_groups[-1]['lr'], clock.epoch)

        scheduler.step(valid_out['epoch_auc'])

        if valid_out['epoch_auc'] > sess.best_val_acc:
            sess.best_val_acc = valid_out['epoch_auc']
            sess.save_checkpoint('best_model.pth.tar')
        if clock.epoch % 10 == 0:
            sess.save_checkpoint('epoch{}.pth.tar'.format(clock.epoch))
        sess.save_checkpoint('latest.pth.tar')
        clock.tock()
def train_net(model, file_path, in_seq_len, out_seq_len, pre_model, save_dir,
              batch_size, lr, log_after, cuda, device):
    print(model)
    # if os.path.exists('runs'):
    #     import shutil
    #     shutil.rmtree('runs')
    # just in case...
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if cuda:
        print('GPU')
        model.cuda(device=device)
        print('log: training started on device: {}'.format(device))
    writer = SummaryWriter()
    optimizer = Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    train_dataloader, val_dataloader, test_dataloader = get_dataloaders(
        file_path=file_path, in_seq_len=in_seq_len,
        out_seq_len=out_seq_len, batch_size=batch_size)

    i = 1
    m_loss, m_accuracy = [], []
    if pre_model:
        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        # starting point: recover the epoch number from the checkpoint name
        model_number = int(re.findall(r'\d+', str(pre_model))[0])
        i = i + model_number - 1
    else:
        print("log: let's start from the beginning...")

    while True:
        i += 1
        net_loss = []
        # new model path
        save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
        # keep only the five most recent checkpoints
        del_this = os.path.join(save_dir, 'model-{}.pt'.format(i - 6))
        if os.path.exists(del_this):
            os.remove(del_this)
            print('log: removed {}'.format(del_this))
        if i > 1 and not os.path.exists(save_path):
            torch.save(model.state_dict(), save_path)
            print('log: saved {}'.format(save_path))

        for idx, data in enumerate(train_dataloader, 1):
            model.train()  # train mode at each epoch, just in case...
            test_x, label = data['input'].unsqueeze(2), data['label'].squeeze(1)
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            out_x, h_n = model.continuous_forward(test_x, out_seq_len=out_seq_len)
            loss = criterion(out_x.view_as(label), label)
            net_loss.append(loss.item())
            if idx % log_after == 0 and idx > 0:
                print('{}. ({}/{}) image size = {}, loss = {}'.format(
                    i, idx, len(train_dataloader), out_x.size(), loss.item()))
            # three steps for backprop
            model.zero_grad()
            loss.backward()
            # perform gradient clipping between loss backward and optimizer step
            clip_grad_norm_(model.parameters(), 0.05)
            optimizer.step()

        mean_loss = np.asarray(net_loss).sum() / idx
        m_loss.append((i, mean_loss))
        writer.add_scalar(tag='train_loss', scalar_value=mean_loss, global_step=i)
        print('####################################')
        print('in_shape = {}, out_shape = {}'.format(test_x.shape, out_x.shape))
        print('epoch {} -> total loss = {:.5f}'.format(i, mean_loss))
        print('####################################')

        # validate model after each epoch
        eval_net(model=model, out_seq_len=out_seq_len, writer=writer,
                 criterion=criterion, val_loader=val_dataloader,
                 denominator=batch_size, cuda=cuda, device=device, global_step=i)
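# The save/delete pattern above recurs in the other train_net variants below; here is
# a hedged helper sketch that factors it out. Only the 'model-{}.pt' template and the
# five-checkpoint window come from the original code; the function name and signature
# are assumptions for illustration.
import os
import torch

def rotate_checkpoints(model, save_dir, epoch, keep=5, template='model-{}.pt'):
    """Save the current state_dict and drop the checkpoint that falls out of the window."""
    save_path = os.path.join(save_dir, template.format(epoch))
    if epoch > 1 and not os.path.exists(save_path):
        torch.save(model.state_dict(), save_path)
        print('log: saved {}'.format(save_path))
    del_this = os.path.join(save_dir, template.format(epoch - keep - 1))
    if os.path.exists(del_this):
        os.remove(del_this)
        print('log: removed {}'.format(del_this))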
def train_net(model, data_path, pre_model, save_dir, batch_size, lr, log_after,
              cuda, device, one_hot=False):
    if not pre_model:
        print(model)
    writer = SummaryWriter()
    if cuda:
        print('GPU')
        model.cuda(device=device)
        print('log: training started on device: {}'.format(device))
    # define loss and optimizer
    optimizer = Adam(model.parameters(), lr=lr)
    lr_final = 0.0000003
    num_epochs = 500
    # exponential decay factor that takes the lr from `lr` to `lr_final` over `num_epochs` epochs
    LR_decay = (lr_final / lr) ** (1. / num_epochs)
    scheduler = lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=LR_decay)
    criterion = nn.CrossEntropyLoss()
    train_loader, val_dataloader, test_loader = get_dataloaders(
        path_to_nparray=data_path, batch_size=batch_size, normalize=True)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    i = 1
    m_loss, m_accuracy = [], []
    if pre_model:
        model.load_state_dict(torch.load(os.path.join(save_dir, 'model-' + pre_model + '.pt')))
        print('log: resumed model {} successfully!'.format(pre_model))
        print(model)
        # starting point: `pre_model` is the epoch number of the resumed checkpoint
        model_number = int(pre_model)
        i = i + model_number - 1
    else:
        print('log: starting anew...')

    while i < num_epochs:
        i += 1
        net_loss = []
        # new model path
        save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
        # keep only the five most recent checkpoints
        del_this = os.path.join(save_dir, 'model-{}.pt'.format(i - 5))
        if os.path.exists(del_this):
            os.remove(del_this)
            print('log: removed {}'.format(del_this))
        if i > 1 and not os.path.exists(save_path):
            torch.save(model.state_dict(), save_path)
            print('log: saved {}'.format(save_path))

        correct_count, total_count = 0, 0
        for idx, data in enumerate(train_loader):
            model.train()  # train mode at each epoch, just in case...
            test_x, label = data
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model(test_x)
            loss = criterion(out_x, label)
            net_loss.append(loss.item())
            # get accuracy metric
            if one_hot:
                batch_correct = (torch.argmax(label, dim=1).eq(pred.long())).double().sum().item()
            else:
                batch_correct = (label.eq(pred.long())).double().sum().item()
            correct_count += batch_correct
            total_count += float(pred.size(0))
            if idx % log_after == 0 and idx > 0:
                print('{}. ({}/{}) image size = {}, loss = {}: accuracy = {}/{}'.format(
                    i, idx, len(train_loader), out_x.size(), loss.item(),
                    batch_correct, pred.size(0)))
            # three steps for backprop
            model.zero_grad()
            loss.backward()
            # perform gradient clipping between loss backward and optimizer step
            clip_grad_norm_(model.parameters(), 0.05)
            optimizer.step()

        # step once per epoch to decay the learning rate
        scheduler.step()
        mean_accuracy = correct_count / total_count * 100
        mean_loss = np.asarray(net_loss).mean()
        m_loss.append((i, mean_loss))
        m_accuracy.append((i, mean_accuracy))
        writer.add_scalar(tag='train loss', scalar_value=mean_loss, global_step=i)
        writer.add_scalar(tag='train over_all accuracy', scalar_value=mean_accuracy, global_step=i)
        print('####################################')
        print('epoch {} -> total loss = {:.5f}, total accuracy = {:.5f}% (lr: {})'.format(
            i, mean_loss, mean_accuracy, optimizer.param_groups[0]['lr']))
        print('####################################')

        # validate model after each epoch
        with torch.no_grad():
            eval_net(model=model, writer=writer, criterion=criterion,
                     val_loader=val_dataloader, denominator=batch_size,
                     cuda=cuda, device=device, global_step=i, one_hot=one_hot)
def train(cfg) -> None:
    learning_rate = cfg["learning_rate"]
    emb_dim = cfg["emb_dim"]
    dropout = cfg["dropout"]
    n_heads = cfg["n_heads"]
    n_encoder_layers = cfg["n_encoder_layers"]
    n_decoder_layers = cfg["n_decoder_layers"]
    dim_feedforward = cfg["dim_feedforward"]
    batch_size = cfg["batch_size"]
    validation_batch_size = cfg["validation_batch_size"]
    max_window_size = cfg["max_window_size"]
    num_workers = cfg["num_workers"]
    use_lectures = cfg["use_lectures"]
    use_prior_q_times = cfg["use_prior_q_times"]
    val_step_frequency = cfg["val_step_frequency"]
    val_size = cfg["val_size"]
    use_agg_feats = cfg["use_agg_feats"]
    use_exercise_feats = cfg["use_exercise_feats"]
    use_lgbm_feats = cfg["use_lgbm_feats"]
    concat_response_embeds = cfg["concat_response_embeds"]

    train_loader, val_loader = get_dataloaders(
        batch_size=batch_size,
        validation_batch_size=validation_batch_size,
        max_window_size=max_window_size,
        use_lectures=use_lectures,
        num_workers=num_workers,
        use_agg_feats=use_agg_feats,
    )

    # Init our model
    model = RIIDDTransformerModel(
        learning_rate=learning_rate,
        emb_dim=emb_dim,  # embedding dimension, shared by all embeddings
        dropout=dropout,
        n_heads=n_heads,
        n_encoder_layers=n_encoder_layers,
        n_decoder_layers=n_decoder_layers,
        dim_feedforward=dim_feedforward,
        max_window_size=max_window_size,
        use_prior_q_times=use_prior_q_times,
        lr_step_frequency=val_step_frequency,
        use_agg_feats=use_agg_feats,
        use_exercise_feats=use_exercise_feats,
        use_lgbm_feats=use_lgbm_feats,
        concat_response_embeds=concat_response_embeds)

    experiment_name = "concat_response_embeds"
    logger = TensorBoardLogger(f"{get_wd()}lightning_logs", name=experiment_name)

    # Initialize a trainer
    trainer = pl.Trainer(
        gpus=1,
        max_epochs=1000,
        progress_bar_refresh_rate=1,
        callbacks=[
            EarlyStopping(monitor="avg_val_auc", patience=10, mode="max"),
            ModelCheckpoint(
                monitor="avg_val_auc",
                filename="{epoch}-{val_loss_step:.2f}-{avg_val_auc:.2f}",
                mode="max",
            ),
            LearningRateMonitor(logging_interval="epoch"),
        ],
        logger=logger,
        # cap the train batches per "epoch" so validation runs every val_step_frequency steps
        limit_train_batches=val_step_frequency,
        # run through only a fraction (val_size) of the validation set each time
        limit_val_batches=val_size,
    )

    # Train the model ⚡
    trainer.fit(
        model,
        train_dataloader=train_loader,
        val_dataloaders=[val_loader],
    )

    # Test on the final, full validation set
    trainer.test(test_dataloaders=[val_loader])
# set up the handler
filehandler = logging.FileHandler(os.path.join(opt.log_path, "convert-stdout.txt"))
formatter = logging.Formatter(LOG_FORMAT)
filehandler.setFormatter(formatter)
# add a 'logfile' handler now!
logging.getLogger().addHandler(filehandler)

# set random seeds
torch.manual_seed(opt.seed)
torch.cuda.manual_seed(opt.seed)
np.random.seed(opt.seed)

# load model
model, optimizer = load_stock_model.load_model_and_optimizer(opt, reload_model=True)

# get datasets and dataloaders
train_loader, train_dataset, test_loader, test_dataset = dataset.get_dataloaders(opt)

try:
    # evaluate the model on both splits
    evaluate(opt, model, train_loader, "df-train.feather")
    evaluate(opt, model, test_loader, "df-test.feather")
except KeyboardInterrupt:
    logging.info("Evaluation got interrupted, saving log-files now.")
out_channels = in_channels
num_training_updates = 25000

num_hiddens = 128
num_residual_hiddens = 32
num_residual_layers = 2

embedding_dim = 64
num_embeddings = 512

commitment_cost = 0.25
decay = 0.99
learning_rate = 3e-4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# dset = MocapDataset()
train_loader, test_loader = get_dataloaders(batch_size=batch_size)
print('There are {} minibatches with {} batch_size in one epoch'.format(
    len(train_loader), train_loader.batch_size))

# pass the hyperparameters defined above through by name
model = Model(in_channels, out_channels,
              num_hiddens=num_hiddens,
              num_residual_layers=num_residual_layers,
              num_residual_hiddens=num_residual_hiddens,
              embedding_dim=embedding_dim,
              num_embeddings=num_embeddings,
              commitment_cost=commitment_cost,
              decay=decay).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True)
loss_function = nn.MSELoss()
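# A minimal sketch of the update loop these objects feed. Nothing below is from the
# original file: it assumes `model(batch)` returns (vq_loss, reconstruction), as in
# common VQ-VAE reference implementations, and that `train_loader` yields plain tensors.
import itertools

model.train()
data_iter = itertools.cycle(train_loader)
for step in range(num_training_updates):
    batch = next(data_iter).to(device)     # assumes the loader yields raw tensors
    vq_loss, recon = model(batch)          # assumed forward signature
    loss = loss_function(recon, batch) + vq_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (step + 1) % 1000 == 0:
        print('step {} -> loss = {:.5f}'.format(step + 1, loss.item()))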
def eval_net(**kwargs):
    model = kwargs['model']
    cuda = kwargs['cuda']
    device = kwargs['device']
    if cuda:
        model.cuda(device=device)
    if 'criterion' in kwargs.keys():
        # validation mode, called from the training loop
        writer = kwargs['writer']
        val_loader = kwargs['val_loader']
        criterion = kwargs['criterion']
        global_step = kwargs['global_step']
        correct_count, total_count = 0, 0
        net_loss = []
        model.eval()  # put in eval mode first
        print('evaluating with batch size = 1')
        for idx, data in enumerate(val_loader):
            test_x, label = data['input'], data['label']
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model.forward(test_x)
            loss = criterion(out_x, label)
            net_loss.append(loss.item())
            # get accuracy metric
            batch_correct = (label.eq(pred.long())).double().sum().item()
            correct_count += batch_correct
            total_count += float(pred.size(0))
        mean_accuracy = correct_count / total_count * 100
        mean_loss = np.asarray(net_loss).mean()
        # summarize mean accuracy and loss
        writer.add_scalar(tag='val. loss', scalar_value=mean_loss, global_step=global_step)
        writer.add_scalar(tag='val. over_all accuracy', scalar_value=mean_accuracy,
                          global_step=global_step)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: validation:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(
            mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
    else:
        # standalone test mode: model, base_folder, pre_model, batch_size, log_after, cuda
        pre_model = kwargs['pre_model']
        base_folder = kwargs['base_folder']
        batch_size = kwargs['batch_size']
        log_after = kwargs['log_after']
        criterion = nn.CrossEntropyLoss()
        un_confusion_meter = tnt.meter.ConfusionMeter(10, normalized=False)
        confusion_meter = tnt.meter.ConfusionMeter(10, normalized=True)
        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        _, _, test_loader = get_dataloaders(base_folder=base_folder, batch_size=batch_size)
        net_accuracy, net_loss = [], []
        correct_count = 0
        total_count = 0
        print('batch size = {}'.format(batch_size))
        model.eval()  # put in eval mode first
        for idx, data in enumerate(test_loader):
            test_x, label = data['input'], data['label']
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model.forward(test_x)
            loss = criterion(out_x, label)
            un_confusion_meter.add(predicted=pred, target=label)
            confusion_meter.add(predicted=pred, target=label)
            # get accuracy metric
            batch_correct = (label.eq(pred.long())).sum().item()
            correct_count += batch_correct
            total_count += float(batch_size)
            net_loss.append(loss.item())
            if idx % log_after == 0:
                print('log: on {}'.format(idx))
        mean_loss = np.asarray(net_loss).mean()
        mean_accuracy = correct_count * 100 / total_count
        print(correct_count, total_count)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: test:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(
            mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        # persist both confusion matrices for later inspection
        with open('normalized.pkl', 'wb') as this:
            pkl.dump(confusion_meter.value(), this, protocol=pkl.HIGHEST_PROTOCOL)
        with open('un_normalized.pkl', 'wb') as this:
            pkl.dump(un_confusion_meter.value(), this, protocol=pkl.HIGHEST_PROTOCOL)
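# Hedged companion snippet (not in the original file): the two pickles written above
# can be read back like this for offline inspection.
import pickle as pkl

with open('normalized.pkl', 'rb') as f:
    normalized_cm = pkl.load(f)
with open('un_normalized.pkl', 'rb') as f:
    raw_cm = pkl.load(f)
print('normalized confusion matrix:\n', normalized_cm)
print('raw counts:\n', raw_cm)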
parser.add_argument('--progressbar', action='store_true', default=False,
                    help='Show progress bar during train/test.')
parser.add_argument('--evaluate', action='store_true', default=False,
                    help='Evaluation only, using 25 clips per video')

##### Read in parameters
opt = parser.parse_args()
opt.multiple_clips = False
opt.kernels = multiprocessing.cpu_count()

"""=================================DATALOADER SETUPS====================="""
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    opt.bs = opt.bs * torch.cuda.device_count()
    print('Total batch size: %d' % opt.bs)
dataloaders = dataset.get_dataloaders(opt)
if not opt.evaluate:
    opt.n_classes = dataloaders['training'][0].dataset.class_embed.shape[0]
else:
    opt.n_classes = dataloaders['testing'][0].dataset.class_embed.shape[0]

"""=================================OUTPUT FOLDER====================="""
opt.savename = opt.save_path + '/'
if not opt.evaluate:
    opt.savename += '%s/CLIP%d_LR%f_%s_BS%d' % (
        opt.dataset, opt.clip_len, opt.lr, opt.network, opt.bs)
    if opt.class_overlap > 0:
        opt.savename += '_CLASSOVERLAP%.2f' % opt.class_overlap
def main(args):
    set_seed(SEED)
    train_transforms, test_transforms = get_transforms(args.dataset)
    print(f"Data transformations:\n{train_transforms}\n")

    # Get the dataloaders
    train_loader, test_loader = get_dataloaders(args.dataset, args.batch_size,
                                                args.workers, train_transforms,
                                                test_transforms)

    # Architecture
    if args.dataset == 'mnist':
        in_channels = 1
    else:
        raise NotImplementedError()
    if args.activation == 'relu':
        activation = nn.ReLU(inplace=True)
    else:
        raise NotImplementedError()
    if args.pooling == 'max':
        pooling = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
    else:
        raise NotImplementedError()
    drop_rate = args.drop_rate

    # Build model
    model = LeNet5(in_channels, activation, pooling, drop_rate)
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
        model = model.cuda()

    # Weight normal initialization
    if args.init_weights:
        model.apply(normal_initialization)

    # Loss function & optimizer
    if args.criterion == 'ce':
        criterion = nn.CrossEntropyLoss()
    else:
        raise NotImplementedError()
    if args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay, nesterov=args.nesterov)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise NotImplementedError()
    scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=0,
                                  threshold=1e-2, verbose=True)

    # Resume only after the optimizer exists, since its state is restored too
    start_epoch = 0
    if args.resume is not None:
        model, optimizer, start_epoch = load_training_state(model, optimizer, args.resume)

    # Output folder
    output_folder = os.path.join(args.output_folder, args.training_name)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    log_path = os.path.join(args.output_folder, 'logs', args.training_name)
    if os.path.exists(log_path):
        rmtree(log_path)
    logger = SummaryWriter(log_path)

    # Train
    best_loss = math.inf
    mb = master_bar(range(args.nb_epochs))
    for epoch_idx in mb:
        # Training
        train_epoch(model, train_loader, optimizer, criterion, mb,
                    tb_logger=logger, epoch=start_epoch + epoch_idx)

        # Evaluation
        val_loss, accuracy = evaluate(model, test_loader, criterion)

        mb.first_bar.comment = f"Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs}"
        mb.write(f'Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs} - '
                 f'Validation loss: {val_loss:.4} (Acc@1: {accuracy:.2%})')

        # State saving
        if val_loss < best_loss:
            print(f"Validation loss decreased {best_loss:.4} --> {val_loss:.4}: saving state...")
            best_loss = val_loss
            torch.save(dict(epoch=start_epoch + epoch_idx,
                            model_state_dict=model.state_dict(),
                            optimizer_state_dict=optimizer.state_dict(),
                            val_loss=val_loss),
                       os.path.join(output_folder, "training_state.pth"))

        if logger is not None:
            current_iter = (start_epoch + epoch_idx + 1) * len(train_loader)
            logger.add_scalar("Validation loss", val_loss, current_iter)
            logger.add_scalar("Error rate", 1 - accuracy, current_iter)
            logger.flush()

        scheduler.step(val_loss)
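# `load_training_state` is referenced above but not defined in this file. Here is a
# minimal sketch consistent with the checkpoint dict saved in the training loop; the
# return convention (model, optimizer, start_epoch) is inferred from the call site
# and everything else is an assumption.
def load_training_state(model, optimizer, checkpoint_path):
    """Restore model/optimizer state and return the epoch to resume from."""
    state = torch.load(checkpoint_path)
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    start_epoch = state['epoch'] + 1  # resume after the last completed epoch
    return model, optimizer, start_epoch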
from common import *
from dataset import get_dataloaders
from model import MURA_Net
from train import train_model
import os

dataloaders, dataset_sizes = get_dataloaders(
    study_name='XR_HUMERUS',
    data_dir='MURA-v1.0',
    batch_size=50,
    batch_eval_ten=15,
    shuffle=True
)
print(dataset_sizes)

model = MURA_Net()
model = model.to(device)
# model.load_state_dict(torch.load('models/model_XR_WRIST.pth'))

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                       patience=1, verbose=True)
model = train_model(model, optimizer, dataloaders, scheduler, dataset_sizes, 500)
# torch.save(model.state_dict(), 'models/model_hand_auc.pth')
def main():
    start_time = time.time()
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    torch.cuda.set_device(args.gpu)
    reproducibility(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    model = Network(args.model_name, CIFAR_CLASSES, sub_policies, args.use_cuda,
                    args.use_parallel, temperature=args.temperature, criterion=criterion)
    # model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                momentum=args.momentum, weight_decay=args.weight_decay)

    # train/valid queues over the dataset with the candidate augmentation sub-policies
    train_queue, valid_queue = get_dataloaders(
        args.dataset, args.batch_size, args.num_workers, args.dataroot,
        sub_policies, model.magnitudes, args.cutout, args.cutout_length,
        split=args.train_portion, split_idx=0, target_lb=-1)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    architect = Architect(model, args)

    for epoch in range(args.epochs):
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        genotype = model.genotype()
        print_genotype(genotype)
        probs = model.ops_weights
        logging.info('%s' % str(torch.nn.functional.softmax(probs, dim=-1)))
        logging.info('%s' % str(model.probabilities.clamp(0, 1)))
        logging.info('%s' % str(model.magnitudes.clamp(0, 1)))

        # training
        train_acc, train_obj = train(train_queue, valid_queue, model, architect,
                                     criterion, optimizer, lr)
        logging.info('train_acc %f', train_acc)

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)

        # step the LR schedule once per epoch, after the optimizer updates
        scheduler.step()

        utils.save(model, os.path.join(args.save, 'weights.pt'))

    end_time = time.time()
    elapsed = end_time - start_time
    logging.info('elapsed time: %.3f Hours' % (elapsed / 3600.))
def train(cfg: DictConfig) -> None:
    # Determine device (GPU, CPU, etc.)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Model
    model = get_network(cfg)

    # Data Loaders
    train_loader, val_loader = get_dataloaders(cfg, num_workers=cfg.data_loader_workers)

    # Your training loop
    trainer = create_training_loop(model, cfg, "trainer", device=device)
    # Your evaluation loop
    evaluator = create_evaluation_loop(model, cfg, "evaluator", device=device)

    ld = LogDirector(cfg, engines=[trainer, evaluator])

    ########################################################################
    # Logging Callbacks
    ########################################################################

    # Helper to run the evaluation loop
    def run_evaluator():
        evaluator.run(val_loader)
        return evaluator  # NOTE: Must return the engine we want to log from

    ld.set_event_handlers(
        trainer,
        Events.ITERATION_COMPLETED(every=50),
        EngineStateAttr.OUTPUT,
        [
            (LOG_OP.SAVE_IMAGE, ["im"]),  # Save images to a folder
            (LOG_OP.LOG_MESSAGE, ["nll"]),  # Log fields as messages in the logfile
            (LOG_OP.SAVE_IN_DATA_FILE, ["nll"]),  # Log fields as separate data files
            (
                LOG_OP.NUMBER_TO_VISDOM,
                [
                    # First plot, key is "p1"
                    VisPlot(
                        var_name="nll",
                        plot_key="p1",
                        split="nll_1",
                        # Any opts that Visdom supports
                        opts={"title": "Plot 1", "xlabel": "Iters", "fillarea": True},
                    ),
                    VisPlot(var_name="nll_2", plot_key="p1", split="nll_2"),
                ],
            ),
            (
                LOG_OP.IMAGE_TO_VISDOM,
                [
                    VisImg(
                        var_name="im",
                        img_key="1",
                        env="images",
                        opts={"caption": "a current image", "title": "title"},
                    ),
                    VisImg(
                        var_name="im",
                        img_key="2",
                        env="images",
                        opts={"caption": "a current image", "title": "title"},
                    ),
                ],
            ),
        ],
    )

    ld.set_event_handlers(
        trainer,
        Events.EPOCH_COMPLETED,
        EngineStateAttr.METRICS,
        [
            (LOG_OP.LOG_MESSAGE, ["nll", "accuracy"]),  # Log fields as messages in the logfile
            (LOG_OP.SAVE_IN_DATA_FILE, ["accuracy"]),  # Log fields as separate data files
            (
                LOG_OP.NUMBER_TO_VISDOM,
                [
                    VisPlot(
                        var_name="accuracy",
                        plot_key="p3",
                        split="acc",
                        opts={"title": "Eval Acc", "xlabel": "Iters"},
                    ),
                    VisPlot(
                        var_name="nll",
                        plot_key="p4",
                        split="nll",
                        opts={"title": "Eval Nll", "xlabel": "Iters", "fillarea": True},
                    ),
                ],
            ),
        ],
        # Run the evaluation loop first, then log from the returned engine
        pre_op=run_evaluator,
    )

    # Execute training
    trainer.run(train_loader, max_epochs=cfg.mode.train.max_epochs)
def eval_net(**kwargs):
    cuda = kwargs['cuda']
    device = kwargs['device']
    model = kwargs['model']
    model.eval()
    if cuda:
        model.cuda(device=device)
    if 'writer' in kwargs.keys():
        # validation mode, called from the training loop
        num_classes = kwargs['num_classes']
        batch_size = kwargs['batch_size']
        writer = kwargs['writer']
        step = kwargs['step']
        denominator = kwargs['denominator']
        val_loader = kwargs['val_loader']
        criterion = kwargs['criterion']
        net_accuracy, net_loss = [], []
        for idx, data in enumerate(val_loader):
            test_x, label = data['input'], data['label']
            test_x = test_x.cuda() if cuda else test_x
            # forward
            out_x, pred = model.forward(test_x)
            pred = pred.cpu()
            loss = criterion(out_x.cpu(), label)
            # get accuracy metric over all pixels (64x64 maps)
            accuracy = (pred == label).sum()
            accuracy = accuracy * 100 / (test_x.size(0) * 64 ** 2)
            net_accuracy.append(accuracy)
            net_loss.append(loss.item())
        mean_accuracy = np.asarray(net_accuracy).mean()
        mean_loss = np.asarray(net_loss).mean()
        writer.add_scalar(tag='eval accuracy', scalar_value=mean_accuracy, global_step=step)
        writer.add_scalar(tag='eval loss', scalar_value=mean_loss, global_step=step)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: validation:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(
            mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
    else:
        # standalone test mode: model, images, labels, pre_model, batch_size, cuda
        pre_model = kwargs['pre_model']
        images = kwargs['images']
        labels = kwargs['labels']
        batch_size = kwargs['batch_size']
        criterion = nn.CrossEntropyLoss()
        cm = CM(k=7, normalized=True)
        preds, labs = torch.Tensor().long(), torch.Tensor().long()
        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        _, _, test_loader = get_dataloaders(images_path=images, labels_path=labels,
                                            batch_size=batch_size)
        num_classes = 7
        net_accuracy, net_loss = [], []
        # one list of per-batch scores per class
        net_class_accuracy = [[] for _ in range(num_classes)]
        for idx, data in enumerate(test_loader):
            if idx == 400:
                break
            test_x, label = data['input'], data['label']
            if cuda:
                test_x = test_x.cuda()
            # forward
            out_x, pred = model.forward(test_x)
            pred = pred.cpu()
            loss = criterion(out_x.cpu(), label)
            # get accuracy metric
            accuracy = (pred == label).sum()
            accuracy = accuracy * 100 / (pred.view(-1).size(0))
            net_accuracy.append(accuracy)
            net_loss.append(loss.item())
            if idx % 10 == 0:
                print('log: on {}'.format(idx))
            # per-class metrics: element-wise agreement of the binary masks,
            # so true negatives count toward the score
            for j in range(num_classes):
                class_pred = (pred == j)
                class_label = (label == j)
                class_accuracy = (class_pred == class_label).sum()
                class_accuracy = class_accuracy * 100 / (pred.view(-1).size(0))
                net_class_accuracy[j].append(class_accuracy)
            preds = torch.cat((preds, pred.long().view(-1)))
            labs = torch.cat((labs, label.long().view(-1)))
        mean_accuracy = np.asarray(net_accuracy).mean()
        mean_loss = np.asarray(net_loss).mean()
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: test:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(
            mean_loss, mean_accuracy))
        for j in range(num_classes):
            print('log: class {}:: total accuracy = {:.5f}%'.format(
                j, np.asarray(net_class_accuracy[j]).mean()))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        # class_names = ['background/clutter', 'buildings', 'trees', 'cars',
        #                'low_vegetation', 'impervious_surfaces', 'noise']
        # cnf_matrix = confusion_matrix(labs.numpy(), preds.numpy())
        # fig1 = plt.figure()
        # plot_confusion_matrix(cnf_matrix, classes=class_names,
        #                       title='Confusion matrix, without normalization')
        # fig2 = plt.figure()
        # plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
        #                       title='Normalized confusion matrix')
        # fig2img(fig1).save('unnormalized.png')
        # fig2img(fig2).save('normalized.png')
        # cm.add(preds.view(-1), labs.view(-1).type(torch.LongTensor))
        # df_cm = pd.DataFrame(cm.value(), index=class_names, columns=class_names)
        # fig = plt.figure(figsize=(10, 7))
        # sn.heatmap(df_cm, annot=True)
        # fig2img(fig).save('sea.png')
def build_datasets(self):
    args = self.args
    if args.watershed:
        self.train_loader, self.val_loader = dataset.get_dataloaders(
            args.batch_size, augment=True, skip_no_lenses_frames=False,
            watershed_endpoints=WATERSHED_ENDPOINTS)
    else:
        self.train_loader, self.val_loader = dataset.get_classifier_dataloaders(
            args.batch_size, augment=True)
def train_net(model, images, labels, pre_model, save_dir, sum_dir, batch_size,
              lr, log_after, cuda, device):
    print(model)
    if cuda:
        print('GPU')
        model.cuda(device=device)
    # define loss and optimizer
    optimizer = RMSprop(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    train_loader, val_dataloader, test_loader = get_dataloaders(
        images_path=images, labels_path=labels, batch_size=batch_size)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not os.path.exists(sum_dir):
        os.mkdir(sum_dir)
    writer = SummaryWriter()

    i = 0
    m_loss, m_accuracy = [], []
    num_classes = 7
    if pre_model:
        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        model_number = int(pre_model.split('/')[1].split('-')[1].split('.')[0])
    else:
        print('log: starting anew...')
        model_number = 1  # so the checkpoint bookkeeping below is always defined

    while True:
        i += 1
        net_loss = []
        net_accuracy = []
        if not pre_model:
            save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
        else:
            save_path = os.path.join(save_dir, 'model-{}.pt'.format(i + model_number - 1))
        if i > 1 and not os.path.exists(save_path):
            torch.save(model.state_dict(), save_path)
            # keep only the five most recent checkpoints
            del_this = os.path.join(save_dir, 'model-{}.pt'.format(i + model_number - 6))
            if os.path.exists(del_this):
                os.remove(del_this)
                print('log: removed {}'.format(del_this))
            print('log: saved {}'.format(save_path))

        list_of_pred = []
        list_of_labels = []
        for idx, data in enumerate(train_loader):
            model.train()
            test_x, label = data['input'], data['label']
            image0 = test_x[0]
            test_x = test_x.cuda(device=device) if cuda else test_x
            size = test_x.size(-1)
            # forward
            out_x, pred = model.forward(test_x)
            pred = pred.cpu()
            out_x = out_x.cpu()
            image1 = pred[0]
            image2 = label[0]
            if idx % (len(train_loader) // 2) == 0:  # integer division for a valid modulo
                writer.add_image('input', image0, i)
                writer.add_image('pred', image1, i)
                writer.add_image('label', image2, i)
            loss = criterion(out_x, label)
            # get accuracy metric
            accuracy = (pred == label).sum()
            # also convert into np arrays to be used for the confusion matrix
            list_of_pred.append(pred.numpy())
            list_of_labels.append(label.numpy())
            writer.add_scalar(tag='loss', scalar_value=loss.item(), global_step=i)
            writer.add_scalar(tag='over_all accuracy',
                              scalar_value=accuracy * 100 / (test_x.size(0) * size ** 2),
                              global_step=i)
            # per class accuracies
            avg = []
            for j in range(num_classes):
                class_pred = (pred == j)
                class_label = (label == j)
                class_accuracy = (class_pred == class_label).sum()
                class_accuracy = class_accuracy * 100 / (test_x.size(0) * size ** 2)
                avg.append(class_accuracy)
                writer.add_scalar(tag='class_{} accuracy'.format(j),
                                  scalar_value=class_accuracy, global_step=i)
            classes_avg_acc = np.asarray(avg).mean()
            writer.add_scalar(tag='classes avg. accuracy',
                              scalar_value=classes_avg_acc, global_step=i)
            if idx % log_after == 0 and idx > 0:
                print('{}. ({}/{}) image size = {}, loss = {}: accuracy = {}/{}'.format(
                    i, idx, len(train_loader), out_x.size(), loss.item(),
                    accuracy, test_x.size(0) * size ** 2))
            # three steps for backprop
            model.zero_grad()
            loss.backward()
            # perform gradient clipping between loss backward and optimizer step
            clip_grad_norm_(model.parameters(), 0.05)
            optimizer.step()
            accuracy = accuracy * 100 / (test_x.size(0) * size ** 2)
            net_accuracy.append(accuracy)
            net_loss.append(loss.item())

        mean_accuracy = np.asarray(net_accuracy).mean()
        mean_loss = np.asarray(net_loss).mean()
        m_loss.append((i, mean_loss))
        m_accuracy.append((i, mean_accuracy))
        print('####################################')
        print('epoch {} -> total loss = {:.5f}, total accuracy = {:.5f}%'.format(
            i, mean_loss, mean_accuracy))
        print('####################################')

        # # one epoch complete, get a new confusion matrix
        # cm_preds = np.vstack(list_of_pred).reshape(-1)
        # cm_labels = np.vstack(list_of_labels).reshape(-1)
        # cnf_matrix = confusion_matrix(cm_labels, cm_preds)
        # fig1 = plt.figure()
        # plot_confusion_matrix(cnf_matrix, classes=class_names,
        #                       title='Confusion matrix, without normalization')
        # fig2 = plt.figure()
        # plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
        #                       title='Normalized confusion matrix')

        # validate model every ten epochs
        if i % 10 == 0:
            eval_net(model=model, criterion=criterion, val_loader=val_dataloader,
                     denominator=batch_size * size ** 2, cuda=cuda, device=device,
                     writer=writer, num_classes=num_classes, batch_size=batch_size,
                     step=i)

    # note: the loop above runs until interrupted, so these lines are only reached
    # if a break is added
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
                    default='results', type=str,
                    help='parent directory to write result')
parser.add_argument('--phase', default='test', type=str, choices=['valid', 'test'])
parser.add_argument('-b', '--batch_size', default=16, type=int, help='mini-batch size')
args = parser.parse_args()

dataloader = get_dataloaders(args.phase, batch_size=args.batch_size,
                             shuffle=False, data_dir=args.data_dir)

total_scores = None  # voting scores
labels = None
st_corrects = {st: 0 for st in config.study_type}
nr_stype = {st: 0 for st in config.study_type}

for j in range(len(model_list)):
    print('single model ' + str(j), model_list[j])
    if 'fuse' in model_list[j]:
        state_dict = torch.load(model_list[j])['state_dict']
        net = fusenet()
        net.load_state_dict(state_dict)
        net.set_fcweights()
        net = torch.nn.DataParallel(net).cuda()
parser.add_argument('--img_type', default='ALL', type=str, required=False,
                    choices=['ELBOW', 'FINGER', 'FOREARM', 'HAND', 'HUMERUS',
                             'SHOULDER', 'WRIST', 'ALL'],
                    help='type of query input')
args = parser.parse_args()

net = torch.load(args.model_path)['net']
if args.generate:
    dataloader = get_dataloaders('train', batch_size=args.batch_size, shuffle=False)
    generate_database(net, dataloader, args.save_dir)

database = h5py.File(args.database_path, 'r')
image = Image.open(args.img_path).convert('RGB')
top5 = retrieval(image, net, database, args.img_type)
print("The most similar five are:")
for path in top5:
    print(path.item())
    # paths are stored as bytes, so str(...)[2:-1] strips the b'...' wrapper
    path = os.path.join(args.data_dir, str(path.item())[2:-1])
        target = torch.masked_select(target, target_mask)
        loss = nn.BCEWithLogitsLoss()(output.float(), target.float())
        return {"loss": loss, "output": output, "target": target}

    def validation_step(self, batch, batch_idx):
        inputs, target_ids, target = batch
        output = self(inputs["input_ids"], inputs["input_cat"],
                      target_ids, inputs["input_rtime"])
        target_mask = (target_ids != 0)
        output = torch.masked_select(output.squeeze(), target_mask)
        target = torch.masked_select(target, target_mask)
        loss = nn.BCEWithLogitsLoss()(output.float(), target.float())
        return {"val_loss": loss, "output": output, "target": target}


train_loader, val_loader = get_dataloaders()

ARGS = {
    "n_dims": config.EMBED_DIMS,
    "n_encoder": config.NUM_ENCODER,
    "n_decoder": config.NUM_DECODER,
    "enc_heads": config.ENC_HEADS,
    "dec_heads": config.DEC_HEADS,
    "total_ex": config.TOTAL_EXE,
    "total_cat": config.TOTAL_CAT,
    "total_responses": config.TOTAL_EXE,
    "seq_len": config.MAX_SEQ,
}

########### TRAINING AND SAVING MODEL #######
checkpoint = ModelCheckpoint(filename="{epoch}_model",
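# Hedged sketch (not from the original): how ARGS and the loaders above would
# typically be wired up, assuming the truncated ModelCheckpoint(...) call is completed
# and that the LightningModule defined above is named `SAINTModel` (a hypothetical
# name, as is the max_epochs value).
model = SAINTModel(**ARGS)  # hypothetical class taking the keys in ARGS
trainer = pl.Trainer(gpus=1, max_epochs=10, callbacks=[checkpoint])
trainer.fit(model, train_dataloader=train_loader, val_dataloaders=[val_loader])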
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', default=100, type=int, help='epoch number')
    parser.add_argument('--start_epoch', default=0, type=int, help='start epoch number')
    parser.add_argument('-b', '--batch_size', default=8, type=int, help='mini-batch size')
    parser.add_argument('--lr', '--learning_rate', default=1e-4, type=float,
                        help='initial learning rate')
    parser.add_argument('--weight-decay', default=0.0, type=float, help='weight decay')
    parser.add_argument('-c', '--continue', dest='continue_path', type=str, required=False)
    parser.add_argument('--exp_name', default=config.exp_name, type=str, required=False)
    args = parser.parse_args()
    print(args)

    config.exp_name = args.exp_name
    config.make_dir()
    save_args(args, config.log_dir)

    net = network()
    net = torch.nn.DataParallel(net).cuda()
    sess = Session(config, net=net)

    train_loader = get_dataloaders(os.path.join(config.data_dir, 'train.json'),
                                   batch_size=args.batch_size, shuffle=True)
    # validation data does not need shuffling
    valid_loader = get_dataloaders(os.path.join(config.data_dir, 'val.json'),
                                   batch_size=args.batch_size, shuffle=False)

    if args.continue_path and os.path.exists(args.continue_path):
        sess.load_checkpoint(args.continue_path)

    clock = sess.clock
    tb_writer = sess.tb_writer

    criterion = nn.L1Loss().cuda()
    optimizer = optim.Adam(sess.net.parameters(), args.lr, weight_decay=args.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=10, verbose=True)

    for e in range(args.epochs):
        train_model(train_loader, sess.net, criterion, optimizer, clock.epoch, tb_writer)
        valid_out = valid_model(valid_loader, sess.net, criterion, clock.epoch, tb_writer)

        tb_writer.add_scalar('train/learning_rate', optimizer.param_groups[-1]['lr'], clock.epoch)
        scheduler.step(valid_out['epoch_loss'])

        if valid_out['epoch_loss'] < sess.best_val_loss:
            sess.best_val_loss = valid_out['epoch_loss']
            sess.save_checkpoint('best_model.pth.tar')
        if clock.epoch % 10 == 0:
            sess.save_checkpoint('epoch{}.pth.tar'.format(clock.epoch))
        sess.save_checkpoint('latest.pth.tar')
        clock.tock()
def train_net(model, base_folder, pre_model, save_dir, batch_size, lr, log_after,
              cuda, device):
    if not pre_model:
        print(model)
    writer = SummaryWriter()
    if cuda:
        print('GPU')
        model.cuda(device=device)
        print('log: training started on device: {}'.format(device))
    # define loss and optimizer
    optimizer = Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    train_loader, val_dataloader, test_loader = get_dataloaders(
        base_folder=base_folder, batch_size=batch_size)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    i = 1
    m_loss, m_accuracy = [], []
    if pre_model:
        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        print(model)
        # starting point: recover the epoch number from the checkpoint name
        model_number = int(re.findall(r'\d+', str(pre_model))[0])
        i = i + model_number - 1
    else:
        print('log: starting anew using ImageNet weights...')

    while True:
        i += 1
        net_loss = []
        # new model path
        save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
        # keep only the five most recent checkpoints
        del_this = os.path.join(save_dir, 'model-{}.pt'.format(i - 6))
        if os.path.exists(del_this):
            os.remove(del_this)
            print('log: removed {}'.format(del_this))
        if i > 1 and not os.path.exists(save_path):
            torch.save(model.state_dict(), save_path)
            print('log: saved {}'.format(save_path))

        correct_count, total_count = 0, 0
        for idx, data in enumerate(train_loader):
            model.train()  # train mode at each epoch, just in case...
            test_x, label = data['input'], data['label']
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model.forward(test_x)
            loss = criterion(out_x, label)
            net_loss.append(loss.item())
            # get accuracy metric
            batch_correct = (label.eq(pred.long())).double().sum().item()
            correct_count += batch_correct
            total_count += float(pred.size(0))
            if idx % log_after == 0 and idx > 0:
                print('{}. ({}/{}) image size = {}, loss = {}: accuracy = {}/{}'.format(
                    i, idx, len(train_loader), out_x.size(), loss.item(),
                    batch_correct, pred.size(0)))
            # three steps for backprop
            model.zero_grad()
            loss.backward()
            # perform gradient clipping between loss backward and optimizer step
            clip_grad_norm_(model.parameters(), 0.05)
            optimizer.step()

        mean_accuracy = correct_count / total_count * 100
        mean_loss = np.asarray(net_loss).mean()
        m_loss.append((i, mean_loss))
        m_accuracy.append((i, mean_accuracy))
        writer.add_scalar(tag='train loss', scalar_value=mean_loss, global_step=i)
        writer.add_scalar(tag='train over_all accuracy', scalar_value=mean_accuracy, global_step=i)
        print('####################################')
        print('epoch {} -> total loss = {:.5f}, total accuracy = {:.5f}%'.format(
            i, mean_loss, mean_accuracy))
        print('####################################')

        # validate model after each epoch
        eval_net(model=model, writer=writer, criterion=criterion,
                 val_loader=val_dataloader, denominator=batch_size,
                 cuda=cuda, device=device, global_step=i)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('-dd', '--data-dir', type=str, default='data', help='Data directory')
    parser.add_argument('-l', '--loss', type=str, default='label_smooth_cross_entropy')
    parser.add_argument('-t1', '--temper1', type=float, default=0.2)
    parser.add_argument('-t2', '--temper2', type=float, default=4.0)
    parser.add_argument('-optim', '--optimizer', type=str, default='adam')
    parser.add_argument('-prep', '--prep_function', type=str, default='none')
    parser.add_argument('--train_on_different_datasets', action='store_true')
    parser.add_argument('--use-current', action='store_true')
    parser.add_argument('--use-extra', action='store_true')
    parser.add_argument('--use-unlabeled', action='store_true')
    parser.add_argument('--fast', action='store_true')
    parser.add_argument('--mixup', action='store_true')
    parser.add_argument('--balance', action='store_true')
    parser.add_argument('--balance-datasets', action='store_true')
    parser.add_argument('--show', action='store_true')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('-m', '--model', type=str, default='efficientnet-b4')
    parser.add_argument('-b', '--batch-size', type=int, default=8, help='Batch size during training, e.g. -b 64')
    parser.add_argument('-e', '--epochs', type=int, default=100, help='Epochs to run')
    parser.add_argument('-s', '--sizes', default=380, type=int, help='Image size for training & inference')
    parser.add_argument('-f', '--fold', type=int, default=None)
    parser.add_argument('-t', '--transfer', default=None, type=str)
    parser.add_argument('-lr', '--learning_rate', type=float, default=1e-4, help='Initial learning rate')
    parser.add_argument('-a', '--augmentations', default='medium', type=str)
    parser.add_argument('-accum', '--accum-step', type=int, default=1)
    parser.add_argument('-metric', '--metric', type=str, default='accuracy01')
    args = parser.parse_args()

    diff_dataset_train = args.train_on_different_datasets
    data_dir = args.data_dir
    epochs = args.epochs
    batch_size = args.batch_size
    seed = args.seed
    loss_name = args.loss
    optim_name = args.optimizer
    prep_function = args.prep_function
    model_name = args.model
    size = args.sizes  # note: no trailing comma -- a stray comma here would make `size` a 1-tuple
    image_size = (size, size)
    print(image_size)
    fast = args.fast
    fold = args.fold
    mixup = args.mixup
    balance = args.balance
    balance_datasets = args.balance_datasets
    show_batches = args.show
    verbose = args.verbose
    use_current = args.use_current
    use_extra = args.use_extra
    use_unlabeled = args.use_unlabeled
    learning_rate = args.learning_rate
    augmentations = args.augmentations
    transfer = args.transfer
    accum_step = args.accum_step
    main_metric = args.metric  # e.g. 'cosine_loss' or 'accuracy01'
    print(data_dir)
    num_classes = 5
    assert use_current or use_extra
    print(fold)

    current_time = datetime.now().strftime('%b%d_%H_%M')
    random_name = get_random_name()
    torch.cuda.empty_cache()

    checkpoint_prefix = f'{model_name}_{size}_{augmentations}'
    if transfer is not None:
        checkpoint_prefix += '_pretrain_from_' + str(transfer)
    else:
        if use_current:
            checkpoint_prefix += '_current'
        if use_extra:
            checkpoint_prefix += '_extra'
        if use_unlabeled:
            checkpoint_prefix += '_unlabeled'
        if fold is not None:
            checkpoint_prefix += f'_fold{fold}'
    directory_prefix = f'{current_time}_{checkpoint_prefix}'
    log_dir = os.path.join('runs', directory_prefix)
    os.makedirs(log_dir, exist_ok=False)

    set_manual_seed(seed)
    model = get_model(model_name)
    if transfer is not None:
        print('Transferring weights from model checkpoint')
        model.load_state_dict(torch.load(transfer)['model_state_dict'])
    model = model.cuda()

    if diff_dataset_train:
        train_on = ['current_train', 'extra_train']
        valid_on = ['unlabeled']
        train_ds, valid_ds, train_sizes = get_datasets_universal(
            train_on=train_on, valid_on=valid_on, image_size=image_size,
            augmentation=augmentations, target_dtype=int,
            prep_function=prep_function)
    else:
        train_ds, valid_ds, train_sizes = get_datasets(
            data_dir=data_dir, use_current=use_current, use_extra=use_extra,
            image_size=image_size, prep_function=prep_function,
            augmentation=augmentations, target_dtype=int, fold=fold, folds=5)

    # pass the parsed balance flags through instead of hard-coding them
    train_loader, valid_loader = get_dataloaders(
        train_ds, valid_ds, batch_size=batch_size, train_sizes=train_sizes,
        num_workers=6, balance=balance, balance_datasets=balance_datasets,
        balance_unlabeled=False)
    loaders = collections.OrderedDict()
    loaders['train'] = train_loader
    loaders['valid'] = valid_loader

    runner = SupervisedRunner(input_key='image')
    criterions = get_loss(loss_name)
    optimizer = get_optim(optim_name, model, learning_rate)
    # one cosine-annealing cycle spans roughly one epoch of optimizer steps
    Q = math.floor(len(train_ds) / batch_size)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=Q)
    if main_metric != 'accuracy01':
        callbacks = [
            AccuracyCallback(num_classes=num_classes),
            CosineLossCallback(),
            OptimizerCallback(accumulation_steps=accum_step),
            CheckpointCallback(save_n_best=epochs),
        ]
    else:
        callbacks = [
            AccuracyCallback(num_classes=num_classes),
            OptimizerCallback(accumulation_steps=accum_step),
            CheckpointCallback(save_n_best=epochs),
        ]

    runner.train(
        fp16=True,
        model=model,
        criterion=criterions,
        optimizer=optimizer,
        scheduler=scheduler,
        callbacks=callbacks,
        loaders=loaders,
        logdir=log_dir,
        num_epochs=epochs,
        verbose=verbose,
        main_metric=main_metric,
        minimize_metric=False,
    )
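# T_max above is the number of optimizer steps in one epoch, so the cosine
# schedule completes one full annealing cycle per epoch. A minimal sketch of
# the resulting learning-rate curve -- illustrative toy values only, not part
# of the original script:
import torch

params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.SGD(params, lr=1e-4)
sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=10)
for step in range(10):
    opt.step()
    sched.step()
    print(step, opt.param_groups[0]['lr'])  # decays from 1e-4 toward 0 along a half-cosine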
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', default=200, type=int, help='epoch number (Default: 200)')
    parser.add_argument('--start_epoch', default=0, type=int, help='start epoch number')
    parser.add_argument('-b', '--batch_size', default=6, type=int, help='mini-batch size (Default: 6)')
    parser.add_argument('--lr', '--learning_rate', default=1e-3, type=float, help='initial learning rate (Default: 1e-3)')
    parser.add_argument('--resume', type=str, default=None, help='The path for checkpoint file')
    parser.add_argument('--exp', type=str, default='test', help='The name of this exp')
    parser.add_argument('--content', type=float, default=10.0, help='the weight of content loss (Default: 10.0)')
    parser.add_argument('--tv', type=float, default=3e-3, help='the weight of TV loss (Default: 0.003)')
    parser.add_argument('--adv', type=float, default=3.0, help='the weight of adv loss (Default: 3.0)')
    parser.add_argument('--first_stage', type=str, default='./exps/2_baseline/checkpoint.pth.tar', help='first stage model')
    args = parser.parse_args()

    base_dir = './twostageExps/'
    exp_dir = os.path.join(base_dir, args.exp)
    base_results_dir = os.path.join(exp_dir, 'results/')
    best_metric = 0
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
    if not os.path.exists(base_results_dir):
        os.mkdir(base_results_dir)
    save_args(args, exp_dir)

    global AdLossWeight
    global TvLossWeight
    global ContentLossWeight
    AdLossWeight = args.adv
    TvLossWeight = args.tv
    ContentLossWeight = args.content

    log_dir = os.path.join('./twostageLogs', args.exp)
    writer = SummaryWriter(log_dir)

    first_stage = network()
    generator = network()
    critic = critic_network()
    optimizer_G = optim.Adam(generator.parameters(), args.lr)
    optimizer_C = optim.Adam(critic.parameters(), args.lr)
    # each scheduler must wrap its own optimizer (they were swapped before)
    scheduler_G = ReduceLROnPlateau(optimizer_G, 'min', factor=0.2, patience=10, verbose=True)
    scheduler_C = ReduceLROnPlateau(optimizer_C, 'min', factor=0.2, patience=10, verbose=True)

    if args.resume is not None:
        assert os.path.exists(args.resume), 'model does not exist!'
        print('=> loading checkpoint {}'.format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        generator.load_state_dict(checkpoint['generator'])
        critic.load_state_dict(checkpoint['critic'])
        # best_metric = checkpoint['best_metric']
        optimizer_G.load_state_dict(checkpoint['optimizer_G'])
        optimizer_C.load_state_dict(checkpoint['optimizer_C'])
        print('=> loaded checkpoint {} - epoch:{} - best_metric:{}'.format(
            args.resume, args.start_epoch, best_metric))
    else:
        print('No checkpoint. A new beginning')

    if args.first_stage is not None:
        assert os.path.exists(args.first_stage), 'first stage model does not exist!'
        print('=> loading first stage model {}'.format(args.first_stage))
        first_stage_checkpoint = torch.load(args.first_stage)
        first_stage.load_state_dict(first_stage_checkpoint['generator'])
    # the first stage and the VGG feature extractor stay frozen
    for p in first_stage.parameters():
        p.requires_grad = False
    vgg_for_perceptual_loss = vgg19()
    for p in vgg_for_perceptual_loss.parameters():
        p.requires_grad = False

    generator.cuda()
    critic.cuda()
    vgg_for_perceptual_loss.cuda()
    vgg_for_perceptual_loss.eval()
    first_stage.cuda()
    first_stage.eval()

    clock = TrainClock()
    clock.epoch = args.start_epoch

    data_dir = config.data_dir
    train_loader = get_dataloaders(os.path.join(data_dir, 'train.json'),
                                   batch_size=args.batch_size, shuffle=True)
    valid_loader = get_dataloaders(os.path.join(config.data_dir, 'val.json'),
                                   batch_size=args.batch_size, shuffle=True)

    print('Begin training')
    for epoch in range(args.start_epoch, args.epochs):
        results_dir = os.path.join(base_results_dir, '{}'.format(epoch))
        if not os.path.exists(results_dir):
            os.mkdir(results_dir)
        train(first_stage, generator, critic, optimizer_G, optimizer_C,
              train_loader, vgg_for_perceptual_loss, clock, writer, 2)
        # save state dicts, not the optimizer objects, so resume can load them
        save_checkpoint({
            'epoch': clock.epoch,
            'generator': generator.state_dict(),
            'critic': critic.state_dict(),
            'optimizer_G': optimizer_G.state_dict(),
            'optimizer_C': optimizer_C.state_dict(),
        }, is_best=True, prefix=exp_dir)
        torch.cuda.empty_cache()
        test_on_benchmark_two_stage(first_stage, generator, results_dir)
        torch.cuda.empty_cache()
        CriticRealLoss, ContentLoss = evaluate_on_val_two_stage(
            first_stage, generator, critic, valid_loader, vgg_for_perceptual_loss,
            clock, writer, os.path.join(exp_dir, 'valresults.txt'))
        scheduler_C.step(CriticRealLoss)
        scheduler_G.step(ContentLoss)
        torch.cuda.empty_cache()
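# ReduceLROnPlateau ties one optimizer to one monitored metric; stepping a
# scheduler built on the wrong optimizer silently adjusts the wrong learning
# rate. A minimal sketch of the correct pairing -- illustrative modules and
# loss values only, not the project's actual networks:
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

g = torch.nn.Linear(4, 4)   # stands in for the generator
c = torch.nn.Linear(4, 4)   # stands in for the critic
opt_g = torch.optim.Adam(g.parameters(), lr=1e-3)
opt_c = torch.optim.Adam(c.parameters(), lr=1e-3)
sched_g = ReduceLROnPlateau(opt_g, 'min', factor=0.2, patience=10)
sched_c = ReduceLROnPlateau(opt_c, 'min', factor=0.2, patience=10)
# after each epoch: generator LR follows the content loss,
# critic LR follows the critic's real-image loss
sched_g.step(0.42)  # ContentLoss
sched_c.step(0.17)  # CriticRealLoss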
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', default=50, type=int, help='epoch number')
    parser.add_argument('-b', '--batch_size', default=256, type=int, help='mini-batch size')
    parser.add_argument('--lr', '--learning_rate', default=1e-3, type=float, help='initial learning rate')
    parser.add_argument('-c', '--continue', dest='continue_path', type=str, required=False)
    parser.add_argument('--exp_name', default=config.exp_name, type=str, required=False)
    parser.add_argument('--drop_rate', default=0, type=float, required=False)
    parser.add_argument('--only_fc', action='store_true', help='only train fc layers')
    parser.add_argument('--net', default='densenet169', type=str, required=False)
    parser.add_argument('--local', action='store_true', help='train local branch')
    args = parser.parse_args()

    # hard-coded overrides: these take precedence over anything passed on the CLI
    args.batch_size = 32
    args.epochs = 150
    args.net = 'densenet169'
    print(args)

    config.exp_name = args.exp_name
    config.make_dir()
    save_args(args, config.log_dir)

    # get network
    if args.net == 'resnet50':
        net = resnet50(pretrained=True, drop_rate=args.drop_rate)
    elif args.net == 'resnet101':
        net = resnet101(pretrained=True, drop_rate=args.drop_rate)
    elif args.net == 'densenet121':
        net = models.densenet121(pretrained=True)
        net.classifier = nn.Sequential(nn.Linear(1024, 1), nn.Sigmoid())
    elif args.net == 'densenet169':
        net = densenet169(pretrained=True, drop_rate=args.drop_rate)
    elif args.net == 'fusenet':
        global_branch = torch.load(GLOBAL_BRANCH_DIR)['net']
        local_branch = torch.load(LOCAL_BRANCH_DIR)['net']
        net = fusenet(global_branch, local_branch)
        del global_branch, local_branch
    else:
        raise NameError('unknown network: {}'.format(args.net))
    net = net.cuda()
    sess = Session(config, net=net)

    # get dataloader
    train_loader = get_dataloaders('train', batch_size=args.batch_size,
                                   num_workers=4, shuffle=True)
    valid_loader = get_dataloaders('valid', batch_size=args.batch_size,
                                   shuffle=False)

    if args.continue_path and os.path.exists(args.continue_path):
        sess.load_checkpoint(args.continue_path)

    # start session
    clock = sess.clock
    tb_writer = sess.tb_writer
    sess.save_checkpoint('start.pth.tar')

    # set criterion, optimizer and scheduler
    criterion = nn.BCELoss().cuda()
    if args.only_fc:
        optimizer = optim.Adam(sess.net.module.classifier.parameters(), args.lr)
    else:
        optimizer = optim.Adam(sess.net.parameters(), args.lr)
    scheduler = ReduceLROnPlateau(optimizer, 'max', factor=0.1, patience=10, verbose=True)

    # start training
    for e in range(args.epochs):
        train_out = train_model(train_loader, sess.net, criterion, optimizer, clock.epoch)
        valid_out = valid_model(valid_loader, sess.net, criterion, optimizer, clock.epoch)
        tb_writer.add_scalars('loss', {'train': train_out['epoch_loss'],
                                       'valid': valid_out['epoch_loss']}, clock.epoch)
        tb_writer.add_scalars('acc', {'train': train_out['epoch_acc'],
                                      'valid': valid_out['epoch_acc']}, clock.epoch)
        tb_writer.add_scalar('auc', valid_out['epoch_auc'], clock.epoch)
        tb_writer.add_scalar('learning_rate', optimizer.param_groups[-1]['lr'], clock.epoch)
        scheduler.step(valid_out['epoch_auc'])
        if valid_out['epoch_auc'] > sess.best_val_acc:
            sess.best_val_acc = valid_out['epoch_auc']
            sess.save_checkpoint('best_model.pth.tar')
        if clock.epoch % 10 == 0:
            sess.save_checkpoint('epoch{}.pth.tar'.format(clock.epoch))
        sess.save_checkpoint('latest.pth.tar')
        clock.tock()
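# Passing only the classifier's parameters to Adam (the only_fc branch above)
# keeps the backbone weights fixed, but gradients are still computed for them.
# A minimal sketch that also freezes the backbone to save memory -- an
# assumption about intent, not part of the original script:
import torch
import torch.nn as nn
from torchvision import models

net = models.densenet169(pretrained=True)
net.classifier = nn.Sequential(nn.Linear(1664, 1), nn.Sigmoid())
for p in net.features.parameters():
    p.requires_grad = False  # no gradients buffered for the frozen backbone
optimizer = torch.optim.Adam(net.classifier.parameters(), lr=1e-3)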
def eval_net(**kwargs):
    model = kwargs['model']
    cuda = kwargs['cuda']
    device = kwargs['device']
    if cuda:
        model.cuda(device=device)
    if 'criterion' in kwargs.keys():
        # validation pass: called from the training loop after each epoch
        writer = kwargs['writer']
        val_loader = kwargs['val_loader']
        criterion = kwargs['criterion']
        global_step = kwargs['global_step']
        net_loss = []
        model.eval()  # put in eval mode first
        for idx, data in enumerate(val_loader, 1):
            test_x, label = data['input'].unsqueeze(2), data['label']
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, h_n = model.continuous_forward(test_x, out_seq_len=250000)
            loss = criterion(out_x, label)
            net_loss.append(loss.item())
        mean_loss = np.asarray(net_loss).sum() / idx
        # summarize mean loss
        writer.add_scalar(tag='val_loss', scalar_value=mean_loss, global_step=global_step)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('in_shape = {}, out_shape = {}'.format(test_x.shape, out_x.shape))
        print('log: validation:: total loss = {:.5f}'.format(mean_loss))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        if mean_loss:
            # plot the first sequence of the last batch against its prediction
            ref = test_x[0, :].squeeze(1).cpu().numpy()
            this = label[0, :].cpu().numpy()
            that = out_x[0, :].detach().cpu().numpy()
            this = np.hstack((ref, this)).astype(float)
            that = np.hstack((ref, that)).astype(float)
            fig = pl.figure()
            pl.plot(this, label='series_in')
            pl.plot(that, label='series_out')
            pl.legend(loc='lower right')
            pl.savefig('temp.png')
            evaluated_image = cv2.imread('temp.png')
            # put it into the summary writer
            evaluated_image = torch.Tensor(evaluated_image.transpose(2, 0, 1))
            writer.add_image('evaluation', evaluated_image, global_step)
    else:
        # test pass: standalone evaluation of a saved classification model
        pre_model = kwargs['pre_model']
        base_folder = kwargs['base_folder']
        batch_size = kwargs['batch_size']
        log_after = kwargs['log_after']
        criterion = nn.CrossEntropyLoss()
        un_confusion_meter = tnt.meter.ConfusionMeter(10, normalized=False)
        confusion_meter = tnt.meter.ConfusionMeter(10, normalized=True)
        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        _, _, test_loader = get_dataloaders(base_folder=base_folder, batch_size=batch_size)
        net_accuracy, net_loss = [], []
        correct_count = 0
        total_count = 0
        for idx, data in enumerate(test_loader):
            model.eval()  # put in eval mode first
            test_x, label = data['input'], data['label']
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model.forward(test_x)
            loss = criterion(out_x, label)
            un_confusion_meter.add(predicted=pred, target=label)
            confusion_meter.add(predicted=pred, target=label)
            # get accuracy metric
            batch_correct = label.eq(pred.long()).double().sum().item()
            correct_count += batch_correct
            total_count += float(batch_size)
            net_loss.append(loss.item())
            if idx % log_after == 0:
                print('log: on {}'.format(idx))
        mean_loss = np.asarray(net_loss).mean()
        mean_accuracy = correct_count * 100 / total_count
        print(correct_count, total_count)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: test:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(
            mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        with open('normalized.pkl', 'wb') as this:
            pkl.dump(confusion_meter.value(), this, protocol=pkl.HIGHEST_PROTOCOL)
        with open('un_normalized.pkl', 'wb') as this:
            pkl.dump(un_confusion_meter.value(), this, protocol=pkl.HIGHEST_PROTOCOL)
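# The confusion matrices above are pickled for offline inspection. A minimal
# sketch of loading them back and printing per-class recall -- not part of the
# original script; it assumes the 10-class meters written above:
import pickle as pkl
import numpy as np

with open('un_normalized.pkl', 'rb') as f:
    cm = np.asarray(pkl.load(f))  # rows = true class, cols = predicted class
recall = cm.diagonal() / cm.sum(axis=1).clip(min=1)
for k, r in enumerate(recall):
    print('class {}: recall = {:.3f}'.format(k, r))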
def tune_hyper(epoch=10, deeplab=False):
    """Tune hyper-parameters over alternative values."""
    # All combinations of the values below are tried in a nested loop
    learning_rate = np.logspace(-3.4, -4.2, 5)
    try_depth = [34]
    random_vflip = [True]
    random_hflip = [True]
    random_rotate = [True]
    random_transform = [True]
    customs = [True, False]

    # Model result info
    model_paths = []
    model_best_f1 = []
    model_vflip = []
    model_hflip = []
    model_transform = []
    model_rotate = []
    model_depth = []
    model_lr = []
    model_custom = []

    # Tuning loop (see the itertools sketch after this function for a flat
    # equivalent of the nesting)
    for md in try_depth:
        for rv in random_vflip:
            for rh in random_hflip:
                for rr in random_rotate:
                    for rt in random_transform:
                        for lr in learning_rate:
                            for custom in customs:
                                # Getting dataset loaders
                                dataloaders = get_dataloaders(
                                    random_transform=rt,
                                    random_rotate=rr,
                                    random_hflip=rh,
                                    random_vflip=rv,
                                    all_in=not custom
                                )
                                print(f'Testing for learning rate {lr}')
                                if deeplab:
                                    model = deeplabv3_resnet101(num_classes=1, pretrained=False)
                                else:
                                    model = UNet(n_channels=3, n_classes=1, depth=md)
                                model, epoch_stats, path, best_f1 = train_model(
                                    model, dataloaders, num_epochs=epoch,
                                    learning_rate=lr, deeplab=deeplab
                                )
                                model_paths.append(path)
                                model_best_f1.append(best_f1)
                                model_vflip.append(rv)
                                model_hflip.append(rh)
                                model_transform.append(rt)
                                model_rotate.append(rr)
                                model_depth.append(md)
                                model_lr.append(lr)
                                model_custom.append(custom)
                                print()

    # Collecting in a dictionary for a DataFrame
    tune_results = {
        'learning_rate': model_lr,
        'model_depth': model_depth,
        'model_paths': model_paths,
        'random_vflip': model_vflip,
        'random_hflip': model_hflip,
        'random_rotate': model_rotate,
        'random_transform': model_transform,
        'custom': model_custom,
        'best_f1': model_best_f1
    }

    # Saving tuning results for later inspection
    tune_name = time.strftime('%Y%m%d-%H%M%S') + '.tune'
    tune_path = os.path.join(TUNE_ROOT, tune_name)
    with open(tune_path, 'wb') as f:
        pickle.dump(tune_results, f)
    print(tune_path)
    print('Tuning complete')
    return pd.DataFrame(tune_results), tune_path
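# The seven nested loops above enumerate a Cartesian grid; itertools.product
# expresses the same sweep flat. A minimal self-contained sketch with toy
# values -- same iteration order, not part of the original script:
from itertools import product

try_depth = [34]
flips = [True]
learning_rates = [1e-3, 1e-4]
customs = [True, False]
for md, flip, lr, custom in product(try_depth, flips, learning_rates, customs):
    print(md, flip, lr, custom)  # one configuration per iteration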
def train(args):
    global img, tgt_caption
    SEED_EVERYTHING()
    batch_size = args.batch_size
    epochs = args.epochs
    device = torch.device(args.device)
    train_dataloader, valid_dataloader = get_dataloaders(batch_size)

    with open('config.json', 'r') as f:
        model_config = json.load(f)
    model = get_model(**model_config)  # e.g. Seq2SeqModel(dropout_p=0.25, hidden_size=256, num_layers=1)
    model.to(device)
    # optionally freeze the encoder:
    # for param in model.encoder.parameters():
    #     param.requires_grad = False
    print(model)
    print(model.decoder.embedding.weight.requires_grad)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.7, patience=2, verbose=True, min_lr=1e-6, mode='max')

    if args.resume_from is not None:
        state = torch.load(args.resume_from)
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])
        scheduler.load_state_dict(state['scheduler_state_dict'])

    if args.mixed_precision:
        scaler = torch.cuda.amp.GradScaler()
    loss_func = nn.CrossEntropyLoss(ignore_index=args.padding_idx)
    best_bleu = 0

    for epoch_i in range(epochs):
        loss_meter = AverageMeter()
        bleu_meter = AverageMeter()
        pbar = tqdm(train_dataloader, total=len(train_dataloader))
        model.train()
        for step, batch in enumerate(pbar):
            img = batch[0].to(device)
            tgt_caption = batch[1].to(device)
            optimizer.zero_grad()
            if args.mixed_precision:
                with torch.cuda.amp.autocast():
                    outputs = model(img, tgt_caption)
                    # flatten to (tokens, vocab); the class dim is the vocab
                    # size, not padding_idx as before
                    loss = loss_func(outputs.view(-1, outputs.size(-1)),
                                     tgt_caption[1:].view(-1))
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)  # unscale before clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(img, tgt_caption)
                loss = loss_func(outputs.view(-1, outputs.size(-1)),
                                 tgt_caption[1:].view(-1))
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            pred_captions = outputs.argmax(2).cpu().numpy()
            true_captions = batch[1][1:].numpy()
            bleu = calc_bleu_score(true_captions, pred_captions)
            loss_meter.update(loss.item())
            bleu_meter.update(bleu)
            pbar.set_postfix({'loss': loss_meter.avg, 'bleu': bleu_meter.avg})

        valid_loss, valid_bleu = evaluate(model, valid_dataloader, device,
                                          epoch_i, args.key, loss_func)
        scheduler.step(valid_bleu)
        if valid_bleu > best_bleu:
            print('validation bleu improved from %.4f to %.4f' % (best_bleu, valid_bleu))
            print('saving model...')
            torch.save({'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'scheduler_state_dict': scheduler.state_dict()},
                       f'saved_models/{args.key}/state.pth')
            best_bleu = valid_bleu
        print(f'Epoch: {epoch_i+1}/{epochs}, train loss: {loss_meter.avg:.4f}, '
              f'train bleu: {bleu_meter.avg:.4f}\nvalid loss: {valid_loss:.4f}, '
              f'valid bleu: {valid_bleu:.4f}')
        torch.cuda.empty_cache()
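# AverageMeter is assumed to come from the project's utils; a minimal sketch
# of the usual implementation (running mean of a scalar) for reference -- an
# assumption, not the project's actual code:
class AverageMeter:
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)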
def setup_and_run_train(n_channels, n_classes, dir_img, dir_gt, dir_results,
                        load, val_perc, batch_size, epochs, lr, run,
                        optimizer, loss, evaluation, dir_weights):
    # Use GPU if available
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    # Create the model
    net = UNet(n_channels, n_classes).to(device)
    net = torch.nn.DataParallel(
        net, device_ids=list(range(torch.cuda.device_count()))).to(device)

    # Load old weights
    if load:
        net.load_state_dict(torch.load(load))
        print('Model loaded from {}'.format(load))

    # Load the dataset (WCE needs per-pixel weight maps)
    if loss != 'WCE':
        train_loader, val_loader = get_dataloaders(dir_img, dir_gt, val_perc, batch_size)
    else:
        train_loader, val_loader = get_dataloaders(dir_img, dir_gt, val_perc,
                                                   batch_size, isWCE=True,
                                                   dir_weights=dir_weights)

    # Pretty print of the run
    print('''\nStarting training:
    Dataset: {}
    Num Channels: {}
    Groundtruth: {}
    Num Classes: {}
    Folder to save: {}
    Load previous: {}
    Training size: {}
    Validation size: {}
    Validation Percentage: {}
    Batch size: {}
    Epochs: {}
    Learning rate: {}
    Optimizer: {}
    Loss Function: {}
    Evaluation Function: {}
    CUDA: {}
    '''.format(dir_img, n_channels, dir_gt, n_classes, dir_results, load,
               len(train_loader) * batch_size, len(val_loader) * batch_size,
               val_perc, batch_size, epochs, lr, optimizer, loss, evaluation,
               use_cuda))

    # Definition of the optimizer -- ADD MORE IF YOU WANT
    if optimizer == 'Adam':
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    elif optimizer == 'SGD':
        optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9,
                                    weight_decay=0.0005)

    # Definition of the loss function -- ADD MORE IF YOU WANT
    if loss == 'Dice':
        criterion = DiceLoss()
    elif loss == 'RMSE':
        criterion = RMSELoss()
    elif loss == 'MSE':
        criterion = nn.MSELoss()
    elif loss == 'MAE':
        criterion = nn.L1Loss()
    elif loss == 'CE':
        criterion = CELoss()
    elif loss == 'WCE':
        criterion = WCELoss()

    # Saving history to csv
    header = ['epoch', 'train loss']
    best_loss = 10000
    time_start = time.time()

    # Run the training and validation
    for epoch in range(epochs):
        print('\nStarting epoch {}/{}.'.format(epoch + 1, epochs))
        train_loss = train_net(net, device, train_loader, optimizer, criterion,
                               batch_size, isWCE=(loss == 'WCE'))
        values = [epoch + 1, train_loss]
        export_history(header, values, dir_results, 'result' + run + '.csv')

        # save the model with the best (lowest) training loss so far
        if train_loss < best_loss:
            best_loss = train_loss
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': net.state_dict(),
                'loss': train_loss,
                'optimizer': optimizer.state_dict(),
            }, path=dir_results, filename='weights' + run + '.pth')

    time_dif = time.time() - time_start
    print('It took %.4f seconds to finish the run.' % time_dif)
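# export_history is assumed to come from the project's utils; a minimal sketch
# of a compatible implementation (append one row per epoch, write the header
# only when the file is new) -- an assumption, not the project's actual code:
import csv
import os

def export_history(header, values, dir_results, file_name):
    path = os.path.join(dir_results, file_name)
    new_file = not os.path.exists(path)
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        if new_file:
            writer.writerow(header)
        writer.writerow(values)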