def get_dga_sdirs(args, data, labels):
    device = get_device(args)
    sdirs = []
    for x, y in zip(data, labels):
        # dga_bs: dist grad accum. batch size
        dataloader = get_dataloader(x, y, args.dga_bs, shuffle=False)
        count = 0
        for xiter, yiter in dataloader:
            model, loss_type = get_model(args, False)
            loss_fn = get_loss_fn(loss_type)
            opt = get_optim(args, model)
            loss, _ = forward(model, xiter, yiter, opt, loss_fn, device)
            loss.backward()
            sdirs.append(get_model_grads(model, flatten=True))
            count += 1
            if count >= args.num_dga:
                break

    # Regroup the per-batch gradients by layer: stacked[l] collects one
    # flattened gradient vector per accumulated batch.
    stacked = [[] for _ in range(len(sdirs[0]))]
    for l in range(len(sdirs[0])):
        for i in range(len(sdirs)):
            stacked[l].append(sdirs[i][l].flatten())

    # Reduce each layer's gradient matrix to its leading principal components.
    sdirs = [[] for _ in range(args.ncomponent)]
    for l, layer in enumerate(stacked):
        layer = torch.stack(layer, dim=0).T.cpu().numpy()
        layer, _ = pca_transform(layer, args.ncomponent)
        for i in range(args.ncomponent):
            sdirs[i].append(layer[:, i].flatten())

    assert len(sdirs) == args.ncomponent
    return sdirs
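
# Usage sketch (hedged: the shapes of `data` and `labels` and the helper
# functions are assumed from context). The returned list is indexed first by
# principal component, then by layer:
#
#   sdirs = get_dga_sdirs(args, data, labels)
#   assert len(sdirs) == args.ncomponent
#   first_direction = sdirs[0]  # one flattened PCA direction per layer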
def predict(data_path, model_weights_path, network, test_df_path, save_path,
            size, channels, neighbours, classification_head):
    model = get_model(network, classification_head)
    model.encoder.conv1 = nn.Conv2d(
        count_channels(channels) * neighbours, 64,
        kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    model, device = UtilsFactory.prepare_model(model)

    if classification_head:
        model.load_state_dict(torch.load(model_weights_path))
    else:
        checkpoint = torch.load(model_weights_path, map_location='cpu')
        model.load_state_dict(checkpoint['model_state_dict'])

    test_df = pd.read_csv(test_df_path)

    predictions_path = os.path.join(save_path, "predictions")
    if not os.path.exists(predictions_path):
        os.makedirs(predictions_path, exist_ok=True)
        print("Prediction directory created.")

    for _, image_info in tqdm(test_df.iterrows()):
        filename = '_'.join([image_info['name'], image_info['position']])
        image_path = get_filepath(data_path, image_info['dataset_folder'],
                                  'images', filename, file_type='tiff')

        image_tensor = filter_by_channels(read_tensor(image_path),
                                          channels, neighbours)
        if image_tensor.ndim == 2:
            image_tensor = np.expand_dims(image_tensor, -1)

        image = transforms.ToTensor()(image_tensor)

        if classification_head:
            prediction, label = model.predict(
                image.view(1, count_channels(channels) * neighbours,
                           size, size).to(device, dtype=torch.float))
        else:
            prediction = model.predict(
                image.view(1, count_channels(channels) * neighbours,
                           size, size).to(device, dtype=torch.float))

        result = prediction.view(size, size).detach().cpu().numpy()
        cv.imwrite(get_filepath(predictions_path, filename, file_type='png'),
                   result * 255)
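
# Hypothetical invocation (every path and parameter below is a placeholder,
# not taken from the source):
#
#   predict(data_path='data/', model_weights_path='weights/best.pth',
#           network='fpn50', test_df_path='data/test_df.csv',
#           save_path='results/', size=224, channels=['rgb'],
#           neighbours=1, classification_head=False)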
def generate_frechet_metric_callback(callback_args, device, outdir, dataset):
    mean = callback_args['transform']['mean']
    std = callback_args['transform']['std']
    total_samples = callback_args['total_samples']
    batch_size = callback_args['sample_size']

    classifier_model = get_model(callback_args['classifier_model_args'])
    classifier_model_layer = callback_args['classifier_model_layer']
    if classifier_model_layer:
        classifier_model = torch.nn.Sequential(
            *list(classifier_model.children())[:classifier_model_layer])
    classifier_model = classifier_model.to(device)

    transform = batch_normalize_transform(mean, std)
    return FrechetInceptionScoreCallback(outdir=outdir,
                                         classifier=classifier_model,
                                         batch_size=batch_size,
                                         total_samples=total_samples,
                                         transform=transform,
                                         device=device,
                                         dataset=dataset)
def generate_inception_metric_callback(callback_args, device, outdir):
    mode = callback_args['mode']
    if mode == 'gan':
        mean = callback_args['transform']['mean']
        std = callback_args['transform']['std']
        total_samples = callback_args['total_samples']
        batch_size = callback_args['sample_size']
        classifier_model = get_model(
            callback_args['classifier_model_args']).to(device)
        transform = batch_normalize_transform(mean, std)
        return InceptionScoreCallback(classifier_model,
                                      outdir,
                                      batch_size=batch_size,
                                      total_samples=total_samples,
                                      transform=transform,
                                      mode=mode,
                                      device=device)
    else:
        raise NotImplementedError(
            'generate_inception_metric_callback for classification is not implemented')
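
# Both callback factories above read the same config keys; a sketch of the
# expected shape (placeholder values, not defaults from the source):
#
#   callback_args = {
#       'mode': 'gan',
#       'transform': {'mean': [0.5, 0.5, 0.5], 'std': [0.5, 0.5, 0.5]},
#       'total_samples': 10000,
#       'sample_size': 64,               # used as the metric batch size
#       'classifier_model_args': {...},  # forwarded to get_model()
#       'classifier_model_layer': -1,    # Frechet only: truncate here
#   }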
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--n_hidden', type=int, default=64)
    parser.add_argument('--n_layers', type=int, default=2)
    args = parser.parse_args()

    w2v_model = get_model()

    # vocab, embed dims
    VOCAB_SIZE, EMBED_DIM = w2v_model.wv.vectors.shape

    # w2ind from w2v
    w2ind = {
        token: token_index
        for token_index, token in enumerate(w2v_model.wv.index2word)
    }

    # padding token for now
    TRG_PAD_IDX = w2ind["."]  # this is 0

    # sentence marker token inds
    sos_ind = w2ind['<sos>']
    eos_ind = w2ind['<eos>']

    # adjusted sequence length
    SEQ_LEN = 5 + 2  # +2 for sos, eos tokens

    # padded vectorized states of token indexes
    d = torch.load('../dat/processed/padded_vectorized_states_v3.pt')

    # train/test/valid split (currently disabled)
    """
    train_d = {}
    test_d = {}
    valid_d = {}
    for index, vects in d.items():
        if torch.rand(1) < 0.1:
            test_d[index] = vects
        elif torch.rand(1) < 0.2:
            valid_d[index] = vects
        else:
            train_d[index] = vects
    print(f'train % = {len(train_d)/len(d)}')
    print(f'test % = {len(test_d)/len(d)}')
    print(f'valid % = {len(valid_d)/len(d)}\n')
    """

    # all data
    train_d = d
    valid_d = d
    print(len(d))

    clip = 1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    enc = EncRnn(hidden_size=args.n_hidden,
                 num_layers=args.n_layers,
                 embed_size=EMBED_DIM)
    dec = DecRnn(hidden_size=args.n_hidden,
                 num_layers=args.n_layers,
                 embed_size=EMBED_DIM,
                 output_size=VOCAB_SIZE)
    model = Seq2SeqAttn(enc, dec, TRG_PAD_IDX, VOCAB_SIZE, device).to(device)
    save_model_instance()

    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)

    assert w2v_model.vocabulary.sorted_vocab == True
    word_counts = {
        word: vocab_obj.count
        for word, vocab_obj in w2v_model.wv.vocab.items()
    }
    word_counts = sorted(word_counts.items(), key=lambda x: -x[1])
    words = [t[0] for t in word_counts]

    model.apply(init_weights)
    train(train_d, valid_d, w2v_model, words, model, optimizer, criterion,
          sos_ind, eos_ind, TRG_PAD_IDX, SEQ_LEN, clip, device, args.epochs)
    # evaluate(test_d, w2v_model, words, model, criterion, sos_ind, eos_ind,
    #          TRG_PAD_IDX, SEQ_LEN, device, type='Test')
    observe(w2v_model, words, model, d, sos_ind, eos_ind, TRG_PAD_IDX,
            SEQ_LEN, device)
parser.add_argument('--select_nodes', default=0, type=int)
args = parser.parse_args()

device = 'cuda' if torch.cuda.is_available() else 'cpu'

''' Meta-name to be used as prefix on all savings '''
oname = args.net + '_' + args.dataset + '/'
SAVE_DIR = args.save_path + 'adjacency/' + oname
START_LAYER = 3 if args.net in ['vgg', 'resnet'] else 0
THRESHOLDS = args.thresholds

''' If the save directory doesn't exist, create it '''
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

# Build models
print('==> Building model..')
net = get_model(args.net, args.dataset)
net = net.to(device)

if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

''' Prepare criterion '''
if args.dataset in [
        'cifar10', 'cifar10_gray', 'vgg_cifar10_adversarial', 'imagenet'
]:
    criterion = nn.CrossEntropyLoss()
elif args.dataset in ['mnist', 'mnist_adverarial']:
    criterion = F.nll_loss

''' Define label manipulator '''
manipulator = load_manipulator(args.permute_labels, args.binarize_labels)

''' Instead of building the graph on the entire set of nodes, pick a subset '''
def train(args):
    set_random_seed(42)
    model = get_model(args.network)
    print('Loading model')
    model.encoder.conv1 = nn.Conv2d(
        count_channels(args.channels), 64,
        kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    model, device = UtilsFactory.prepare_model(model)

    train_df = pd.read_csv(args.train_df).to_dict('records')
    val_df = pd.read_csv(args.val_df).to_dict('records')

    ds = Dataset(args.channels, args.dataset_path, args.image_size,
                 args.batch_size, args.num_workers)
    loaders = ds.create_loaders(train_df, val_df)

    if args.optimizer == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    elif args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
    else:
        print('Unknown optimizer; falling back to the default (Adam)')
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    criterion = BCE_Dice_Loss(bce_weight=0.2)

    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[10, 20, 40], gamma=0.3)

    save_path = os.path.join(args.logdir, args.name)

    # model runner
    runner = SupervisedRunner()

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=[DiceCallback()],
        logdir=save_path,
        num_epochs=args.epochs,
        verbose=True)

    infer_loader = collections.OrderedDict([('infer', loaders['valid'])])
    runner.infer(
        model=model,
        loaders=infer_loader,
        callbacks=[
            CheckpointCallback(resume=f'{save_path}/checkpoints/best.pth'),
            InferCallback()
        ],
    )
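
# The inline optimizer dispatch above mirrors the get_optimizer() helper used
# in the later train() variant; a minimal sketch under the assumption that it
# has the same semantics (name string in, torch.optim instance out):
def get_optimizer(name, lr, model):
    if name == 'SGD':
        return torch.optim.SGD(model.parameters(), lr=lr)
    if name != 'Adam':
        print(f'Unknown optimizer "{name}"; falling back to Adam')
    return torch.optim.Adam(model.parameters(), lr=lr)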
def main():
    # general
    debug = False
    epochs = 10
    batch_size = 12
    num_workers = 0
    lr = 0.01

    # dataset
    base_dir = '/dataset/kaggle/38-cloud'
    datatype_train = 'train'  # 'test'
    datatype_test = 'test'
    include_nir = True
    train_ratio = 0.8
    test_mask = False

    # transforms
    name_trans_train = 'albu_train_0'
    name_trans_val = 'albu_val_0'
    kwargs_trans = {
        'resize': None  # (384, 384)
    }
    name_preprocessing = 'xxxx'

    # model
    model_name = 'unet_0'
    out_channels = 2
    kwargs_model = {'in_channels': 4}
    resume = os.path.join("./logs/38_cloud_test/checkpoints", "cls_epoch_9.pth")

    # log
    log_base_dir = os.path.join("./logs/38_cloud_test", model_name)

    non_null_rate = 1.0
    cloud_rate = None
    processes = 1

    torch.backends.cudnn.benchmark = True

    if debug:
        device = 'cpu'
    else:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # transform
    transforms_test = get_transform(name=name_trans_val, **kwargs_trans)

    # preprocessing
    preprocessing = None
    # ENCODER = 'resnet50'
    # ENCODER_WEIGHTS = 'imagenet'
    # preprocessing_fn = smp.encoders.get_preprocessing_fn(ENCODER, ENCODER_WEIGHTS)
    # preprocessing = get_preprocessing(preprocessing_fn=preprocessing_fn)

    # dataset
    dataset_test = L8CLoudDataset(base_dir=base_dir,
                                  datatype=datatype_test,
                                  transforms=transforms_test,
                                  preprocessing=preprocessing,
                                  include_nir=include_nir,
                                  non_null_rate=non_null_rate,
                                  cloud_rate=cloud_rate,
                                  processes=processes,
                                  test_mask=test_mask)

    # DataLoader
    test_dl = DataLoader(dataset_test,
                         batch_size=batch_size,
                         shuffle=False,
                         num_workers=num_workers)
    print(len(test_dl))

    # model
    model = get_model(name=model_name, out_channels=out_channels, **kwargs_model)
    model.to(device)
    if resume is not None:
        model.load_state_dict(torch.load(resume, map_location=device))

    # loss
    criterion = nn.CrossEntropyLoss().to(device)

    # check model
    xb, yb = next(iter(test_dl))
    print(xb.shape, yb.shape)
    print(model)
    print(summary(model, input_size=tuple(xb.shape[1:])))

    test_loss = test_org(model, test_dl, criterion=criterion,
                         device=device, acc_fn=acc_metric)
    print(test_loss)

    # print figures
    dir_dest = "./temp/38_cloud_test"
    results_show(ds=dataset_test,
                 list_index=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                 model=model,
                 save=True,
                 dir_dest=dir_dest,
                 fname='test.png',
                 fname_time=True,
                 show=False,
                 fig_img_size=4,
                 cmp_input='gray',
                 cmp_out='jet',
                 class_num=2)
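
# acc_metric is referenced above but not defined in this snippet; a minimal
# pixel-accuracy sketch, assuming (N, C, H, W) logits and (N, H, W) integer
# masks as implied by out_channels=2 and CrossEntropyLoss:
def acc_metric(pred_logits, target):
    pred = pred_logits.argmax(dim=1)
    return (pred == target).float().mean().item()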
def train_gan(arguments):
    """ Setup result directory and enable logging to file in it """
    outdir = make_results_dir(arguments)
    logger.init(outdir, logging.INFO)
    logger.info('Arguments:\n{}'.format(pformat(arguments)))

    """ Initialize Tensorboard """
    tensorboard_writer = initialize_tensorboard(outdir)

    """ Set random seed throughout python, pytorch and numpy """
    logger.info('Using Random Seed value as: %d' % arguments['random_seed'])
    torch.manual_seed(arguments['random_seed'])  # Set for pytorch, used for cuda as well.
    random.seed(arguments['random_seed'])  # Set for python
    np.random.seed(arguments['random_seed'])  # Set for numpy

    """ Set device - cpu or gpu """
    # The original referenced an undefined `opt.gpu` here; the GPU index is
    # assumed to live in `arguments` instead.
    device = torch.device(
        f"cuda:{arguments.get('gpu', 0)}" if torch.cuda.is_available() else "cpu")
    logger.info(f'Using device - {device}')

    """ Load Model with weights (if available) """
    G: torch.nn.Module = get_model(arguments.get('generator_model_args')).to(device)
    D: torch.nn.Module = get_model(arguments.get('discriminator_model_args')).to(device)
    if arguments['mode'] == 'dcgan':
        G.apply(weights_init)
        D.apply(weights_init)

    """ Create optimizer """
    G_optimizer = create_optimizer(G.parameters(),
                                   arguments['generator_optimizer_args'])
    D_optimizer = create_optimizer(D.parameters(),
                                   arguments['discriminator_optimizer_args'])

    """ Create Loss """
    loss = torch.nn.BCELoss().to(device=device)  # GAN

    """ Load parameters for the Dataset """
    dataset: BaseDataset = create_dataset(arguments['dataset_args'],
                                          arguments['train_data_args'],
                                          arguments['val_data_args'])

    """ Generate all callbacks """
    callbacks: List[Callbacks] = generate_callbacks(arguments, dataset,
                                                    device, outdir)

    # """ Create loss function """
    # criterion = create_loss(arguments['loss_args'])

    """ Debug the inputs to model and save graph to tensorboard """
    dataset.debug()

    # Only one model is allowed, so graph saving is disabled:
    # G_dummy_input = torch.rand(size=(1, arguments['generator_model_args']['model_constructor_args']['latent_dim']))
    # D_dummy_input = (torch.rand(1,
    #                             arguments['dataset_args']['name'].value['channels'],
    #                             32, 32
    #                             *arguments['dataset_args']['name'].value['image_size']  # ToDo Fix this
    #                             ))
    # tensorboard_writer.save_graph('Generator', G, G_dummy_input.to(device))
    # tensorboard_writer.save_graph('Discriminator', D, D_dummy_input.to(device))
    logger.info(G)
    logger.info(D)

    def reset_grad():
        G.zero_grad()
        D.zero_grad()

    batch_size = arguments['train_data_args']['batch_size']
    z_dim = arguments['generator_model_args']['model_constructor_args']['nz']
    generator = infinite_train_gen(dataset.train_dataloader)
    interval_length = 10 if is_debug_mode() else 400
    num_intervals = 1 if is_debug_mode() else int(arguments['num_iterations'] / interval_length)

    global_step = 0
    # To allocate the memory required on the GPU during training and validation.
    run_callbacks(callbacks,
                  model=(G, D),
                  optimizer=(G_optimizer, D_optimizer),  # To save optimizer dict for retraining.
                  mode=CallbackMode.ON_NTH_ITERATION,
                  iteration=global_step)
    reset_grad()

    for it in range(num_intervals):
        logger.info(f'Interval {it + 1}/{num_intervals}')

        # Set models in train mode
        G.train()
        D.train()

        t = trange(interval_length)
        for _ in t:
            if arguments['mode'] == 'dcgan':
                D_loss, G_loss = train_gan_iter(D, D_optimizer, G, G_optimizer,
                                                loss, device, generator,
                                                batch_size, reset_grad, z_dim,
                                                tensorboard_writer, global_step)
            elif arguments['mode'] == 'wgan-wp':
                D_loss, G_loss = train_wgan_iter(D, D_optimizer, G, G_optimizer,
                                                 device, generator, batch_size,
                                                 reset_grad, z_dim,
                                                 tensorboard_writer, global_step)
            elif arguments['mode'] == 'wgan-noise-adversarial':
                D_loss, G_loss = train_noisy_wgan_iter(
                    D, D_optimizer, G, G_optimizer, device, generator,
                    batch_size, reset_grad, z_dim, tensorboard_writer,
                    global_step,
                    contamination_loss_weight=arguments['contamination_loss_weight'])

            # Log D_Loss and G_Loss in the progress bar
            t.set_postfix(D_Loss=D_loss.data.cpu().item(),
                          G_Loss=G_loss.data.cpu().item())

            # Save losses in Tensorboard
            tensorboard_writer.save_scalars(
                f'{arguments["mode"].upper()}_Loss', {
                    'Discriminator' if arguments['mode'] == 'dcgan' else 'Critic':
                        D_loss.data.cpu().item(),
                    'Generator': G_loss.data.cpu().item()
                }, global_step)
            global_step += 1

        print(f'Discriminator Loss: {D_loss.data.cpu().item()}, '
              f'Generator Loss: {G_loss.data.cpu().item()}')

        run_callbacks(callbacks,
                      model=(G, D),
                      optimizer=(G_optimizer, D_optimizer),  # To save optimizer dict for retraining.
                      mode=CallbackMode.ON_NTH_ITERATION,
                      iteration=global_step)
        reset_grad()
_, workers = get_fl_graph(hook, args.num_workers)

print('Loading data: {}'.format(paths.data_path))
X_trains, _, y_trains, _, meta = pkl.load(open(paths.data_path, 'rb'))

test_loader = get_loader(args.dataset, args.test_batch_size,
                         train=False, noise=args.noise)
print('+' * 80)

# ------------------------------------------------------------------------------
# Fire the engines
# ------------------------------------------------------------------------------
model, loss_type = get_model(args, ckpt_path=args.load_model)

if args.batch_size == 0:
    args.batch_size = int(meta['batch_size'])
    print("Resetting batch size: {}...".format(args.batch_size))
print('+' * 80)

h_epoch = []
h_acc_test = []
h_acc_train = []
h_acc_train_std = []
h_loss_test = []
h_loss_train = []
h_loss_train_std = []
h_uplink = []
h_grad_agg = []
h_error = []
test_loader = get_loader(args.dataset, args.test_batch_size,
                         train=False, shuffle=False, subset=args.repeat,
                         force_resize=cfg.model_im_size[args.clf])
print('Train size: ', len(train_loader.dataset))
print('Test size: ', len(test_loader.dataset))
print('+' * 80)

# ------------------------------------------------------------------------------
# Fire the engines
# ------------------------------------------------------------------------------
model, loss_type = get_model(args)
agg_type = 'averaging'

if 'sgd' in args.paradigm:
    optimizer = optim.SGD(params=model.parameters(), lr=args.lr,
                          momentum=args.momentum, weight_decay=5e-4)
elif 'adam' in args.paradigm:
    optimizer = optim.Adam(params=model.parameters(), lr=args.lr)

if args.scheduler:
    print('Initializing scheduler...')
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs)

if loss_type == 'hinge':
def train(args):
    set_random_seed(42)
    model = get_model(args.network, args.classification_head)
    print('Loading model')
    model.encoder.conv1 = nn.Conv2d(
        count_channels(args.channels) * args.neighbours, 64,
        kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    model, device = UtilsFactory.prepare_model(model)

    train_df = pd.read_csv(args.train_df).to_dict('records')
    val_df = pd.read_csv(args.val_df).to_dict('records')

    ds = Dataset(args.channels, args.dataset_path, args.image_size,
                 args.batch_size, args.num_workers, args.neighbours,
                 args.classification_head)
    loaders = ds.create_loaders(train_df, val_df)

    save_path = os.path.join(args.logdir, args.name)
    optimizer = get_optimizer(args.optimizer, args.lr, model)

    if not args.classification_head:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[10, 40, 80, 150, 300], gamma=0.1)
        criterion = get_loss(args.loss)
        runner = SupervisedRunner()

        if args.model_weights_path:
            checkpoint = torch.load(args.model_weights_path, map_location='cpu')
            model.load_state_dict(checkpoint['model_state_dict'])

        runner.train(model=model,
                     criterion=criterion,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     loaders=loaders,
                     callbacks=[DiceCallback()],
                     logdir=save_path,
                     num_epochs=args.epochs,
                     verbose=True)

        infer_loader = collections.OrderedDict([('infer', loaders['valid'])])
        runner.infer(
            model=model,
            loaders=infer_loader,
            callbacks=[
                CheckpointCallback(resume=f'{save_path}/checkpoints/best.pth'),
                InferCallback()
            ],
        )
    else:
        criterion = get_loss('multi')
        net = Model(model, optimizer, criterion,
                    batch_metrics=[classification_head_accuracy,
                                   segmentation_head_dice])
        net = net.to(device)
        net.fit_generator(loaders['train'],
                          loaders['valid'],
                          epochs=args.epochs,
                          callbacks=[
                              ModelCheckpoint(f'{save_path}/checkpoints/best.pth'),
                              MultiStepLR(milestones=[10, 40, 80, 150, 300],
                                          gamma=0.1)
                          ])
def main(args):
    dict_args = vars(args)
    model_name = dict_args['model_name']
    model = get_model(model_name, dict_args)

    checkpoint_path = dict_args['checkpoints_path']
    if not os.path.exists(checkpoint_path):
        os.mkdir(checkpoint_path)

    if dict_args['log_system'] == 'wandb':
        logger = WandbLogger(project='source_separation',
                             tags=model_name,
                             offline=False,
                             id=dict_args['run_id'])
        logger.log_hyperparams(model.hparams)
        logger.watch(model, log='all')
    elif dict_args['log_system'] == 'tensorboard':
        raise NotImplementedError
    else:
        logger = True  # default

    model_dir = checkpoint_path + dict_args['model_name']
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    ckpt_dir = '{}/{}'.format(model_dir, dict_args['run_id'])
    if not os.path.exists(ckpt_dir):
        os.mkdir(ckpt_dir)

    checkpoint_callback = ModelCheckpoint(
        filepath=ckpt_dir,
        save_top_k=dict_args['save_top_k'],
        verbose=False,
        monitor='val_loss',
        prefix=dict_args['model_name'] + '_',
        save_last=True,
        save_weights_only=True)

    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        min_delta=0.0,
        patience=dict_args['patience'],
        verbose=False)

    if dict_args['float16']:
        trainer = Trainer(
            gpus=dict_args['gpus'],
            precision=16,
            logger=logger,
            checkpoint_callback=checkpoint_callback,
            early_stop_callback=early_stop_callback,
            distributed_backend=dict_args['distributed_backend'])
    else:
        trainer = Trainer(
            gpus=dict_args['gpus'],
            logger=logger,
            checkpoint_callback=checkpoint_callback,
            early_stop_callback=early_stop_callback,
            distributed_backend=dict_args['distributed_backend'])

    data_provider = DataProvider(**dict_args)
    n_fft, hop_length, num_frame = [dict_args[key] for key in
                                    ['n_fft', 'hop_length', 'num_frame']]
    train_dataloader = data_provider.get_train_dataloader(n_fft, hop_length, num_frame)
    valid_dataloader = data_provider.get_valid_dataloader(n_fft, hop_length, num_frame)

    trainer.fit(model, train_dataloader, valid_dataloader)
import torch
import sys
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from models.utils import get_model
from models.config import TOKENS_RAW_CUTOFF
from models.seq2seqattn import init_weights, EncRnn, DecRnn, Seq2SeqAttn
from collections import deque
import random
import torch.optim as optim
import math

# Load in models and helper functions
w2v_model = get_model()

# w2ind from w2v
w2ind = {
    token: token_index
    for token_index, token in enumerate(w2v_model.wv.index2word)
}

# sorted vocab words
assert w2v_model.vocabulary.sorted_vocab == True
word_counts = {
    word: vocab_obj.count
    for word, vocab_obj in w2v_model.wv.vocab.items()
}
word_counts = sorted(word_counts.items(), key=lambda x: -x[1])
words = [t[0] for t in word_counts]

# sentence marker token inds
sos_ind = w2ind['<sos>']
def objective(arguments):
    """ Main Pipeline for training and cross-validation.
    ToDo - Testing will be done separately in test.py. """

    """ Setup result directory and enable logging to file in it """
    outdir = make_results_dir(arguments)
    logger.init(outdir, logging.INFO)
    logger.info('Arguments:\n{}'.format(pformat(arguments)))

    """ Initialize Tensorboard """
    tensorboard_writer = initialize_tensorboard(outdir)

    """ Set random seed throughout python, pytorch and numpy """
    logger.info('Using Random Seed value as: %d' % arguments['random_seed'])
    torch.manual_seed(arguments['random_seed'])  # Set for pytorch, used for cuda as well.
    random.seed(arguments['random_seed'])  # Set for python
    np.random.seed(arguments['random_seed'])  # Set for numpy

    """ Set device - cpu or gpu """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logger.info(f'Using device - {device}')

    """ Load Model with weights (if available) """
    model: torch.nn.Module = get_model(arguments.get('model_args')).to(device)

    """ Create loss function """
    criterion = create_loss(arguments['loss_args'])

    """ Create optimizer """
    optimizer = create_optimizer(model.parameters(), arguments['optimizer_args'])

    """ Load parameters for the Dataset """
    dataset: BaseDataset = create_dataset(arguments['dataset_args'],
                                          arguments['train_data_args'],
                                          arguments['val_data_args'])

    """ Generate all callbacks """
    callbacks: List[Callbacks] = generate_callbacks(arguments, dataset,
                                                    device, outdir)

    """ Debug the inputs to model and save graph to tensorboard """
    dataset.debug()
    dummy_input = (torch.rand(
        1,
        arguments['dataset_args']['name'].value['channels'],
        *arguments['dataset_args']['name'].value['image_size'],
    )).to(device)
    tensorboard_writer.save_graph(model, dummy_input)

    """ Pipeline - loop over the dataset multiple times """
    max_validation_accuracy = 0
    itr = 0

    best_model_path = None
    delete_old_models = True

    run_callbacks(callbacks, model=model, optimizer=optimizer,
                  mode=CallbackMode.ON_TRAIN_BEGIN)
    for epoch in range(arguments['nb_epochs']):
        """ Train the model """
        train_data_args = arguments['train_data_args']
        if train_data_args['to_train']:
            train_dataloader = dataset.train_dataloader
            progress_bar = ProgressBar(
                target=len(train_dataloader),
                clear=True,
                description=f"Training {epoch + 1}/{arguments['nb_epochs']}: ")
            loss_running_average = RunningAverage()

            run_callbacks(callbacks, model=model, optimizer=optimizer,
                          mode=CallbackMode.ON_EPOCH_BEGIN, epoch=epoch)
            model.train()
            for i, data in enumerate(train_dataloader, 0):
                # get the inputs
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # Forward Pass
                outputs = model(inputs)
                classification_loss = criterion(outputs, labels)
                tensorboard_writer.save_scalar('Classification_Loss',
                                               classification_loss.item(), itr)
                classification_loss.backward()
                optimizer.step()

                # Compute running loss. Not exact but efficient.
                running_loss = loss_running_average.add_new_sample(
                    classification_loss.item())
                progress_bar.update(i + 1, [
                    ('current loss', classification_loss.item()),
                    ('running loss', running_loss),
                ])
                tensorboard_writer.save_scalar('Training_Loss',
                                               classification_loss, itr)
                itr += 1

            # Callbacks ON_EPOCH_END should run only when training is enabled,
            # hence the call here.
            run_callbacks(callbacks, model=model, optimizer=optimizer,
                          mode=CallbackMode.ON_EPOCH_END, epoch=epoch)

        """ Validate the model """
        val_data_args = arguments['val_data_args']
        if val_data_args['validate_step_size'] > 0 and \
                epoch % val_data_args['validate_step_size'] == 0:
            correct, total = 0, 0
            validation_dataloader = dataset.validation_dataloader
            progress_bar = ProgressBar(
                target=len(validation_dataloader),
                clear=True,
                description=f"Validating {epoch + 1}/{arguments['nb_epochs']}: ")
            model.eval()
            with torch.no_grad():
                for i, data in enumerate(validation_dataloader, 0):
                    inputs, labels = data
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    outputs = model(inputs)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

                    progress_bar.update(i + 1, [
                        ('Batch Accuracy', 100 * correct / total),
                    ])

            val_accuracy = 100 * correct / total
            tensorboard_writer.save_scalar('Validation_Accuracy', val_accuracy, itr)
            logger.info(
                f'Accuracy of the network on the {dataset.get_val_dataset_size} '
                f'validation images: {val_accuracy} %')

            """ Save Model """
            if val_accuracy > max_validation_accuracy:
                if delete_old_models and best_model_path:
                    delete_old_file(best_model_path)
                best_model_path = os.path.join(
                    outdir,
                    f'epoch_{epoch:04}-model-val_accuracy_{val_accuracy}.pth')
                torch.save(model.state_dict(), best_model_path)
                max_validation_accuracy = val_accuracy

        tensorboard_writer.flush()

        # Exit the loop if training is not needed
        if not train_data_args['to_train']:
            break

    run_callbacks(callbacks, model=model, optimizer=optimizer,
                  mode=CallbackMode.ON_TRAIN_END)

    logger.info('Finished Training')
    close_tensorboard()
    logger.info(f'Max Validation accuracy is {max_validation_accuracy}')
    return max_validation_accuracy  # Returned in case hyperparameter optimization is added later.
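
# RunningAverage is used above but not defined in this snippet. The source
# comment "Not exact but efficient" suggests a smoothed rather than exact
# mean; a minimal exponential-moving-average sketch under that assumption:
class RunningAverage:
    def __init__(self, smoothing=0.98):
        self.smoothing = smoothing
        self.avg = None

    def add_new_sample(self, value):
        # The first sample initializes the average; later samples decay into it.
        if self.avg is None:
            self.avg = value
        else:
            self.avg = self.smoothing * self.avg + (1.0 - self.smoothing) * value
        return self.avg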