def extract_features(self):
    '''
    Extract Haralick texture features from every image in the training set,
    create the corresponding label for each image (1 = mass, 0 = no mass),
    and save the feature and label files. If features were already extracted,
    they are loaded from disk instead.
    '''
    mass_images = os.listdir(self._mass_path)
    count_training = 1
    if not ut.check_file():  # check whether the features have already been extracted
        print("-------------------- [STATUS] Extracting Haralick textures -----------")
        for mass in mass_images:
            # read the training image
            image = cv.imread(os.path.join(self._mass_path, mass), cv.IMREAD_GRAYSCALE)
            self._train_labels.append(1)
            # extract Haralick texture features from the image
            features = self._texture_features(image)
            # append the feature vector and label
            self._train_features.append(features)
            print("Extracting features from image number " + str(count_training))
            count_training += 1

        for nomass in self._nomass_images:
            image = cv.imread(os.path.join(self._nomass_path, nomass), cv.IMREAD_GRAYSCALE)
            self._train_labels.append(0)
            # extract Haralick texture features from the image
            features = self._texture_features(image)
            # append the feature vector and label
            self._train_features.append(features)
            print("Extracting features from image number " + str(count_training))
            count_training += 1

        print("-------------------- [NOTIFY] Features extracted ---------------------")
        ut.store(self._train_features, self._train_labels)
    else:
        print("-------------------- [STATUS] Loading features and labels ------------")
        self._train_features, self._train_labels = ut.load()
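# The method above delegates the actual texture computation to self._texture_features.
# Below is a minimal sketch of what that helper might look like, assuming the mahotas
# library is used for the 13 Haralick descriptors; this helper and the mahotas
# dependency are assumptions, not the repository's actual implementation.

import mahotas


def _texture_features_sketch(image):
    # compute the 13 Haralick features over the four 2-D co-occurrence directions
    # and average them into a single, roughly rotation-invariant descriptor
    textures = mahotas.features.haralick(image)
    return textures.mean(axis=0)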
def train(model, train_loader, val_loader, optimizer, scheduler, args):
    """
    TODO: add timings for training
    """
    if args.tensorboard:
        writer = SummaryWriter(args.snap_dir)

    step = 0
    header_msg = f'| Epoch | {"TRAIN": <14}{"Loss": >4} {"Time": >12} | {"VALIDATION": <14}{"Loss": >4} | '
    header_msg += f'{"Component": >10} | {"All Trained": >12} | {"Rho": >32} | ' if args.boosted else ''
    header_msg += f'{"Improved": >10} |'
    logger.info('|' + "-" * (len(header_msg) - 2) + '|')
    logger.info(header_msg)
    logger.info('|' + "-" * (len(header_msg) - 2) + '|')

    best_loss = np.array([np.inf] * args.num_components)
    early_stop_count = 0
    converged_epoch = 0  # corrects the annealing schedule when a boosted component converges early
    if args.boosted:
        model.component = 0
        prev_lr = []
        for c in range(args.num_components):
            # only the component currently being trained receives a non-zero learning rate
            optimizer.param_groups[c]['lr'] = args.learning_rate if c == model.component else 0.0
            prev_lr.append(args.learning_rate)

    for epoch in range(1, args.epochs + 1):
        model.train()
        train_loss = []
        train_times = []

        for batch_id, (x, y) in enumerate(train_loader):
            if batch_id > 100:
                break  # cap the number of batches per epoch

            t_start = time.time()
            optimizer.zero_grad()
            x = x.to(args.device)
            if args.y_condition:
                y = y.to(args.device)
            else:
                y = None

            # initialize ActNorm on the first steps
            if step < args.num_init_batches:
                with torch.no_grad():
                    if args.boosted:
                        for c in range(args.num_components):
                            model(x=x, y_onehot=y, components=c)
                    else:
                        model(x=x, y_onehot=y)
                step += 1
                continue

            if args.boosted:
                z_g, mu_g, var_g, ldj_g, y_logits = model(x=x, y_onehot=y, components="c")
                fixed = '-c' if model.all_trained else '1:c-1'
                z_G, mu_G, var_G, ldj_G, _ = model(x=x, y_onehot=y, components=fixed)
                losses = compute_boosted_loss(mu_g, var_g, z_g, ldj_g, mu_G, var_G, z_G, ldj_G,
                                              y, y_logits, dim_prod=np.prod(x.shape[1:]), args=args)
            else:
                z, z_mu, z_var, logdet, y_logits = model(x, y)
                losses = compute_loss(z, z_mu, z_var, logdet, y, y_logits, np.prod(x.shape[1:]), args)

            losses["total_loss"].backward()

            if args.max_grad_clip > 0:
                torch.nn.utils.clip_grad_value_(model.parameters(), args.max_grad_clip)
            if args.max_grad_norm > 0:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                if args.tensorboard:
                    writer.add_scalar("grad_norm/grad_norm", grad_norm, step)

            if args.boosted:
                prev_lr[model.component] = optimizer.param_groups[model.component]['lr']

            if args.tensorboard:
                for i in range(len(optimizer.param_groups)):
                    writer.add_scalar(f'lr/lr_{i}', optimizer.param_groups[i]['lr'], step)

            optimizer.step()
            if not args.no_lr_schedule:
                if args.lr_schedule == "plateau":
                    scheduler.step(metrics=losses['total_loss'])
                else:
                    scheduler.step()

            train_times.append(time.time() - t_start)
            train_loss.append(losses['total_loss'])
            if args.tensorboard:
                writer.add_scalar('step_loss/total_loss', losses['total_loss'].item(), step)
                writer.add_scalar('step_loss/bpd', losses['bpd'].item(), step)
                if args.y_condition:
                    writer.add_scalar('step_loss/loss_classes', losses['loss_classes'].item(), step)

            step += 1

        # Validation
        val_loss = evaluate(model, val_loader, args)

        # Sampling
        if epoch == 1 or epoch % args.sample_interval == 0:
            sample(model, args, step=step)

        # Reporting
        train_times = np.array(train_times)
        train_loss = torch.stack(train_loss).mean().item()
        epoch_msg = f'| {epoch: <5} | {train_loss:18.3f} {np.mean(train_times):12.1f} | {val_loss:18.3f} | '
        rho_str = '[' + ', '.join([f"{val:4.2f}" for val in model.rho.data]) + ']' if args.boosted else ''
        epoch_msg += f'{model.component: >10} | {str(model.all_trained)[0]: >12} | {rho_str: >32} | ' if args.boosted else ''
        if args.tensorboard:
            writer.add_scalar('epoch_loss/validation', val_loss, epoch)
            writer.add_scalar('epoch_loss/train', train_loss, epoch)

        # Assess convergence
        component = model.component if args.boosted else 0
        converged, model_improved, early_stop_count, best_loss = check_convergence(
            early_stop_count, val_loss, best_loss, epoch - converged_epoch, component, args)
        epoch_msg += f'{"T" if model_improved else "": >10}'
        if model_improved:
            fname = f'model_c{model.component}.pt' if args.boosted else 'model.pt'
            save(model, optimizer, args.snap_dir + fname, scheduler)

        if converged:
            logger.info(epoch_msg + ' |')
            if args.boosted:
                converged_epoch = epoch
                # save the LR for the LR scheduler in case this component is trained again
                prev_lr[model.component] = optimizer.param_groups[model.component]['lr']

                # revert to the last best version of the model and update rho
                load(model, optimizer, args.snap_dir + f'model_c{model.component}.pt', args)
                model.update_rho(train_loader)
                if model.component > 0 or model.all_trained:
                    logger.info('Rho Updated: ' + ' '.join([f"{val:1.2f}" for val in model.rho.data]))

                train_components_once = args.epochs <= (args.epochs_per_component * args.num_components)
                if model.component == (args.num_components - 1) and (model.all_trained or train_components_once):
                    # stop the full model after all components have been trained
                    logger.info(f"Model converged, stopping training and saving final model to: {args.snap_dir + 'model.pt'}")
                    model.all_trained = True
                    save(model, optimizer, args.snap_dir + 'model.pt', scheduler)
                    break

                # else, not done training: save the model with the updated rho
                save(model, optimizer, args.snap_dir + f'model_c{model.component}.pt', scheduler)

                # reset early_stop_count and train the next component
                model.increment_component()
                early_stop_count = 0

                # freeze all but the new component being trained
                for c in range(args.num_components):
                    optimizer.param_groups[c]['lr'] = prev_lr[c] if c == model.component else 0.0
                for n, param in model.named_parameters():
                    param.requires_grad = n.startswith(f"flow_param.{model.component}") or not n.startswith("flow_param")
            else:
                # if a standard model converges once, break
                logger.info("Model converged, stopping training.")
                break
        else:
            logger.info(epoch_msg + ' |')
            if epoch == args.epochs:
                if args.boosted:
                    # Save the best version of the model trained up to the current component as model.pt.
                    # This protects against runs that are trained/re-trained but don't run long enough
                    # for all components to converge / train completely.
                    copyfile(args.snap_dir + f'model_c{model.component}.pt', args.snap_dir + 'model.pt')
                    logger.info(f"Resaving last improved version of {f'model_c{model.component}.pt'} as 'model.pt' for future testing")
                else:
                    logger.info(f"Stopping training after {epoch} epochs of training.")

    logger.info('|' + "-" * (len(header_msg) - 2) + '|\n\n')
    if args.tensorboard:
        writer.close()
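# check_convergence is defined elsewhere in the repository. Below is a minimal sketch
# of per-component early stopping that is consistent with how it is called above; the
# exact logic and the args.early_stopping_epochs patience flag are assumptions.

def check_convergence_sketch(early_stop_count, val_loss, best_loss, epochs_since_converged, component, args):
    # a component "improves" when its validation loss beats the best seen so far
    model_improved = val_loss < best_loss[component]
    if model_improved:
        best_loss[component] = val_loss
        early_stop_count = 0
    else:
        early_stop_count += 1

    # converge once patience runs out or the per-component epoch budget is spent
    converged = (early_stop_count >= args.early_stopping_epochs
                 or epochs_since_converged >= args.epochs_per_component)
    return converged, model_improved, early_stop_count, best_loss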
def train(model, data_loaders, optimizer, scheduler, args):
    writer = SummaryWriter(args.snap_dir) if args.tensorboard else None

    header_msg = f'| Epoch | {"TRAIN": <14}{"Loss": >4} | {"VALIDATION": <14}{"Loss": >4} | {"TIMING":<8}{"(sec)":>4} | {"Improved": >8} |'
    header_msg += f' {"Component": >9} | {"All Trained": >11} | {"Rho": >{min(8, args.num_components) * 6}} |' if args.boosted else ''
    logger.info('|' + "-" * (len(header_msg) - 2) + '|')
    logger.info(header_msg)
    logger.info('|' + "-" * (len(header_msg) - 2) + '|')

    best_loss = np.array([np.inf] * args.num_components)
    early_stop_count = 0
    converged_epoch = 0  # for boosting, helps keep track of how long the current component has been training
    if args.boosted:
        # model.component = 0
        prev_lr = init_boosted_lr(model, optimizer, args)
    else:
        prev_lr = []

    grad_norm = None
    epoch_times = []
    epoch_train = []
    epoch_valid = []
    pval_loss = 0.0
    val_losses = {'g_nll': 9999999.9}
    step = 0
    for epoch in range(args.init_epoch, args.epochs + 1):
        model.train()
        train_loss = []
        t_start = time.time()
        for batch_id, (x, _) in enumerate(data_loaders['train']):
            # initialize data and optimizer
            x = x.to(args.device)
            optimizer.zero_grad()

            # initialize ActNorm on the first steps
            if (args.flow == 'glow' or args.component_type == 'glow') and step < args.num_init_batches:
                with torch.no_grad():
                    if args.boosted:
                        for i in range(args.num_components):
                            model(x=x, components=i)
                    else:
                        model(x=x)
                step += 1
                continue

            # compute loss and gradients
            losses = compute_kl_pq_loss(model, x, args)
            train_loss.append(losses['nll'])
            losses['nll'].backward()
            if args.max_grad_norm > 0:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            # adjust learning rates for the boosted model, keeping fixed components frozen
            if args.boosted:
                update_learning_rates(prev_lr, model, optimizer, step, args)

            # batch-level reporting
            batch_reporting(writer, optimizer, losses, grad_norm, step, args)

            # perform the gradient update, then modify the learning rate according to the schedule
            optimizer.step()
            if not args.no_lr_schedule:
                prev_lr = update_scheduler(prev_lr, model, optimizer, scheduler, val_losses['g_nll'], step, args)

                if args.lr_schedule == "test":
                    if step % 50 == 0:
                        pval_loss = evaluate(model, data_loaders['val'], args)['nll']
                        if args.tensorboard:
                            writer.add_scalar('step/val_nll', pval_loss, step)

            step += 1

        # Validation, collect results
        val_losses = evaluate(model, data_loaders['val'], args)
        train_loss = torch.stack(train_loss).mean().item()
        epoch_times.append(time.time() - t_start)
        epoch_train.append(train_loss)
        epoch_valid.append(val_losses['nll'])

        # Assess convergence
        component = (model.component, model.all_trained) if args.boosted else 0
        converged, model_improved, early_stop_count, best_loss = check_convergence(
            early_stop_count, val_losses, best_loss, epoch - converged_epoch, component, args)
        if model_improved:
            fname = f'model_c{model.component}.pt' if args.boosted and args.save_intermediate_checkpoints else 'model.pt'
            save(model, optimizer, args.snap_dir + fname, scheduler)

        # epoch-level reporting
        epoch_msg = epoch_reporting(writer, model, train_loss, val_losses, epoch_times, model_improved, epoch, args)

        if converged:
            logger.info(epoch_msg + ' |')
            logger.info("-" * (len(header_msg)))
            if args.boosted:
                converged_epoch = epoch

                # revert to the last best version of the model and update rho
                fname = f'model_c{model.component}.pt' if args.save_intermediate_checkpoints else 'model.pt'
                load(model=model, optimizer=optimizer, path=args.snap_dir + fname, args=args,
                     scheduler=scheduler, verbose=False)
                model.update_rho(data_loaders['train'])

                last_component = model.component == (args.num_components - 1)
                no_fine_tuning = args.epochs <= args.epochs_per_component * args.num_components
                fine_tuning_done = model.all_trained and last_component  # no early stopping if burn-in is employed
                if (fine_tuning_done or no_fine_tuning) and last_component:
                    # stop the full model after all components have been trained
                    logger.info(f"Model converged, training complete, saving: {args.snap_dir + 'model.pt'}")
                    model.all_trained = True
                    save(model, optimizer, args.snap_dir + 'model.pt', scheduler)
                    break

                # else, not done training: save the model with the updated rho
                save(model, optimizer, args.snap_dir + fname, scheduler)

                # temporary: look at results after each component is trained
                test_loss = evaluate(model, data_loaders['test'], args)
                logger.info(f"Loss after training {model.component + 1} components: {test_loss['nll']:8.3f}")
                logger.info("-" * (len(header_msg)))

                # reset the optimizer, scheduler, and early_stop_count, then train the next component
                model.increment_component()
                early_stop_count = 0
                val_losses = {'g_nll': 9999999.9}
                optimizer, scheduler = init_optimizer(model, args, verbose=False)
                prev_lr = init_boosted_lr(model, optimizer, args)
            else:
                # if a standard model converges once, break
                logger.info("Model converged, stopping training.")
                break
        else:
            logger.info(epoch_msg + ' |')
            if epoch == args.epochs:
                if args.boosted and args.save_intermediate_checkpoints:
                    # Save the best version of the model trained up to the current component as model.pt.
                    # This protects against runs that are trained/re-trained but don't run long enough
                    # for all components to converge / train completely.
                    copyfile(args.snap_dir + f'model_c{model.component}.pt', args.snap_dir + 'model.pt')
                    logger.info(f"Resaving last improved version of {f'model_c{model.component}.pt'} as 'model.pt' for future testing")
                else:
                    logger.info(f"Stopping training after {epoch} epochs of training.")

    logger.info('|' + "-" * (len(header_msg) - 2) + '|\n')
    if args.tensorboard:
        writer.close()

    epoch_times, epoch_train, epoch_valid = np.array(epoch_times), np.array(epoch_train), np.array(epoch_valid)
    timing_msg = f"Stopped after {epoch_times.shape[0]} epochs. "
    timing_msg += f"Average train time per epoch: {np.mean(epoch_times):.2f} +/- {np.std(epoch_times, ddof=1):.2f}"
    logger.info(timing_msg + '\n')

    if args.save_results:
        np.savetxt(args.snap_dir + '/train_loss.csv', epoch_train, fmt='%f', delimiter=',')
        np.savetxt(args.snap_dir + '/valid_loss.csv', epoch_valid, fmt='%f', delimiter=',')
        np.savetxt(args.snap_dir + '/epoch_times.csv', epoch_times, fmt='%f', delimiter=',')
        with open(args.exp_log, 'a') as ff:
            timestamp = str(datetime.datetime.now())[0:19].replace(' ', '_')
            setup_msg = '\n'.join([timestamp, args.snap_dir]) + '\n' + repr(args)
            print('\n' + setup_msg + '\n' + timing_msg, file=ff)
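# init_boosted_lr and update_learning_rates are not shown in this file. Below is a
# minimal sketch of init_boosted_lr that mirrors the inline boosted initialization in
# the first train() function above: only the component currently being trained gets a
# non-zero learning rate, so all previously fitted components stay frozen. The actual
# implementation in the repository may differ.

def init_boosted_lr_sketch(model, optimizer, args):
    prev_lr = []
    for c in range(args.num_components):
        # one optimizer param_group per component is assumed
        optimizer.param_groups[c]['lr'] = args.learning_rate if c == model.component else 0.0
        prev_lr.append(args.learning_rate)
    return prev_lr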
def main(main_args=None):
    """
    use main_args to run this script as a function in another script
    """

    # =========================================================================
    # PARSE EXPERIMENT SETTINGS, SETUP SNAPSHOTS DIRECTORY, LOGGING
    # =========================================================================
    args, kwargs = parse_args(main_args)

    # =========================================================================
    # LOAD DATA
    # =========================================================================
    logger.info('LOADING DATA:')
    data_loaders, args = load_density_dataset(args)

    # =========================================================================
    # SAVE EXPERIMENT SETTINGS
    # =========================================================================
    logger.info(f'EXPERIMENT SETTINGS:\n{args}\n')
    torch.save(args, os.path.join(args.snap_dir, 'config.pt'))

    # =========================================================================
    # INITIALIZE MODEL AND OPTIMIZATION
    # =========================================================================
    model = init_model(args)
    optimizer, scheduler = init_optimizer(model, args)
    num_params = sum([param.nelement() for param in model.parameters()])
    logger.info(f"MODEL:\nNumber of model parameters={num_params}\n{model}\n")

    if args.load:
        logger.info(f'LOADING CHECKPOINT FROM PRE-TRAINED MODEL: {args.load}')
        init_with_args = args.flow == "boosted" and args.loaded_init_component is not None and args.loaded_all_trained is not None
        load(model=model, optimizer=optimizer, path=args.load, args=args,
             init_with_args=init_with_args, scheduler=scheduler)
        logger.info('Warning: boosted models may only be loaded to train a new component (until the PyTorch bug is fixed); '
                    'the optimizer and scheduler will be reset. Non-boosted models cannot be loaded at all (loading will fail).')
        optimizer, scheduler = init_optimizer(model, args, verbose=False)

    # =========================================================================
    # TRAINING
    # =========================================================================
    if args.epochs > 0:
        logger.info('TRAINING:')
        if args.tensorboard:
            logger.info(f'Follow progress on tensorboard: tb {args.snap_dir}')

        train(model, data_loaders, optimizer, scheduler, args)

    # =========================================================================
    # VALIDATION
    # =========================================================================
    logger.info('VALIDATION:')
    load(model=model, optimizer=optimizer, path=args.snap_dir + 'model.pt', args=args)
    val_loss = evaluate(model, data_loaders['val'], args, results_type='Validation')

    # =========================================================================
    # TESTING
    # =========================================================================
    if args.testing:
        logger.info("TESTING:")
        test_loss = evaluate(model, data_loaders['test'], args, results_type='Test')
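# As the docstring notes, main() can also be driven from another script by passing
# main_args. A hypothetical example is sketched below; the module name and the flag
# spellings are placeholders and must match whatever parse_args() actually defines.
#
#   import density_experiment  # hypothetical module name for this script
#   density_experiment.main(['--flow', 'boosted',
#                            '--num_components', '4',
#                            '--epochs', '100',
#                            '--tensorboard'])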
def train_boosted(train_loader, val_loader, model, optimizer, scheduler, args):
    train_times = []

    train_loss = []
    train_rec = []
    train_G = []
    train_p = []
    train_entropy = []

    val_loss = []
    val_rec = []
    val_kl = []

    # for early stopping
    best_loss = np.array([np.inf] * args.num_components)
    best_tr_ratio = np.array([-np.inf] * args.num_components)
    early_stop_count = 0
    converged_epoch = 0  # corrects the annealing schedule when a component converges early
    v_loss = 9999999.9

    # initialize learning rates for boosted components
    prev_lr = init_boosted_lr(model, optimizer, args)

    args.step = 0
    for epoch in range(args.init_epoch, args.epochs + 1):
        # compute the annealing rate for the KL loss term
        beta = kl_annealing_rate(epoch - converged_epoch, model.component, model.all_trained, args)

        # occasionally sample from all components to keep the decoder from focusing solely on the new component
        prob_all = sample_from_all_prob(epoch - converged_epoch, model.component, model.all_trained, args)

        # Train model
        t_start = time.time()
        tr_loss, tr_rec, tr_G, tr_p, tr_entropy, tr_ratio, prev_lr = train_epoch_boosted(
            epoch, train_loader, model, optimizer, scheduler, beta, prob_all, prev_lr, v_loss, args)
        train_times.append(time.time() - t_start)
        train_loss.append(tr_loss)
        train_rec.append(tr_rec)
        train_G.append(tr_G)
        train_p.append(tr_p)
        train_entropy.append(tr_entropy)

        # Evaluate model
        v_loss, v_rec, v_kl = evaluate(val_loader, model, args, epoch=epoch)
        val_loss.append(v_loss)
        val_rec.append(v_rec)
        val_kl.append(v_kl)

        # Assess convergence
        component_converged, model_improved, early_stop_count, best_loss, best_tr_ratio = check_convergence(
            early_stop_count, v_loss, best_loss, tr_ratio, best_tr_ratio, epoch - converged_epoch, model, args)

        # epoch-level reporting
        epoch_msg = epoch_reporting(model, tr_loss, tr_rec, tr_G, tr_p, tr_entropy, tr_ratio,
                                    v_loss, v_rec, v_kl, beta, prob_all, train_times, epoch, model_improved, args)

        if model_improved:
            fname = f'model_c{model.component}.pt' if args.boosted and args.save_intermediate_checkpoints else 'model.pt'
            save(model, optimizer, args.snap_dir + fname, scheduler)

        if component_converged:
            logger.info(epoch_msg + f'{"| ": >4}')
            logger.info("-" * 206)

            converged_epoch = epoch

            # revert to the last best version of the model and update rho
            fname = f'model_c{model.component}.pt' if args.save_intermediate_checkpoints else 'model.pt'
            load(model=model, optimizer=optimizer, path=args.snap_dir + fname, args=args,
                 scheduler=scheduler, verbose=False)
            model.update_rho(train_loader)

            last_component = model.component == (args.num_components - 1)
            no_fine_tuning = args.epochs <= args.epochs_per_component * args.num_components
            fine_tuning_done = model.all_trained and last_component
            if (fine_tuning_done or no_fine_tuning) and last_component:
                # stop the full model after all components have been trained
                logger.info(f"Model converged, training complete, saving: {args.snap_dir + 'model.pt'}")
                model.all_trained = True
                save(model, optimizer, args.snap_dir + 'model.pt', scheduler)
                break

            # else, not done training: save the model with the updated rho
            save(model, optimizer, args.snap_dir + f'model_c{model.component}.pt', scheduler)

            # reset early_stop_count and train the next component
            model.increment_component()
            early_stop_count = 0
            v_loss = 9999999.9
            optimizer, scheduler = init_optimizer(model, args, verbose=False)
            prev_lr = init_boosted_lr(model, optimizer, args)
        else:
            logger.info(epoch_msg + f'{"| ": >4}')
            if epoch == args.epochs:
                if args.boosted and args.save_intermediate_checkpoints:
                    # Save the best version of the model trained up to the current component as model.pt.
                    # This protects against runs that are trained/re-trained but don't run long enough
                    # for all components to converge / train completely.
                    copyfile(args.snap_dir + f'model_c{model.component}.pt', args.snap_dir + 'model.pt')
                    logger.info(f"Resaving last improved version of {f'model_c{model.component}.pt'} as 'model.pt' for future testing")
                else:
                    logger.info(f"Stopping training after {epoch} epochs of training.")

    train_loss = np.hstack(train_loss)
    train_rec = np.hstack(train_rec)
    train_G = np.hstack(train_G)
    train_p = np.hstack(train_p)
    train_entropy = np.hstack(train_entropy)

    val_loss = np.array(val_loss)
    val_rec = np.array(val_rec)
    val_kl = np.array(val_kl)
    train_times = np.array(train_times)

    return train_loss, train_rec, train_G, train_p, train_entropy, val_loss, val_rec, val_kl, train_times
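# kl_annealing_rate and sample_from_all_prob are defined elsewhere. Below is a minimal
# sketch of a linear KL warm-up consistent with how kl_annealing_rate is called above;
# the args.annealing_schedule length and the restart-per-component behaviour are
# assumptions, not the repository's exact schedule.

def kl_annealing_rate_sketch(epochs_since_prev_convergence, component, all_trained, args):
    # once every component has been trained (fine-tuning phase), use the full KL weight
    if all_trained:
        return 1.0
    # otherwise warm beta up linearly, restarting each time a new component begins training
    return min(1.0, epochs_since_prev_convergence / max(1, args.annealing_schedule))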
def main(main_args=None):
    """
    use main_args to run this script as a function in another script
    """

    # =========================================================================
    # PARSE EXPERIMENT SETTINGS, SETUP SNAPSHOTS DIRECTORY, LOGGING
    # =========================================================================
    args, kwargs = parse_args(main_args)

    # =========================================================================
    # LOAD DATA
    # =========================================================================
    logger.info('LOADING DATA:')
    train_loader, val_loader, test_loader, args = load_image_dataset(args, **kwargs)

    # =========================================================================
    # SAVE EXPERIMENT SETTINGS
    # =========================================================================
    logger.info(f'EXPERIMENT SETTINGS:\n{args}\n')
    torch.save(args, os.path.join(args.snap_dir, 'config.pt'))

    # =========================================================================
    # INITIALIZE MODEL AND OPTIMIZATION
    # =========================================================================
    model = init_model(args)
    optimizer, scheduler = init_optimizer(model, args)
    num_params = sum([param.nelement() for param in model.parameters()])
    logger.info(f"MODEL:\nNumber of model parameters={num_params}\n{model}\n")

    if args.load:
        logger.info(f'LOADING CHECKPOINT FROM PRE-TRAINED MODEL: {args.load}')
        init_with_args = args.flow == "boosted" and args.loaded_init_component is not None and args.loaded_all_trained is not None
        load(model, optimizer, args.load, args, init_with_args)

    # =========================================================================
    # TRAINING
    # =========================================================================
    training_required = args.epochs > 0 or args.load is None
    if training_required:
        logger.info('TRAINING:')
        if args.tensorboard:
            logger.info(f'Follow progress on tensorboard: tb {args.snap_dir}')

        train_loss, val_loss = train(train_loader, val_loader, model, optimizer, scheduler, args)

    # =========================================================================
    # VALIDATION
    # =========================================================================
    logger.info('VALIDATION:')
    if training_required:
        load(model, optimizer, args.snap_dir + 'model.pt', args)
    val_loss, val_rec, val_kl = evaluate(val_loader, model, args, results_type='Validation')

    # =========================================================================
    # TESTING
    # =========================================================================
    if args.testing:
        logger.info("TESTING:")
        test_loss, test_rec, test_kl = evaluate(test_loader, model, args, results_type='Test')
        test_nll = evaluate_likelihood(test_loader, model, args, S=args.nll_samples, MB=args.nll_mb, results_type='Test')
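# evaluate_likelihood(..., S=args.nll_samples, MB=args.nll_mb) suggests an
# importance-sampled estimate of the test log-likelihood. Below is a minimal sketch of
# only the final aggregation step, assuming the per-sample log importance weights
# log p(x, z_s) - log q(z_s | x) have already been computed (that surrounding machinery
# is not shown in this file, so the helper name and interface here are assumptions).

import torch


def importance_sampled_nll_sketch(log_w):
    # log_w: tensor of shape [S, batch_size], one log weight per importance sample
    S = log_w.shape[0]
    # log p(x) ~= logsumexp_s(log w_s) - log S, averaged into a negative log-likelihood
    log_px = torch.logsumexp(log_w, dim=0) - torch.log(torch.tensor(float(S)))
    return -log_px.mean()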