def train(config, save_path):
    train, test, meta_data = datasets(dataset=config['dataset'],
                                      batch_size=config['batch_size'],
                                      augmented=config['augmented'],
                                      preprocessing='center',
                                      seed=config['seed'])

    pytorch_model_builder = models.__dict__[config['model']]
    pytorch_model = pytorch_model_builder(**config.get('model_kwargs', {}))
    summary(pytorch_model)

    loss_function = torch.nn.MSELoss()  # Because logsoftmax. Be careful!
    optimizer = torch.optim.SGD(pytorch_model.parameters(), lr=config['lr'])
    model = Model(pytorch_model, optimizer, loss_function, [acc])

    callbacks = []
    callbacks.append(LRSchedule(lr_schedule=config['lr_schedule']))

    # Call training loop (warning: using test as valid. Please don't do this)
    steps_per_epoch = int(len(meta_data['x_train']) / config['batch_size'])
    training_loop(model=model,
                  train=train,
                  valid=test,
                  save_path=save_path,
                  n_epochs=config['n_epochs'],
                  save_freq=1,
                  reload=config['reload'],
                  use_tb=True,
                  steps_per_epoch=steps_per_epoch,
                  custom_callbacks=callbacks)
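# Example usage (a minimal sketch): the keys below are exactly the ones read by
# train() above; the concrete values, the 'cifar10' dataset name, the 'SimpleCNN'
# model name and the lr_schedule format are illustrative assumptions, not taken
# from this project.
example_config = {
    'dataset': 'cifar10',                   # forwarded to datasets()
    'batch_size': 128,
    'augmented': False,
    'seed': 777,
    'model': 'SimpleCNN',                   # looked up in models.__dict__
    'model_kwargs': {},                     # optional, forwarded to the model builder
    'lr': 0.1,
    'lr_schedule': [(0, 0.1), (50, 0.01)],  # format assumed by LRSchedule; illustrative
    'n_epochs': 100,
    'reload': False,
}
# train(example_config, save_path="results/example_run")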
def evaluate(save_path, checkpoint_name="weights.ckpt"):
    # Load config
    config = parse_gin_config(os.path.join(save_path, "config.gin"))
    gin.parse_config_files_and_bindings([os.path.join(save_path, "config.gin")],
                                        bindings=[""])

    # Dynamically create dataset generators
    train, valid, test, meta_data = get_dataset(batch_size=config['train.batch_size'],
                                                seed=config['train.seed'])

    # Load model (a bit hacky, but necessary because load_from_checkpoint seems to fail)
    ckpt_path = os.path.join(save_path, checkpoint_name)
    ckpt = torch.load(ckpt_path)
    model = models.__dict__[config['train.model']]()
    summary(model)
    pl_module = SupervisedLearning(model, lr=0.0)
    pl_module.load_state_dict(ckpt['state_dict'])
    # NOTE: load_from_checkpoint fails here, probably due to a bug in PyTorch Lightning;
    # the code above does the equivalent manually.
    # ckpt_path = os.path.join(save_path, checkpoint_name)
    # pl_module = SupervisedLearning.load_from_checkpoint(ckpt_path)

    trainer = pl.Trainer()
    results, = trainer.test(model=pl_module, test_dataloaders=test, ckpt_path=ckpt_path)
    logger.info(results)

    with open(os.path.join(save_path, "eval_results_{}.json".format(checkpoint_name)), "w") as f:
        json.dump(results, f)
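# Minimal standalone sketch of the manual checkpoint loading done in evaluate() above.
# It assumes only that the checkpoint is a regular PyTorch Lightning .ckpt file, which
# stores the module weights under the 'state_dict' key; the function name is ours.
import torch

def load_lightning_state_dict(module, ckpt_path):
    """Load only the weights from a PyTorch Lightning checkpoint into a module."""
    ckpt = torch.load(ckpt_path, map_location="cpu")
    module.load_state_dict(ckpt["state_dict"])
    return module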
def train(save_path, model, lr=0.1, batch_size=128, callbacks=[]):
    # Dynamically create dataset generators
    train, valid, test, meta_data = get_dataset(batch_size=batch_size)

    # Dynamically create the model
    model = models.__dict__[model]()
    summary(model)

    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # Dynamically create callbacks
    callbacks_constructed = []
    for name in callbacks:
        clbk = get_callback(name, verbose=0)
        if clbk is not None:
            callbacks_constructed.append(clbk)

    # Pass everything to the training loop
    steps_per_epoch = (len(meta_data['x_train']) - 1) // batch_size + 1
    training_loop(model=model,
                  optimizer=optimizer,
                  loss_function=loss_function,
                  metrics=[acc],
                  train=train,
                  valid=test,
                  meta_data=meta_data,
                  steps_per_epoch=steps_per_epoch,
                  save_path=save_path,
                  config=_CONFIG,
                  use_tb=True,
                  custom_callbacks=callbacks_constructed)
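# get_callback(name, verbose=0) is resolved by the project; a purely hypothetical
# sketch of such a name-to-constructor registry is shown below. PrintEpoch and the
# registry contents are placeholders, not real classes from this codebase.
class PrintEpoch:
    """Toy callback used only to illustrate the registry."""
    def __init__(self, verbose=0):
        self.verbose = verbose

_CALLBACK_REGISTRY = {"print_epoch": PrintEpoch}

def get_callback(name, verbose=0):
    # Return an instantiated callback for a known name, or None so that the
    # caller (as above) can silently skip it.
    builder = _CALLBACK_REGISTRY.get(name)
    return builder(verbose=verbose) if builder is not None else None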
def train(save_path, model, batch_size=128, seed=777, callbacks=[], resume=True, evaluate=True):
    # Dynamically create dataset generators
    train, valid, test, meta_data = get_dataset(batch_size=batch_size, seed=seed)

    # Dynamically create the model
    model = models.__dict__[model]()
    summary(model)

    # Dynamically create callbacks
    callbacks_constructed = []
    for name in callbacks:
        clbk = get_callback(name, verbose=0)
        if clbk is not None:
            callbacks_constructed.append(clbk)

    if not resume and os.path.exists(os.path.join(save_path, "last.ckpt")):
        raise IOError("Please clear the folder before running, or pass train.resume=True")

    # Create the module and pass it to the training loop
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(save_path, "weights"),
        verbose=True,
        save_last=True,  # For resumability
        monitor='valid_acc',
        mode='max')
    pl_module = supervised_training.SupervisedLearning(model, meta_data=meta_data)
    trainer = training_loop(train, valid,
                            pl_module=pl_module,
                            checkpoint_callback=checkpoint_callback,
                            callbacks=callbacks_constructed,
                            save_path=save_path)

    # Evaluate
    if evaluate:
        results, = trainer.test(test_dataloaders=test)
        logger.info(results)
        with open(os.path.join(save_path, "eval_results.json"), "w") as f:
            json.dump(results, f)
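# Standalone sketch of the resume guard used above: refuse to reuse a run directory
# that already contains a Lightning "last.ckpt" unless the caller explicitly resumes.
# The helper name is ours; the check mirrors the logic in train().
import os

def check_resumable(save_path, resume):
    last_ckpt = os.path.join(save_path, "last.ckpt")
    if not resume and os.path.exists(last_ckpt):
        raise IOError("Please clear the folder before running, or pass train.resume=True")
    return os.path.exists(last_ckpt)  # True if there is a checkpoint to resume from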
def train(save_path, model, lr_splitting_by=None, lrs=None, wd=0, lr=0.1,
          batch_size=128, n_epochs=100, weights=None, fb_method=False,
          callbacks=[], optimizer='sgd', scheduler=None,
          freeze_all_but_this_layer=None, mode='train'):
    # Dynamically create dataset generators
    train, valid, test, meta_data = get_chexnet_covid(batch_size=batch_size)

    # Dynamically create the model
    model = models.__dict__[model]()
    summary(model)

    loss_function = torch.nn.BCELoss()

    if freeze_all_but_this_layer is not None:
        # First freeze all layers
        logger.info("Freezing all layers")
        for i, parameter in enumerate(model.parameters()):
            parameter.requires_grad = False
        # Unfreeze layers whose names match
        for i, (name, parameter) in enumerate(model.named_parameters()):
            if name.startswith(freeze_all_but_this_layer):
                parameter.requires_grad = True
                logger.info("Unfreezing {}: {}".format(name, parameter.shape))

    if optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)
    elif optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    if scheduler == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

    if lr_splitting_by is not None:
        optimizer, _ = create_optimizer(optimizer, model, lr_splitting_by, lrs)

    # Dynamically create callbacks
    callbacks_constructed = []
    for name in callbacks:
        clbk = get_callback(name, verbose=0)
        if clbk is not None:
            print(name)
            callbacks_constructed.append(clbk)

    # Pass everything to the training loop
    if train is not None:
        steps_per_epoch = len(train)
    else:
        steps_per_epoch = None

    target_indice = None
    if fb_method:
        target_indice = weights.index(1) if 1 in weights else 0
    elif weights is not None:
        target_indice = 0

    if mode == 'train':
        assert train is not None, "please provide train data"
        assert valid is not None, "please provide validation data"
        training_loop(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            loss_function=loss_function,
            metrics=[acc_chexnet_covid],
            train=train,
            valid=valid,
            test=test,
            meta_data=meta_data,
            steps_per_epoch=steps_per_epoch,
            n_epochs=n_epochs,
            save_path=save_path,
            config=_CONFIG,
            use_tb=True,
            custom_callbacks=callbacks_constructed,
            fb_method=fb_method,
            target_indice=target_indice,
        )
    else:
        assert test is not None, "please provide test data for evaluation"
        evaluation_loop(
            model=model,
            optimizer=optimizer,
            loss_function=loss_function,
            metrics=[acc_chexnet_covid],
            test=test,
            meta_data=meta_data,
            save_path=save_path,
            config=_CONFIG,
            custom_callbacks=callbacks_constructed,
            target_indice=target_indice,
        )
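# Standalone sketch of the freeze-all-then-unfreeze-by-prefix pattern used above,
# demonstrated on a toy model; the "classifier" prefix is illustrative only.
import torch

toy_model = torch.nn.Sequential()
toy_model.add_module("backbone", torch.nn.Linear(16, 8))
toy_model.add_module("classifier", torch.nn.Linear(8, 2))

for parameter in toy_model.parameters():              # freeze everything
    parameter.requires_grad = False
for name, parameter in toy_model.named_parameters():  # unfreeze matching layers
    if name.startswith("classifier"):
        parameter.requires_grad = True

# Only the unfrozen parameters receive gradients; as in train() above, the optimizer
# can still be built from model.parameters(), since frozen tensors simply get no grad.
trainable = [n for n, p in toy_model.named_parameters() if p.requires_grad]
print(trainable)  # ['classifier.weight', 'classifier.bias']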
def train_megan(save_path: str,
                featurizer_key: str,
                learning_rate: float = 0.0001,
                train_samples_per_epoch: int = -1,
                valid_samples_per_epoch: int = -1,
                batch_size: int = 4,
                gen_lr_factor: float = 0.1,
                gen_lr_patience: int = 4,
                big_lr_epochs: int = -1,
                early_stopping: int = 16,
                start_epoch: int = 0,
                megan_warmup_epochs: int = 1,
                save_each_epoch: bool = False,
                max_n_epochs: int = 1000):
    """Train the MEGAN model."""
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    checkpoints_path = os.path.join(save_path, 'checkpoints')
    if save_each_epoch and not os.path.exists(checkpoints_path):
        os.makedirs(checkpoints_path)

    log_current_config()
    conf_path = os.path.join(save_path, 'config.gin')
    save_current_config(conf_path)

    model_path = os.path.join(save_path, 'model.pt')
    best_model_path = os.path.join(save_path, 'model_best.pt')

    summary_dir = os.path.join(save_path, 'summary')
    tf_callback = DumpTensorflowSummaries(save_path=summary_dir,
                                          step_multiplier=train_samples_per_epoch)

    dataset = get_dataset()
    featurizer = get_featurizer(featurizer_key)
    assert isinstance(featurizer, MeganTrainingSamplesFeaturizer)
    action_vocab = featurizer.get_actions_vocabulary(dataset.feat_dir)

    # Copy featurizer dictionary files needed for using the model
    feat_dir = featurizer.dir(dataset.feat_dir)
    model_feat_dir = featurizer.dir(save_path)
    if not os.path.exists(model_feat_dir):
        os.makedirs(model_feat_dir)
    copyfile(get_actions_vocab_path(feat_dir), get_actions_vocab_path(model_feat_dir))
    copyfile(get_prop2oh_vocab_path(feat_dir), get_prop2oh_vocab_path(model_feat_dir))

    logger.info("Creating model...")
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model = Megan(n_atom_actions=action_vocab['n_atom_actions'],
                  n_bond_actions=action_vocab['n_bond_actions'],
                  prop2oh=action_vocab['prop2oh']).to(device)
    summary(model)

    logger.info("Loading data...")
    data_dict = {}

    logger.info(f"Training for a maximum of {max_n_epochs} epochs...")

    start_learning_rate = learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    def set_lr(lr: float):
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    def run_batch(ind: np.ndarray, train: bool) -> dict:
        if train:
            optimizer.zero_grad()

        batch_ind = np.random.choice(ind, size=batch_size, replace=False)
        batch_metrics = {}
        batch = generate_batch(batch_ind, data_dict['metadata'], featurizer,
                               data_dict['data'], action_vocab=action_vocab)

        batch_result = model(batch)
        actions = batch_result['output']

        target, n_steps = batch['target'], batch['n_steps']
        n_total_steps = torch.sum(n_steps)

        y_max_pred_prob, y_pred = torch.max(actions, dim=-1)
        y_val, y_true = torch.max(target, dim=-1)
        y_val_one = y_val == 1

        is_hard = batch['is_hard']
        weight = torch.ones_like(is_hard)
        avg_weight = torch.mean(weight.float(), axis=-1)
        weight = weight * y_val_one
        weight = weight.unsqueeze(-1).expand(*actions.shape)

        target_one = target == 1
        eps = 1e-09
        loss = -torch.log2(actions + ~target_one + eps) * target_one * weight
        loss = torch.sum(loss, dim=-1)
        path_losses = torch.sum(loss, dim=-1) / (avg_weight * 16)

        # For each reaction, use the minimum loss over its possible paths as the loss to optimize
        min_losses = []
        path_i = 0
        for n_paths in batch['n_paths']:
            path_loss = torch.min(path_losses[path_i:path_i + n_paths])
            min_losses.append(path_loss.unsqueeze(-1))
            path_i += n_paths
        min_losses = torch.cat(min_losses)
        loss = torch.mean(min_losses)

        if torch.isinf(loss):
            raise ValueError('Infinite loss (correct action has predicted probability=0.0)')
        if loss != loss:  # this is only true for NaN in pytorch
            raise ValueError('NaN loss')

        # Skip accuracy metrics if there are no positive samples in the batch
        correct = ((y_pred == y_true) & y_val_one).float()
        step_correct = torch.sum(correct) / n_total_steps
        batch_metrics['step_acc'] = step_correct.cpu().detach().numpy()

        total_hard = torch.sum(is_hard)
        if total_hard > 0:
            hard_correct = torch.sum(correct * is_hard) / total_hard
            batch_metrics['step_acc_hard'] = hard_correct.cpu().detach().numpy()

        is_easy = (1.0 - is_hard) * y_val_one
        total_easy = torch.sum(is_easy)
        if total_easy > 0:
            easy_correct = torch.sum(correct * is_easy) / total_easy
            batch_metrics['step_acc_easy'] = easy_correct.cpu().detach().numpy()

        all_correct = torch.sum(correct, dim=-1)
        all_correct = all_correct == n_steps

        acc = []
        path_i = 0
        for n_paths in batch['n_paths']:
            corr = any(all_correct[i] == 1 for i in range(path_i, path_i + n_paths))
            acc.append(corr)
            path_i += n_paths
        if len(acc) > 0:
            batch_metrics['acc'] = np.mean(acc)

        if train:
            loss.backward()
            optimizer.step()

        batch_metrics['loss'] = loss.cpu().detach().numpy()
        return batch_metrics

    def get_lr():
        for param_group in optimizer.param_groups:
            return param_group['lr']

    def run_epoch(set_key: str, i_ep: int, all_ind: np.ndarray, train: bool,
                  batches_per_epoch: int, lr_step: float = 0.0):
        torch.cuda.empty_cache()
        if train:
            model.train()
        else:
            model.eval()

        metrics = {}
        counts = Counter()

        for batch_i in tqdm(range(batches_per_epoch),
                            desc=f'{save_path} {set_key} epoch {i_ep + 1}'):
            if lr_step > 0:
                set_lr(get_lr() + lr_step)
            try:
                batch_metrics = run_batch(all_ind, train)
                for k, v in batch_metrics.items():
                    if k not in metrics:
                        metrics[k] = 0
                    metrics[k] += v
                    counts[k] += 1
            except AssertionError as e:
                # Batch skipped because of zero loss
                logger.debug(f"Exception while running batch: {str(e)}")
            except Exception as e:
                logger.warning(f"Exception while running batch: {str(e)}")
                raise e

        metrics = dict((k, v / counts[k]) for k, v in metrics.items())

        str_metrics = ', '.join("{:s}={:.4f}".format(k, v) for k, v in metrics.items())
        logger.info(f'{set_key} epoch {i_ep + 1}: {str_metrics}')

        if train:
            save_weights(model_path, model, optimizer, epoch=i_ep, lr=get_lr(),
                         no_progress=no_progress)
            if save_each_epoch:
                model_epoch_path = os.path.join(
                    checkpoints_path, f'model_{(i_ep + 1) * train_samples_per_epoch}.pt')
                save_weights(model_epoch_path, model, optimizer, epoch=i_ep, lr=get_lr())

        return metrics

    best_acc = 0
    no_progress = 0

    if os.path.exists(model_path):
        checkpoint = load_state_dict(model_path)
        if 'epoch' in checkpoint:
            start_epoch = checkpoint['epoch'] + 1
            logger.info("Resuming training after {} epochs".format(start_epoch))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if 'lr' in checkpoint:
            learning_rate = checkpoint['lr']
            start_learning_rate = learning_rate
            logger.info("Resuming training with LR={:f}".format(learning_rate))
            set_lr(learning_rate)
        if 'valid_acc' in checkpoint:
            best_acc = checkpoint['valid_acc']
            logger.info(f"Best acc so far: {best_acc}")

    megan_warmup_epochs = max(megan_warmup_epochs - start_epoch, 0)
    if megan_warmup_epochs > 0:
        learning_rate = 0.0
        set_lr(learning_rate)

    no_progress = 0
    no_progress_lr = 0

    logger.info('Loading data')
    loaded_data = featurizer.load(dataset.feat_dir)
    chunk_metadata = loaded_data['reaction_metadata']
    data_dict['data'] = loaded_data
    data_dict['metadata'] = chunk_metadata
    data_dict['mean_n_steps'] = np.mean(data_dict['metadata']['n_samples'])

    metadata = data_dict['metadata']
    if 'remapped' in metadata:
        train_ind = (metadata['is_train'] == 1) & (metadata['remapped'])
        valid_ind = (metadata['is_train'] == 0) & (metadata['remapped'])
    else:
        train_ind = metadata['is_train'] == 1
        valid_ind = metadata['is_train'] == 0
    if 'path_i' in metadata:
        train_ind = train_ind & (metadata['path_i'] == 0)
        valid_ind = valid_ind & (metadata['path_i'] == 0)

    train_ind = np.argwhere(train_ind).flatten()
    valid_ind = np.argwhere(valid_ind).flatten()
    logger.info(
        f"Training on chunk of {len(train_ind)} training samples and {len(valid_ind)} valid samples")

    if train_samples_per_epoch == -1:
        train_samples_per_epoch = len(train_ind)
    if valid_samples_per_epoch == -1:
        valid_samples_per_epoch = len(valid_ind)
    train_batches_per_epoch = int(np.ceil(train_samples_per_epoch / batch_size))
    valid_batches_per_epoch = int(np.ceil(valid_samples_per_epoch / batch_size))

    logger.info(
        f'Starting training on epoch {start_epoch + 1} with Learning Rate={learning_rate} '
        f'({megan_warmup_epochs} warmup epochs)')

    for epoch_i in range(start_epoch, max_n_epochs):
        if epoch_i == megan_warmup_epochs:
            set_lr(start_learning_rate)
            logger.info(
                f'Learning rate set to {start_learning_rate} after {megan_warmup_epochs} warmup epochs')

        if big_lr_epochs != -1 and epoch_i == big_lr_epochs:
            learning_rate *= gen_lr_factor
            no_progress = 0
            no_progress_lr = 0
            set_lr(learning_rate)
            logger.info(f'Changing Learning Rate to {learning_rate}')

        if megan_warmup_epochs > 0:
            warmup_lr_step = start_learning_rate / (train_batches_per_epoch * megan_warmup_epochs)
        else:
            warmup_lr_step = 0

        learning_rate = get_lr()
        train_metrics = run_epoch(
            'train', epoch_i, train_ind, True, train_batches_per_epoch,
            lr_step=warmup_lr_step if epoch_i < megan_warmup_epochs else 0.0)
        with torch.no_grad():
            valid_metrics = run_epoch('valid', epoch_i, valid_ind, False,
                                      valid_batches_per_epoch)

        all_metrics = {}
        for key, val in train_metrics.items():
            all_metrics[f'train_{key}'] = val
        for key, val in valid_metrics.items():
            all_metrics[f'valid_{key}'] = val
        all_metrics['lr'] = learning_rate
        tf_callback.on_epoch_end(epoch_i + 1, all_metrics)

        valid_acc = valid_metrics['acc']
        if valid_acc > best_acc:
            logger.info(
                f'Saving best model from epoch {epoch_i + 1} to {best_model_path} (acc={valid_acc})')
            save_weights(best_model_path, model, optimizer, epoch=epoch_i,
                         lr=learning_rate, valid_acc=valid_acc)
            best_acc = valid_acc
            no_progress = 0
            no_progress_lr = 0
        else:
            no_progress += 1
            no_progress_lr += 1

        if big_lr_epochs == -1 or epoch_i >= big_lr_epochs:
            if no_progress_lr > gen_lr_patience:
                learning_rate *= gen_lr_factor
                logger.info(f'Changing Learning Rate to {learning_rate}')
                set_lr(learning_rate)
                no_progress_lr = 0
            if no_progress > early_stopping:
                logger.info(f'Early stopping after {epoch_i + 1} epochs')
                break

    logger.info("Experiment finished!")
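# Self-contained sketch of the min-over-paths reduction used in run_batch() above:
# each reaction may have several valid edit paths, and only the cheapest one is
# optimized. The tensors below are illustrative toy values.
import torch

path_losses = torch.tensor([0.9, 0.4, 1.2, 0.7])  # losses for 4 paths in the batch
n_paths_per_reaction = [3, 1]                      # reaction 1 has 3 paths, reaction 2 has 1

min_losses, path_i = [], 0
for n_paths in n_paths_per_reaction:
    min_losses.append(torch.min(path_losses[path_i:path_i + n_paths]).unsqueeze(-1))
    path_i += n_paths
loss = torch.mean(torch.cat(min_losses))
print(loss)  # mean of min(0.9, 0.4, 1.2) and 0.7 -> tensor(0.5500)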
def _plot_predictions_logistic(data, predictors, properties=dict(), obs_site_name='obs'):
    """Plots predictions of given pyro predictors."""
    fig, axs = plt.subplots(
        nrows=len(predictors),
        ncols=2,
        figsize=(15, 6 * len(predictors)),
        sharey=True,
        sharex=True,
        squeeze=False,
    )

    x_col = properties.get('x', None)
    x_label = properties.get('x_label', 'x')
    y_label = properties.get('y_label', 'y')
    y_labels = properties.get('y_labels', dict())
    positive_y = y_labels.get(1, 'Positive class')
    negative_y = y_labels.get(0, 'Negative class')
    cat_col = properties.get('category', None)
    cat_name = properties.get('category_labels', dict())
    positive_cat = cat_name.get(1, 'Positive')
    negative_cat = cat_name.get(0, 'Negative')

    x = data['x']
    y = data['y']

    for ax, (predictor_name, predictor) in zip(axs, predictors.items()):
        for category in (0, 1):
            category_idx = x[:, cat_col] == category
            x_data = x[category_idx, x_col]
            y_data = y[category_idx]

            samples = predictor(x)
            pred_summary = summary(samples)
            y_pred = pred_summary[obs_site_name]

            xplot, ym, y_true = list(
                zip(*sorted(zip(x_data, y_pred["mean"], y_data), key=lambda r: r[0])))
            y_positive_idx = (torch.Tensor(y_true) == 1.).numpy().astype('bool')

            ax[category].plot(np.array(xplot)[y_positive_idx],
                              np.array(ym)[y_positive_idx],
                              marker='o', ms=10, linestyle='', alpha=1,
                              color='green', label=positive_y)
            ax[category].plot(np.array(xplot)[~y_positive_idx],
                              np.array(ym)[~y_positive_idx],
                              marker='o', ms=10, linestyle='', alpha=1,
                              color='red', label=negative_y)
            ax[category].set(
                xlabel=x_label,
                ylabel=y_label,
                title=positive_cat if category == 1 else negative_cat,
            )
        ax[0].set_ylabel(f"{predictor_name}\n{y_label}")
        handles, labels = ax[0].get_legend_handles_labels()

    fig.legend(handles, labels, loc='upper right')
    plt.show()
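# Example of the 'properties' dict read by _plot_predictions_logistic() above; the
# keys are the ones the function looks up, while the column indices and label
# strings are illustrative placeholders.
example_properties = {
    'x': 0,                                            # column of data['x'] on the x-axis
    'x_label': 'age',
    'y_label': 'probability',
    'y_labels': {1: 'Positive class', 0: 'Negative class'},
    'category': 1,                                     # column holding the binary category
    'category_labels': {1: 'Treatment', 0: 'Control'},
}
# _plot_predictions_logistic(data, predictors, properties=example_properties)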
def _plot_predictions_linear(data, predictors, properties=dict(), obs_site_name='obs'):
    """Plots predictions of given pyro predictors."""
    fig, axs = plt.subplots(
        nrows=len(predictors),
        ncols=2,
        figsize=(15, 6 * len(predictors)),
        sharey=True,
        sharex=True,
        squeeze=False,
    )

    x_col = properties.get('x', None)
    x_label = properties.get('x_label', 'x')
    y_label = properties.get('y_label', 'y')
    cat_col = properties.get('category', None)
    cat_name = properties.get('category_labels', dict())
    positive_cat = cat_name.get(1, 'Positive')
    negative_cat = cat_name.get(0, 'Negative')

    x = data['x']
    y = data['y']

    for ax, (predictor_name, predictor) in zip(axs, predictors.items()):
        for category in (0, 1):
            category_idx = x[:, cat_col] == category
            x_data = x[category_idx, x_col]
            y_data = y[category_idx]

            samples = predictor(x)
            pred_summary = summary(samples)
            y_pred = pred_summary[obs_site_name]
            mu = pred_summary["_RETURN"]
            y_pred = {
                k: v if len(v.shape) == 1 else v.squeeze(0)
                for k, v in y_pred.items()
            }
            mu = {
                k: v if len(v.shape) == 1 else v.squeeze(0)
                for k, v in mu.items()
            }

            xplot, mum, mu5, mu95, ym, y5, y95, y_true = list(
                zip(*sorted(
                    zip(x_data, mu["mean"], mu["5%"], mu["95%"],
                        y_pred["mean"], y_pred["5%"], y_pred["95%"], y_data),
                    key=lambda r: r[0])))

            ax[category].fill_between(
                xplot, y5, y95, color='orange', alpha=0.5,
                label="Posterior predictive distribution with 90% CI")
            ax[category].fill_between(xplot, mu5, mu95,
                                      color='cornflowerblue', alpha=0.8,
                                      label="Regression line 90% CI")
            ax[category].plot(xplot, mum, color="red", label="Mean output")
            ax[category].plot(xplot, y_true, marker='o', ms=4, linestyle='',
                              alpha=1, color='green', label="True values")
            ax[category].set(
                xlabel=x_label,
                ylabel=y_label,
                title=positive_cat if category == 1 else negative_cat,
            )
        ax[0].set_ylabel(f"{predictor_name}\n{y_label}")
        handles, labels = ax[0].get_legend_handles_labels()

    fig.legend(handles, labels, loc='upper right')
    plt.show()
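# The summary() helper used by both plotting functions is assumed to reduce posterior
# predictive samples (a dict of site name -> tensor with a leading sample dimension)
# to per-site statistics with the keys consumed above ("mean", "5%", "95%").
# A minimal sketch under that assumption; this is not the project's implementation.
import torch

def summary(samples):
    stats = {}
    for site_name, values in samples.items():
        stats[site_name] = {
            "mean": torch.mean(values, dim=0),
            "5%": torch.quantile(values, 0.05, dim=0),
            "95%": torch.quantile(values, 0.95, dim=0),
        }
    return stats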