def run_comparison(experiment_args: Namespace, logger: logging.Logger, features_dir: Optional[str] = None):
    for dataset_name in experiment_args.datasets:
        dataset_type, dataset_path, num_folds, metric = DATASETS[dataset_name]
        logger.info(dataset_name)

        # Set up args
        args = deepcopy(experiment_args)
        args.data_path = dataset_path
        args.dataset_type = dataset_type
        args.save_dir = os.path.join(args.save_dir, dataset_name)
        args.num_folds = num_folds
        args.metric = metric
        if features_dir is not None:
            args.features_path = [os.path.join(features_dir, dataset_name + '.pckl')]
        modify_train_args(args)

        # Set up logging for training
        os.makedirs(args.save_dir, exist_ok=True)
        fh = logging.FileHandler(os.path.join(args.save_dir, args.log_name))
        fh.setLevel(logging.DEBUG)

        # Cross validate
        TRAIN_LOGGER.addHandler(fh)
        mean_score, std_score = cross_validate(args, TRAIN_LOGGER)
        TRAIN_LOGGER.removeHandler(fh)

        # Record results
        logger.info('{} +/- {} {}'.format(mean_score, std_score, metric))
        temp_model = build_model(args)
        logger.info('num params: {:,}'.format(param_count(temp_model)))

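# run_comparison assumes a module-level DATASETS registry mapping each dataset
# name to a (dataset_type, dataset_path, num_folds, metric) tuple. A minimal
# sketch of that assumed layout follows; the names, paths, fold counts, and
# metrics below are illustrative, not the real entries.
DATASETS = {
    'freesolv': ('regression', 'data/freesolv.csv', 10, 'rmse'),
    'bace': ('classification', 'data/bace.csv', 3, 'auc'),
}
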
def objective(hyperparams: Dict[str, Union[int, float]], seed: int) -> Dict:
    # Convert hyperparams from float to int when necessary
    for key in INT_KEYS:
        hyperparams[key] = int(hyperparams[key])

    # Copy args
    hyper_args = deepcopy(args)

    # Update args with hyperparams
    if args.save_dir is not None:
        folder_name = '_'.join(f'{key}_{value}' for key, value in hyperparams.items())
        hyper_args.save_dir = os.path.join(hyper_args.save_dir, folder_name)
    for key, value in hyperparams.items():
        setattr(hyper_args, key, value)
    hyper_args.ffn_hidden_size = hyper_args.hidden_size

    # Cross validate
    mean_score, std_score = cross_validate(args=hyper_args, train_func=run_training)

    # Record results
    temp_model = MoleculeModel(hyper_args)
    num_params = param_count(temp_model)
    logger.info(f'Trial results with seed {seed}')
    logger.info(hyperparams)
    logger.info(f'num params: {num_params:,}')
    logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')

    # Deal with nan
    if np.isnan(mean_score):
        if hyper_args.dataset_type == 'classification':
            mean_score = 0
        else:
            raise ValueError('Can\'t handle nan score for non-classification dataset.')

    loss = (1 if hyper_args.minimize_score else -1) * mean_score

    return {
        'loss': loss,
        'status': 'ok',
        'mean_score': mean_score,
        'std_score': std_score,
        'hyperparams': hyperparams,
        'num_params': num_params,
        'seed': seed,
    }

def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
    # Convert hyperparams from float to int when necessary
    for key in INT_KEYS:
        hyperparams[key] = int(hyperparams[key])

    # Copy args
    hyper_args = deepcopy(args)

    # Update args with hyperparams
    if args.save_dir is not None:
        folder_name = "_".join(f"{key}_{value}" for key, value in hyperparams.items())
        hyper_args.save_dir = os.path.join(hyper_args.save_dir, folder_name)
    for key, value in hyperparams.items():
        setattr(hyper_args, key, value)
    hyper_args.ffn_hidden_size = hyper_args.hidden_size

    # Record hyperparameters
    logger.info(hyperparams)

    # Cross validate
    mean_score, std_score = cross_validate(args=hyper_args, train_func=run_training)

    # Record results
    temp_model = MoleculeModel(hyper_args)
    num_params = param_count(temp_model)
    logger.info(f"num params: {num_params:,}")
    logger.info(f"{mean_score} +/- {std_score} {hyper_args.metric}")
    results.append({
        "mean_score": mean_score,
        "std_score": std_score,
        "hyperparams": hyperparams,
        "num_params": num_params,
    })

    # Deal with nan
    if np.isnan(mean_score):
        if hyper_args.dataset_type == "classification":
            mean_score = 0
        else:
            raise ValueError("Can't handle nan score for non-classification dataset.")

    return (1 if hyper_args.minimize_score else -1) * mean_score

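# Both objective variants above follow Hyperopt's conventions: the first
# returns a result dict with 'loss' and 'status' keys, the second returns the
# sign-adjusted loss directly (negated when the metric should be maximized).
# A minimal sketch of driving such an objective with TPE; the search space
# bounds are illustrative, and quniform's float samples are why the objective
# casts the INT_KEYS entries back to int.
from hyperopt import Trials, fmin, hp, tpe

SPACE = {
    'hidden_size': hp.quniform('hidden_size', low=300, high=2400, q=100),
    'depth': hp.quniform('depth', low=2, high=6, q=1),
    'dropout': hp.quniform('dropout', low=0.0, high=0.4, q=0.05),
}

trials = Trials()
best = fmin(fn=objective, space=SPACE, algo=tpe.suggest, max_evals=20, trials=trials)
# When the objective returns a dict, trials.results keeps the full records.
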
def evaluate_glasses_dataset(args: Namespace, logger: Logger = None):
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    # Load data
    debug('Loading data')
    data = GlassDataset(args.test_data_path,
                        transform=Compose([NNGraph(args.num_neighbors), Distance(False)]))

    # Dataset length
    data_length = len(data)
    debug('data size = {:,}'.format(data_length))

    data = DataLoader(data, args.batch_size)

    metric_func = get_metric_func(args.metric)

    for model_idx in range(args.ensemble_size):
        # Load model (a checkpoint is required for evaluation)
        if args.checkpoint_paths is not None:
            debug('Loading model {} from {}'.format(model_idx, args.checkpoint_paths[model_idx]))
            model = load_checkpoint(args.checkpoint_paths[model_idx])
        else:
            debug('Must provide a checkpoint path!')
            exit(1)

        debug(model)
        debug('Number of parameters = {:,}'.format(param_count(model)))
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        test_scores = evaluate(
            model=model,
            data=data,
            metric_func=metric_func,
            args=args
        )

        # Average test score
        avg_test_score = np.mean(test_scores)
        info('Model {} test {} = {:.3f}'.format(model_idx, args.metric, avg_test_score))

def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
    # Convert hyperparams from float to int when necessary
    for key in INT_KEYS:
        hyperparams[key] = int(hyperparams[key])

    # Update args with hyperparams
    hyper_args = deepcopy(args)
    if args.save_dir is not None:
        folder_name = '_'.join(f'{key}_{value}' for key, value in hyperparams.items())
        hyper_args.save_dir = os.path.join(hyper_args.save_dir, folder_name)
    for key, value in hyperparams.items():
        setattr(hyper_args, key, value)

    logger.info(hyperparams)

    # Train
    avg_test_score, avg_test_accuracy = run_training(hyper_args, train_logger)

    # Record results
    temp_model = build_model(hyper_args)
    num_params = param_count(temp_model)
    logger.info(f'num params: {num_params:,}')
    logger.info(f'{avg_test_score} {hyper_args.metric}')
    logger.info(f'{avg_test_accuracy} accuracy')
    results.append({
        'avg_test_score': avg_test_score,
        'avg_test_accuracy': avg_test_accuracy,
        'hyperparams': hyperparams,
        'num_params': num_params
    })

    # Deal with nan
    if np.isnan(avg_test_score):
        if hyper_args.dataset_type == 'classification':
            avg_test_score = 0
        else:
            raise ValueError('Can\'t handle nan score for non-classification dataset.')

    return (1 if hyper_args.minimize_score else -1) * avg_test_score

def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
    # Convert hyperparams from float to int when necessary
    for key in INT_KEYS:
        hyperparams[key] = int(hyperparams[key])

    # Update args with hyperparams
    hyper_args = deepcopy(args)
    if args.save_dir is not None:
        folder_name = '_'.join(f'{key}_{value}' for key, value in hyperparams.items())
        hyper_args.save_dir = os.path.join(hyper_args.save_dir, folder_name)
    for key, value in hyperparams.items():
        setattr(hyper_args, key, value)

    # Record hyperparameters
    logger.info(hyperparams)

    # Cross validate
    mean_score, std_score = cross_validate(hyper_args, TRAIN_LOGGER)

    # Record results
    temp_model = build_model(hyper_args)
    num_params = param_count(temp_model)
    logger.info(f'num params: {num_params:,}')
    logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')
    results.append({
        'mean_score': mean_score,
        'std_score': std_score,
        'hyperparams': hyperparams,
        'num_params': num_params
    })

    # Deal with nan
    if np.isnan(mean_score):
        if hyper_args.dataset_type == 'classification':
            mean_score = 0
        else:
            raise ValueError('Can\'t handle nan score for non-classification dataset.')

    return (1 if hyper_args.minimize_score else -1) * mean_score

def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
    # Convert hyperparams from float to int when necessary
    for key in INT_KEYS:
        hyperparams[key] = int(hyperparams[key])

    # Copy args
    gs_args = deepcopy(dataset_args)
    for key, value in hyperparams.items():
        setattr(gs_args, key, value)

    # Record hyperparameters
    logger.info(hyperparams)

    # Cross validate
    mean_score, std_score = cross_validate(gs_args, TRAIN_LOGGER)

    # Record results
    temp_model = build_model(gs_args)
    num_params = param_count(temp_model)
    logger.info('num params: {:,}'.format(num_params))
    logger.info('{} +/- {} {}'.format(mean_score, std_score, metric))
    results.append({
        'mean_score': mean_score,
        'std_score': std_score,
        'hyperparams': hyperparams,
        'num_params': num_params
    })

    # Deal with nan
    if np.isnan(mean_score):
        if gs_args.dataset_type == 'classification':
            mean_score = 0
        else:
            raise ValueError('Can\'t handle nan score for non-classification dataset.')

    return (1 if gs_args.minimize_score else -1) * mean_score

def run_training(args: Namespace, logger: Logger = None):
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score.

    :param args: Namespace of training arguments.
    :param logger: Logger for recording output.
    :return: Average test score and test accuracy of the model with the best validation score
             (for use in hyperparameter optimization via Hyperopt).
    """
    # Set up logger
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    # Load metadata
    with open(args.data_path, 'r') as f:
        metadata = json.load(f)

    # Train/val/test split
    if args.k_fold_split:
        data_splits = []
        kf = KFold(n_splits=args.num_folds, shuffle=True, random_state=args.seed)
        for train_index, test_index in kf.split(metadata):
            data_splits.append([train_index, test_index])
        data_splits = data_splits[args.fold_index]

        if args.use_inner_test:
            train_indices, remaining_indices = train_test_split(data_splits[0],
                                                                test_size=args.val_test_size,
                                                                random_state=args.seed)
            validation_indices, test_indices = train_test_split(remaining_indices,
                                                                test_size=0.5,
                                                                random_state=args.seed)
        else:
            train_indices = data_splits[0]
            validation_indices, test_indices = train_test_split(data_splits[1],
                                                                test_size=0.5,
                                                                random_state=args.seed)

        train_metadata = list(np.asarray(metadata)[list(train_indices)])
        validation_metadata = list(np.asarray(metadata)[list(validation_indices)])
        test_metadata = list(np.asarray(metadata)[list(test_indices)])
    else:
        train_metadata, remaining_metadata = train_test_split(metadata,
                                                              test_size=args.val_test_size,
                                                              random_state=args.seed)
        validation_metadata, test_metadata = train_test_split(remaining_metadata,
                                                              test_size=0.5,
                                                              random_state=args.seed)

    # Load datasets
    debug('Loading data')
    transform = Compose([Augmentation(args.augmentation_length), NNGraph(args.num_neighbors), Distance(False)])
    train_data = GlassDataset(train_metadata, transform=transform)
    val_data = GlassDataset(validation_metadata, transform=transform)
    test_data = GlassDataset(test_metadata, transform=transform)
    args.atom_fdim = 3
    args.bond_fdim = args.atom_fdim + 1

    # Dataset lengths
    train_data_length, val_data_length, test_data_length = len(train_data), len(val_data), len(test_data)
    debug('train size = {:,} | val size = {:,} | test size = {:,}'.format(
        train_data_length, val_data_length, test_data_length))

    # Convert to iterators
    train_data = DataLoader(train_data, args.batch_size)
    val_data = DataLoader(val_data, args.batch_size)
    test_data = DataLoader(test_data, args.batch_size)

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(args.metric)

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, 'model_{}'.format(model_idx))
        os.makedirs(save_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug('Loading model {} from {}'.format(model_idx, args.checkpoint_paths[model_idx]))
            model = load_checkpoint(args.checkpoint_paths[model_idx], args.save_dir,
                                    attention_viz=args.attention_viz)
        else:
            debug('Building model {}'.format(model_idx))
            model = build_model(args)

        debug(model)
        debug('Number of parameters = {:,}'.format(param_count(model)))
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(model, args, os.path.join(save_dir, 'model.pt'))

        # Optimizer and learning rate scheduler
        optimizer = Adam(model.parameters(), lr=args.init_lr[model_idx],
                         weight_decay=args.weight_decay[model_idx])
        scheduler = NoamLR(
            optimizer,
            warmup_epochs=args.warmup_epochs,
            total_epochs=[args.epochs],
            steps_per_epoch=train_data_length // args.batch_size,
            init_lr=args.init_lr,
            max_lr=args.max_lr,
            final_lr=args.final_lr
        )

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug('Epoch {}'.format(epoch))

            n_iter = train(
                model=model,
                data=train_data,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer
            )

            val_scores = []
            for val_run in range(args.num_val_runs):
                val_batch_scores = evaluate(
                    model=model,
                    data=val_data,
                    metric_func=metric_func,
                    args=args,
                )
                val_scores.append(np.mean(val_batch_scores))

            # Average validation score
            avg_val_score = np.mean(val_scores)
            debug('Validation {} = {:.3f}'.format(args.metric, avg_val_score))
            writer.add_scalar('validation_{}'.format(args.metric), avg_val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(model, args, os.path.join(save_dir, 'model.pt'))

        # Evaluate on test set using model with best validation score
        info('Model {} best validation {} = {:.3f} on epoch {}'.format(model_idx, args.metric,
                                                                       best_score, best_epoch))
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), args.save_dir,
                                cuda=args.cuda, attention_viz=args.attention_viz)

        test_scores = []
        for test_run in range(args.num_test_runs):
            test_batch_scores = evaluate(
                model=model,
                data=test_data,
                metric_func=metric_func,
                args=args
            )
            test_scores.append(np.mean(test_batch_scores))

        # Get accuracy (assuming args.metric is set to AUC)
        metric_func_accuracy = get_metric_func('accuracy')
        test_scores_accuracy = []
        for test_run in range(args.num_test_runs):
            test_batch_scores = evaluate(
                model=model,
                data=test_data,
                metric_func=metric_func_accuracy,
                args=args
            )
            test_scores_accuracy.append(np.mean(test_batch_scores))

        # Average test score
        avg_test_score = np.mean(test_scores)
        avg_test_accuracy = np.mean(test_scores_accuracy)
        info('Model {} test {} = {:.3f}, test accuracy = {:.3f}'.format(model_idx, args.metric,
                                                                        avg_test_score, avg_test_accuracy))
        writer.add_scalar('test_{}'.format(args.metric), avg_test_score, n_iter)

    return avg_test_score, avg_test_accuracy  # For hyperparameter optimization or cross validation use

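# NoamLR above is the warmup-then-decay schedule used throughout these training
# loops: the learning rate climbs linearly from init_lr to max_lr over the
# warmup steps, then decays exponentially toward final_lr for the rest of
# training. A minimal single-group sketch of that schedule (step counts are
# optimizer steps, i.e. epochs times steps_per_epoch), not the class itself:
def noam_lr(step: int, warmup_steps: int, total_steps: int,
            init_lr: float, max_lr: float, final_lr: float) -> float:
    """Sketch of the NoamLR schedule: linear warmup, then exponential decay."""
    if step < warmup_steps:
        # Linear ramp from init_lr up to max_lr.
        return init_lr + step * (max_lr - init_lr) / warmup_steps
    # Per-step decay factor chosen so the rate reaches final_lr at total_steps.
    gamma = (final_lr / max_lr) ** (1 / (total_steps - warmup_steps))
    return max_lr * gamma ** (step - warmup_steps)
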
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """Trains a model and returns test scores on the model checkpoint with the highest validation score."""
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    desired_labels = get_desired_labels(args, args.task_names)
    data = get_data(args.data_path, args)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    args.real_num_tasks = args.num_tasks - args.features_size if args.predict_features else args.num_tasks
    debug('Number of tasks = {}'.format(args.num_tasks))

    if args.dataset_type == 'bert_pretraining':
        data.bert_init(args, logger)

    # Split data
    if args.dataset_type == 'regression_with_binning':
        # Note: for now, binning based on whole dataset, not just training set
        data, bin_predictions, regression_data = data
        args.bin_predictions = bin_predictions
        debug('Splitting data with seed {}'.format(args.seed))
        train_data, _, _ = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes,
                                      seed=args.seed, args=args, logger=logger)
        _, val_data, test_data = split_data(regression_data, split_type=args.split_type,
                                            sizes=args.split_sizes, seed=args.seed,
                                            args=args, logger=logger)
    else:
        debug('Splitting data with seed {}'.format(args.seed))
        if args.separate_test_set:
            test_data = get_data(args.separate_test_set, args)
            if args.separate_val_set:
                val_data = get_data(args.separate_val_set, args)
                train_data = data  # nothing to split; we already got our test and val sets
            else:
                train_data, val_data, _ = split_data(data=data, split_type=args.split_type,
                                                     sizes=(0.8, 0.2, 0.0), seed=args.seed,
                                                     args=args, logger=logger)
        else:
            train_data, val_data, test_data = split_data(data=data, split_type=args.split_type,
                                                         sizes=args.split_sizes, seed=args.seed,
                                                         args=args, logger=logger)

    # Optionally replace test data with train or val data
    if args.test_split == 'train':
        test_data = train_data
    elif args.test_split == 'val':
        test_data = val_data

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug('{} '.format(args.task_names[i]) +
                  ', '.join('{}: {:.2f}%'.format(cls, size * 100)
                            for cls, size in enumerate(task_class_sizes)))

        if args.class_balance:
            train_class_sizes = get_class_sizes(train_data)
            class_batch_counts = torch.Tensor(train_class_sizes) * args.batch_size
            args.class_weights = 1 / torch.Tensor(class_batch_counts)

    if args.save_smiles_splits:
        with open(args.data_path, 'r') as f:
            header = f.readline().strip()
            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(f):
                line = line.strip()
                smiles = line.split(',')[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'), (test_data, 'test')]:
            with open(os.path.join(args.save_dir, name + '_smiles.csv'), 'w') as f:
                f.write('smiles\n')
                for smiles in dataset.smiles():
                    f.write(smiles.strip() + '\n')
            with open(os.path.join(args.save_dir, name + '_full.csv'), 'w') as f:
                f.write(header + '\n')
                for smiles in dataset.smiles():
                    f.write(lines_by_smiles[smiles] + '\n')
            split_indices = []
            for smiles in dataset.smiles():
                split_indices.append(indices_by_smiles[smiles])
            split_indices = sorted(split_indices)
            all_split_indices.append(split_indices)
        with open(os.path.join(args.save_dir, 'split_indices.pckl'), 'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=None if args.predict_features else 0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data) if args.prespecified_chunk_dir is None \
        else args.prespecified_chunks_max_examples_per_epoch

    if args.adversarial or args.moe:
        val_smiles, test_smiles = val_data.smiles(), test_data.smiles()

    debug('Total size = {:,} | train size = {:,} | val size = {:,} | test size = {:,}'.format(
        len(data), len(train_data), len(val_data), len(test_data)))

    # Optionally truncate outlier values
    if args.truncate_outliers:
        print('Truncating outliers in train set')
        train_data = truncate_outliers(train_data)

    # Initialize scaler and scale training targets by subtracting mean and dividing by standard deviation (regression only)
    if args.dataset_type == 'regression' and args.target_scaling:
        debug('Fitting scaler')
        train_smiles, train_targets = train_data.smiles(), train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    if args.moe:
        train_data = cluster_split(train_data, args.num_sources, args.cluster_max_ratio,
                                   seed=args.cluster_split_seed, logger=logger)

    # Chunk training data if too large to load in memory all at once
    if args.num_chunks > 1:
        os.makedirs(args.chunk_temp_dir, exist_ok=True)
        train_paths = []
        if args.moe:
            chunked_sources = [td.chunk(args.num_chunks) for td in train_data]
            chunks = []
            for i in range(args.num_chunks):
                chunks.append([source[i] for source in chunked_sources])
        else:
            chunks = train_data.chunk(args.num_chunks)
        for i in range(args.num_chunks):
            chunk_path = os.path.join(args.chunk_temp_dir, str(i) + '.txt')
            memo_path = os.path.join(args.chunk_temp_dir, 'memo' + str(i) + '.txt')
            with open(chunk_path, 'wb') as f:
                pickle.dump(chunks[i], f)
            train_paths.append((chunk_path, memo_path))
        train_data = train_paths

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric, args=args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.maml:  # TODO refactor
        test_targets = []
        for task_idx in range(len(data.data[0].targets)):
            _, task_test_data, _ = test_data.sample_maml_task(args, seed=0)
            test_targets += task_test_data.targets()

    if args.dataset_type == 'bert_pretraining':
        sum_test_preds = {
            'features': np.zeros((len(test_smiles), args.features_size))
                        if args.features_size is not None else None,
            'vocab': np.zeros((len(test_targets['vocab']), args.vocab.output_size))
        }
    elif args.dataset_type == 'kernel':
        sum_test_preds = np.zeros((len(test_targets), args.num_tasks))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    if args.maml:
        sum_test_preds = None  # annoying to determine exact size; will initialize later

    if args.dataset_type == 'bert_pretraining':
        # Only predict targets that are masked out
        test_targets['vocab'] = [target if mask == 0 else None
                                 for target, mask in zip(test_targets['vocab'], test_data.mask())]

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, 'model_{}'.format(model_idx))
        os.makedirs(save_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=save_dir)
        # Load/build model
        if args.checkpoint_paths is not None:
            debug('Loading model {} from {}'.format(model_idx, args.checkpoint_paths[model_idx]))
            model = load_checkpoint(args.checkpoint_paths[model_idx], current_args=args, logger=logger)
        else:
            debug('Building model {}'.format(model_idx))
            model = build_model(args)

        debug(model)
        debug('Number of parameters = {:,}'.format(param_count(model)))
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        if args.adjust_weight_decay:
            args.pnorm_target = compute_pnorm(model)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug('Epoch {}'.format(epoch))

            if args.prespecified_chunk_dir is not None:
                # Load some different random chunks each epoch
                train_data, val_data = load_prespecified_chunks(args, logger)
                debug('Loaded prespecified chunks for epoch')

            if args.dataset_type == 'unsupervised':  # won't work with moe
                full_data = MoleculeDataset(train_data.data + val_data.data)
                generate_unsupervised_cluster_labels(build_model(args), full_data, args)  # cluster with a new random init
                model.create_ffn(args)  # reset the ffn since we're changing targets -- we're just pretraining the encoder
                optimizer.param_groups.pop()  # remove ffn parameters
                optimizer.add_param_group({'params': model.ffn.parameters(), 'lr': args.init_lr[1],
                                           'weight_decay': args.weight_decay[1]})
                if args.cuda:
                    model.ffn.cuda()

            if args.gradual_unfreezing:
                if epoch % args.epochs_per_unfreeze == 0:
                    unfroze_layer = model.unfreeze_next()  # consider just stopping early after we have nothing left to unfreeze?
                    if unfroze_layer:
                        debug('Unfroze last frozen layer')
            n_iter = train(
                model=model,
                data=train_data,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer,
                chunk_names=(args.num_chunks > 1),
                val_smiles=val_smiles if args.adversarial else None,
                test_smiles=test_smiles if args.adversarial or args.moe else None
            )
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(
                model=model,
                data=val_data,
                metric_func=metric_func,
                args=args,
                scaler=scaler,
                logger=logger
            )

            if args.dataset_type == 'bert_pretraining':
                if val_scores['features'] is not None:
                    debug('Validation features rmse = {:.6f}'.format(val_scores['features']))
                    writer.add_scalar('validation_features_rmse', val_scores['features'], n_iter)
                val_scores = [val_scores['vocab']]

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug('Validation {} = {:.6f}'.format(args.metric, avg_val_score))
            writer.add_scalar('validation_{}'.format(args.metric), avg_val_score, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    if task_name in desired_labels:
                        debug('Validation {} {} = {:.6f}'.format(task_name, args.metric, val_score))
                        writer.add_scalar('validation_{}_{}'.format(task_name, args.metric), val_score, n_iter)

            # Save model checkpoint if improved validation score, or always save it if unsupervised
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score or \
                    args.dataset_type == 'unsupervised':
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        if args.dataset_type == 'unsupervised':
            return [0]  # rest of this is meaningless when unsupervised

        # Evaluate on test set using model with best validation score
        info('Model {} best validation {} = {:.6f} on epoch {}'.format(model_idx, args.metric,
                                                                       best_score, best_epoch))
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda, logger=logger)

        if args.split_test_by_overlap_dataset is not None:
            overlap_data = get_data(args.split_test_by_overlap_dataset)
            overlap_smiles = set(overlap_data.smiles())
            test_data_intersect, test_data_nonintersect = [], []
            for d in test_data.data:
                if d.smiles in overlap_smiles:
                    test_data_intersect.append(d)
                else:
                    test_data_nonintersect.append(d)
            test_data_intersect = MoleculeDataset(test_data_intersect)
            test_data_nonintersect = MoleculeDataset(test_data_nonintersect)
            for name, td in [('Intersect', test_data_intersect), ('Nonintersect', test_data_nonintersect)]:
                test_preds = predict(
                    model=model,
                    data=td,
                    args=args,
                    scaler=scaler,
                    logger=logger
                )
                test_scores = evaluate_predictions(
                    preds=test_preds,
                    targets=td.targets(),
                    metric_func=metric_func,
                    dataset_type=args.dataset_type,
                    args=args,
                    logger=logger
                )
                avg_test_score = np.nanmean(test_scores)
                info('Model {} test {} for {} = {:.6f}'.format(model_idx, args.metric, name, avg_test_score))

        if len(test_data) == 0:  # just get some garbage results without crashing; in this case we didn't care anyway
            test_preds, test_scores = sum_test_preds, [0 for _ in range(len(args.task_names))]
        else:
            test_preds = predict(
                model=model,
                data=test_data,
                args=args,
                scaler=scaler,
                logger=logger
            )
            test_scores = evaluate_predictions(
                preds=test_preds,
                targets=test_targets,
                metric_func=metric_func,
                dataset_type=args.dataset_type,
                args=args,
                logger=logger
            )

        if args.maml:
            if sum_test_preds is None:
                sum_test_preds = np.zeros(np.array(test_preds).shape)
        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                sum_test_preds['features'] += np.array(test_preds['features'])
            sum_test_preds['vocab'] += np.array(test_preds['vocab'])
        else:
            sum_test_preds += np.array(test_preds)

        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                debug('Model {} test features rmse = {:.6f}'.format(model_idx, test_scores['features']))
                writer.add_scalar('test_features_rmse', test_scores['features'], 0)
            test_scores = [test_scores['vocab']]

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info('Model {} test {} = {:.6f}'.format(model_idx, args.metric, avg_test_score))
        writer.add_scalar('test_{}'.format(args.metric), avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                if task_name in desired_labels:
                    info('Model {} test {} {} = {:.6f}'.format(model_idx, task_name, args.metric, test_score))
                    writer.add_scalar('test_{}_{}'.format(task_name, args.metric), test_score, n_iter)

    # Evaluate ensemble on test set
    if args.dataset_type == 'bert_pretraining':
        avg_test_preds = {
            'features': (sum_test_preds['features'] / args.ensemble_size).tolist()
                        if sum_test_preds['features'] is not None else None,
            'vocab': (sum_test_preds['vocab'] / args.ensemble_size).tolist()
        }
    else:
        avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    if len(test_data) == 0:  # just return some garbage when we didn't want test data
        ensemble_scores = test_scores
    else:
        ensemble_scores = evaluate_predictions(
            preds=avg_test_preds,
            targets=test_targets,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            args=args,
            logger=logger
        )

    # Average ensemble score
    if args.dataset_type == 'bert_pretraining':
        if ensemble_scores['features'] is not None:
            info('Ensemble test features rmse = {:.6f}'.format(ensemble_scores['features']))
            writer.add_scalar('ensemble_test_features_rmse', ensemble_scores['features'], 0)
        ensemble_scores = [ensemble_scores['vocab']]

    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info('Ensemble test {} = {:.6f}'.format(args.metric, avg_ensemble_test_score))
    writer.add_scalar('ensemble_test_{}'.format(args.metric), avg_ensemble_test_score, 0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info('Ensemble test {} {} = {:.6f}'.format(task_name, args.metric, ensemble_score))

    return ensemble_scores

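# Note that the ensemble logic above accumulates raw predictions across models
# and scores the mean, rather than averaging per-model scores; for nonlinear
# metrics such as AUC the two are not equivalent. A minimal numpy sketch of the
# accumulate-then-average pattern, with predict_fn standing in (hypothetically)
# for the per-model forward pass:
def ensemble_predict(models, predict_fn, num_molecules: int, num_tasks: int):
    """Average raw predictions over an ensemble (hypothetical helper)."""
    sum_preds = np.zeros((num_molecules, num_tasks))
    for model in models:
        sum_preds += np.array(predict_fn(model))  # (num_molecules, num_tasks) per model
    return (sum_preds / len(models)).tolist()
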
def run_evaluation(args: Namespace, logger: Logger = None):
    """
    Evaluates a saved model.

    :param args: Set of args.
    :param logger: Logger for recording output.
    """
    # Set up logger
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    # Load metadata
    with open(args.data_path, 'r') as f:
        metadata = json.load(f)

    # Train/val/test split (must mirror the split used at training time)
    train_metadata, remaining_metadata = train_test_split(metadata, test_size=0.3, random_state=0)
    validation_metadata, test_metadata = train_test_split(remaining_metadata, test_size=0.5, random_state=0)

    # Load data
    debug('Loading data')
    transform = Compose([
        Augmentation(args.augmentation_length),
        NNGraph(args.num_neighbors),
        Distance(False)
    ])
    test_data = GlassDataset(test_metadata, transform=transform)
    args.atom_fdim = 3
    args.bond_fdim = args.atom_fdim + 1

    # Dataset lengths
    test_data_length = len(test_data)
    debug('test size = {:,}'.format(test_data_length))

    # Convert to iterators
    test_data = DataLoader(test_data, args.batch_size)

    # Get metric function
    metric_func = get_metric_func(args.metric)

    # Test ensemble of models
    for model_idx in range(args.ensemble_size):
        # Load model (a checkpoint is required for evaluation)
        if args.checkpoint_paths is not None:
            debug('Loading model {} from {}'.format(model_idx, args.checkpoint_paths[model_idx]))
            model = load_checkpoint(args.checkpoint_paths[model_idx], args.save_dir,
                                    cuda=args.cuda, attention_viz=args.attention_viz)
        else:
            debug('Must specify a model to load')
            exit(1)

        debug(model)
        debug('Number of parameters = {:,}'.format(param_count(model)))

        # Evaluate on test set using model with best validation score
        test_scores = []
        for test_run in range(args.num_test_runs):
            test_batch_scores = evaluate(model=model, data=test_data, metric_func=metric_func, args=args)
            test_scores.append(np.mean(test_batch_scores))

        # Average test score
        avg_test_score = np.mean(test_scores)
        info('Model {} test {} = {:.3f}'.format(model_idx, args.metric, avg_test_score))

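# run_evaluation recovers the held-out test set by replaying the same
# train_test_split calls with the same fixed random_state used at training
# time, rather than loading saved indices; this only works if the metadata
# ordering is unchanged. A small illustrative check of that determinism:
from sklearn.model_selection import train_test_split

items = list(range(100))
# Same inputs + same random_state -> the identical split on every call.
_, held_out_a = train_test_split(items, test_size=0.3, random_state=0)
_, held_out_b = train_test_split(items, test_size=0.3, random_state=0)
assert held_out_a == held_out_b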