def make_predictions(args: PredictArgs, smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on the file at :code:`args.test_path`.

    The averaged ensemble predictions are also written as a CSV file to :code:`args.preds_path`,
    one row per input datapoint; datapoints with an invalid SMILES get the string
    ``'Invalid SMILES'`` in every prediction column.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions (one entry per *valid* datapoint), or
             ``[None] * len(full_data)`` when no datapoint has a valid SMILES.
    :raises ValueError: If the use of features or atom descriptors differs between the
                        training arguments stored in the checkpoint and :code:`args`.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    trained_with_features = (train_args.features_path is not None
                             or train_args.features_generator is not None)
    if trained_with_features and args.features_path is None and args.features_generator is None:
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # If atom-descriptors were used during training, they must be used when predicting and vice-versa
    if train_args.atom_descriptors != args.atom_descriptors:
        raise ValueError(
            'The use of atom descriptors is inconsistent between training and prediction. If atom descriptors '
            ' were used during training, they must be specified again during prediction using the same type of '
            ' descriptors as before. If they were not used during training, they cannot be specified during prediction.'
        )

    # Update predict args with training arguments to create a merged args object
    # (attributes already present on args are never overwritten)
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=not args.drop_extra_columns)

    print('Validating SMILES')
    # A datapoint is valid only when every one of its molecules was parsed (is not None).
    valid_full_indices = [
        full_index for full_index, datapoint in enumerate(full_data)
        if all(mol is not None for mol in datapoint.mol)
    ]
    # Maps the position in full_data to the position in test_data / avg_preds.
    full_to_valid_indices = {
        full_index: valid_index for valid_index, full_index in enumerate(valid_full_indices)
    }

    test_data = MoleculeDataset([full_data[i] for i in valid_full_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    is_multiclass = args.dataset_type == 'multiclass'
    if is_multiclass:
        sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler = load_scalers(checkpoint_path)

        # Normalize features; reset first so each checkpoint's scaler is applied to the raw
        # features rather than to features already scaled for a previous checkpoint
        if args.features_scaling:
            test_data.reset_features_and_targets()
            test_data.normalize_features(features_scaler)

        # Make predictions
        model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if is_multiclass:
        pred_columns = [
            f'{name}_class_{i}'
            for name in task_names
            for i in range(args.multiclass_num_classes)
        ]
    else:
        pred_columns = task_names

    # If extra columns have been dropped, the SMILES columns are added back in below.
    # Resolve their names once instead of re-reading the file header for every datapoint.
    smiles_columns = None
    if args.drop_extra_columns:
        smiles_columns = args.smiles_columns
        if None in smiles_columns:
            smiles_columns = get_header(args.test_path)[:len(smiles_columns)]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        if valid_index is None:
            preds = ['Invalid SMILES'] * len(pred_columns)
        elif is_multiclass:
            # Multiclass predictions are nested as [task][class]; flatten them in the same
            # task-major order as pred_columns so that every class gets its own column.
            preds = [class_pred for task_preds in avg_preds[valid_index] for class_pred in task_preds]
        else:
            preds = avg_preds[valid_index]

        if args.drop_extra_columns:
            datapoint.row = OrderedDict()
            for column, smiles_string in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles_string

        # Add predictions columns
        for pred_name, pred in zip(pred_columns, preds):
            datapoint.row[pred_name] = pred

    # Save; newline='' lets the csv module control line endings (avoids blank rows on Windows)
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score.

    Two early exits return placeholder values instead of real scores:
    when :code:`args.save_smiles_splits` is set the function only writes the split files and
    returns a list of ones (one per task); when :code:`args.dataset_type == 'unsupervised'`
    it returns :code:`[0]` after training the first ensemble member.

    :param args: Arguments.
    :param logger: Logger. If None, messages are printed instead.
    :return: A list of ensemble scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    desired_labels = get_desired_labels(args, args.task_names)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    args.real_num_tasks = args.num_tasks - args.features_size if args.predict_features else args.num_tasks
    debug(f'Number of tasks = {args.num_tasks}')

    if args.dataset_type == 'bert_pretraining':
        data.bert_init(args, logger)

    # Split data
    if args.dataset_type == 'regression_with_binning':
        # Note: for now, binning based on whole dataset, not just training set
        data, bin_predictions, regression_data = data
        args.bin_predictions = bin_predictions
        debug(f'Splitting data with seed {args.seed}')
        # Train on the binned data but validate/test on the regression data (same split settings)
        train_data, _, _ = split_data(data=data,
                                      split_type=args.split_type,
                                      sizes=args.split_sizes,
                                      seed=args.seed,
                                      args=args,
                                      logger=logger)
        _, val_data, test_data = split_data(regression_data,
                                            split_type=args.split_type,
                                            sizes=args.split_sizes,
                                            seed=args.seed,
                                            args=args,
                                            logger=logger)
    else:
        debug(f'Splitting data with seed {args.seed}')
        if args.separate_test_set:
            test_data = get_data(path=args.separate_test_set,
                                 args=args,
                                 features_path=args.separate_test_set_features,
                                 logger=logger)
            if args.separate_val_set:
                val_data = get_data(path=args.separate_val_set,
                                    args=args,
                                    features_path=args.separate_val_set_features,
                                    logger=logger)
                train_data = data  # nothing to split; we already got our test and val sets
            else:
                train_data, val_data, _ = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=(0.8, 0.2, 0.0),
                                                     seed=args.seed,
                                                     args=args,
                                                     logger=logger)
        else:
            train_data, val_data, test_data = split_data(data=data,
                                                         split_type=args.split_type,
                                                         sizes=args.split_sizes,
                                                         seed=args.seed,
                                                         args=args,
                                                         logger=logger)

    # Optionally replace test data with train or val data
    if args.test_split == 'train':
        test_data = train_data
    elif args.test_split == 'val':
        test_data = val_data

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

        if args.class_balance:
            train_class_sizes = get_class_sizes(train_data)
            class_batch_counts = torch.Tensor(train_class_sizes) * args.batch_size
            args.class_weights = 1 / torch.Tensor(class_batch_counts)

    if args.save_smiles_splits:
        # Index the raw data file by the SMILES in its first column.
        # newline='' is what the csv module expects for files it reads or writes.
        with open(args.data_path, 'r', newline='') as f:
            reader = csv.reader(f)
            header = next(reader)

            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = line[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'), (test_data, 'test')]:
            dataset_smiles = dataset.smiles()

            with open(os.path.join(args.save_dir, name + '_smiles.csv'), 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(['smiles'])
                for smiles in dataset_smiles:
                    writer.writerow([smiles])

            with open(os.path.join(args.save_dir, name + '_full.csv'), 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                for smiles in dataset_smiles:
                    writer.writerow(lines_by_smiles[smiles])

            all_split_indices.append(sorted(indices_by_smiles[smiles] for smiles in dataset_smiles))

        with open(os.path.join(args.save_dir, 'split_indices.pckl'), 'wb') as f:
            pickle.dump(all_split_indices, f)

        return [1 for _ in range(args.num_tasks)]  # short circuit out when just generating splits

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=None if args.predict_features else 0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    if args.prespecified_chunk_dir is None:
        args.train_data_size = len(train_data)
    else:
        args.train_data_size = args.prespecified_chunks_max_examples_per_epoch

    if args.adversarial or args.moe:
        # Captured before train_data is possibly replaced by clusters/chunk paths below
        val_smiles, test_smiles = val_data.smiles(), test_data.smiles()

    debug(f'Total size = {len(data):,} | '
          f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Optionally truncate outlier values
    if args.truncate_outliers:
        debug('Truncating outliers in train set')
        train_data = truncate_outliers(train_data)

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation
    # (regression only)
    if args.dataset_type == 'regression' and args.target_scaling:
        debug('Fitting scaler')
        train_targets = train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    if args.moe:
        train_data = cluster_split(train_data,
                                   args.num_sources,
                                   args.cluster_max_ratio,
                                   seed=args.cluster_split_seed,
                                   logger=logger)

    # Chunk training data if too large to load in memory all at once
    if args.num_chunks > 1:
        os.makedirs(args.chunk_temp_dir, exist_ok=True)
        train_paths = []
        if args.moe:
            # train_data is iterated as a collection of sources here; chunk each source and
            # regroup so that chunk i holds the i-th chunk of every source
            chunked_sources = [td.chunk(args.num_chunks) for td in train_data]
            chunks = [[source[i] for source in chunked_sources] for i in range(args.num_chunks)]
        else:
            chunks = train_data.chunk(args.num_chunks)
        for i in range(args.num_chunks):
            chunk_path = os.path.join(args.chunk_temp_dir, str(i) + '.txt')
            memo_path = os.path.join(args.chunk_temp_dir, 'memo' + str(i) + '.txt')
            with open(chunk_path, 'wb') as f:
                pickle.dump(chunks[i], f)
            train_paths.append((chunk_path, memo_path))
        # From here on train_data is a list of (chunk_path, memo_path) pairs
        train_data = train_paths

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric, args=args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.maml:  # TODO refactor
        test_targets = []
        for _ in range(len(data.data[0].targets)):
            _, task_test_data, _ = test_data.sample_maml_task(args, seed=0)
            test_targets += task_test_data.targets()

    if args.dataset_type == 'bert_pretraining':
        sum_test_preds = {
            'features': np.zeros((len(test_smiles), args.features_size)) if args.features_size is not None else None,
            'vocab': np.zeros((len(test_targets['vocab']), args.vocab.output_size))
        }
    elif args.dataset_type == 'kernel':
        sum_test_preds = np.zeros((len(test_targets), args.num_tasks))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    if args.maml:
        sum_test_preds = None  # annoying to determine exact size; will initialize later

    if args.dataset_type == 'bert_pretraining':
        # Only predict targets that are masked out
        test_targets['vocab'] = [
            target if mask == 0 else None
            for target, mask in zip(test_targets['vocab'], test_data.mask())
        ]

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        os.makedirs(save_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx], current_args=args, logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        if args.adjust_weight_decay:
            args.pnorm_target = compute_pnorm(model)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            if args.prespecified_chunk_dir is not None:
                # load some different random chunks each epoch
                train_data, val_data = load_prespecified_chunks(args, logger)
                debug('Loaded prespecified chunks for epoch')

            if args.dataset_type == 'unsupervised':  # won't work with moe
                full_data = MoleculeDataset(train_data.data + val_data.data)
                # cluster with a new random init
                generate_unsupervised_cluster_labels(build_model(args), full_data, args)
                # reset the ffn since we're changing targets-- we're just pretraining the encoder.
                model.create_ffn(args)
                optimizer.param_groups.pop()  # remove ffn parameters
                optimizer.add_param_group({
                    'params': model.ffn.parameters(),
                    'lr': args.init_lr[1],
                    'weight_decay': args.weight_decay[1]
                })
                if args.cuda:
                    model.ffn.cuda()

            if args.gradual_unfreezing:
                if epoch % args.epochs_per_unfreeze == 0:
                    # consider just stopping early after we have nothing left to unfreeze?
                    unfroze_layer = model.unfreeze_next()
                    if unfroze_layer:
                        debug('Unfroze last frozen layer')

            n_iter = train(model=model,
                           data=train_data,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer,
                           chunk_names=(args.num_chunks > 1),
                           val_smiles=val_smiles if args.adversarial else None,
                           test_smiles=test_smiles if args.adversarial or args.moe else None)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data=val_data,
                                  metric_func=metric_func,
                                  args=args,
                                  scaler=scaler,
                                  logger=logger)

            if args.dataset_type == 'bert_pretraining':
                if val_scores['features'] is not None:
                    debug(f'Validation features rmse = {val_scores["features"]:.6f}')
                    writer.add_scalar('validation_features_rmse', val_scores['features'], n_iter)
                # Only the vocab score takes part in model selection below
                val_scores = [val_scores['vocab']]

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    if task_name in desired_labels:
                        debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
                        writer.add_scalar(f'validation_{task_name}_{args.metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score, or always save it if unsupervised
            if ((args.minimize_score and avg_val_score < best_score)
                    or (not args.minimize_score and avg_val_score > best_score)
                    or args.dataset_type == 'unsupervised'):
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        if args.dataset_type == 'unsupervised':
            return [0]  # rest of this is meaningless when unsupervised

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda, logger=logger)

        if args.split_test_by_overlap_dataset is not None:
            # Report scores separately for test molecules that do / do not also appear
            # in the overlap dataset
            overlap_data = get_data(path=args.split_test_by_overlap_dataset, logger=logger)
            overlap_smiles = set(overlap_data.smiles())
            test_data_intersect, test_data_nonintersect = [], []
            for d in test_data.data:
                if d.smiles in overlap_smiles:
                    test_data_intersect.append(d)
                else:
                    test_data_nonintersect.append(d)
            test_data_intersect = MoleculeDataset(test_data_intersect)
            test_data_nonintersect = MoleculeDataset(test_data_nonintersect)
            for name, td in [('Intersect', test_data_intersect), ('Nonintersect', test_data_nonintersect)]:
                test_preds = predict(model=model, data=td, args=args, scaler=scaler, logger=logger)
                test_scores = evaluate_predictions(preds=test_preds,
                                                   targets=td.targets(),
                                                   metric_func=metric_func,
                                                   dataset_type=args.dataset_type,
                                                   args=args,
                                                   logger=logger)
                avg_test_score = np.nanmean(test_scores)
                info(f'Model {model_idx} test {args.metric} for {name} = {avg_test_score:.6f}')

        if len(test_data) == 0:
            # just get some garbage results without crashing; in this case we didn't care anyway
            test_preds, test_scores = sum_test_preds, [0 for _ in range(len(args.task_names))]
        else:
            test_preds = predict(model=model, data=test_data, args=args, scaler=scaler, logger=logger)
            test_scores = evaluate_predictions(preds=test_preds,
                                               targets=test_targets,
                                               metric_func=metric_func,
                                               dataset_type=args.dataset_type,
                                               args=args,
                                               logger=logger)

        if args.maml:
            # Accumulator could not be sized up front; size it from the first predictions
            if sum_test_preds is None:
                sum_test_preds = np.zeros(np.array(test_preds).shape)

        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                sum_test_preds['features'] += np.array(test_preds['features'])
            sum_test_preds['vocab'] += np.array(test_preds['vocab'])
        else:
            sum_test_preds += np.array(test_preds)

        if args.dataset_type == 'bert_pretraining':
            if test_preds['features'] is not None:
                debug(f'Model {model_idx} test features rmse = {test_scores["features"]:.6f}')
                writer.add_scalar('test_features_rmse', test_scores['features'], 0)
            test_scores = [test_scores['vocab']]

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                if task_name in desired_labels:
                    info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
                    writer.add_scalar(f'test_{task_name}_{args.metric}', test_score, n_iter)

    # Evaluate ensemble on test set
    if args.dataset_type == 'bert_pretraining':
        avg_test_preds = {
            'features': (sum_test_preds['features'] / args.ensemble_size).tolist()
            if sum_test_preds['features'] is not None else None,
            'vocab': (sum_test_preds['vocab'] / args.ensemble_size).tolist()
        }
    else:
        avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    if len(test_data) == 0:  # just return some garbage when we didn't want test data
        ensemble_scores = test_scores
    else:
        ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                               targets=test_targets,
                                               metric_func=metric_func,
                                               dataset_type=args.dataset_type,
                                               args=args,
                                               logger=logger)

    # Average ensemble score
    # (the scalars below go to the writer of the last ensemble member)
    if args.dataset_type == 'bert_pretraining':
        if ensemble_scores['features'] is not None:
            info(f'Ensemble test features rmse = {ensemble_scores["features"]:.6f}')
            writer.add_scalar('ensemble_test_features_rmse', ensemble_scores['features'], 0)
        ensemble_scores = [ensemble_scores['vocab']]

    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score, 0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')

    return ensemble_scores
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score.

    For classification datasets the function may return :code:`[float('nan')]` early
    (see the review note at the class-size check below).

    :param args: Arguments.
    :param logger: Logger. If None, messages are printed instead.
    :return: A list of ensemble scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    # Only split off the parts that were not supplied as separate files
    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    elif args.split_type == 'loocv':
        train_data, val_data, test_data = split_loocv(data=data, args=args, logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=args.split_sizes,
                                                     seed=args.seed,
                                                     args=args,
                                                     logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(test_data)
        debug('Class sizes in test set')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')
            # NOTE(review): task_class_sizes is iterated as a sequence just above, so comparing
            # it to the scalar 0 looks like it can never be true for a plain list -- presumably
            # the intent is "some class has size 0"; confirm against get_class_sizes before changing.
            if not args.train_all and task_class_sizes == 0:  # TODO: only works for just 1 property prediction task
                debug('Moved to next epoch due to homogenous targets in test set.')
                return [float('nan')]

    if args.save_smiles_splits:
        # Index the raw data file by the pair formed from its first two columns.
        # newline='' is what the csv module expects for files it reads or writes.
        with open(args.data_path, 'r', newline='') as f:
            reader = csv.reader(f)
            header = next(reader)

            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = (line[0], line[1])
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'), (test_data, 'test')]:
            dataset_smiles = dataset.smiles()

            with open(os.path.join(args.save_dir, name + '_smiles.csv'), 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(['smiles'])
                for smiles in dataset_smiles:
                    writer.writerow([smiles])

            with open(os.path.join(args.save_dir, name + '_full.csv'), 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                for smiles in dataset_smiles:
                    writer.writerow(lines_by_smiles[smiles])

            all_split_indices.append(sorted(indices_by_smiles[smiles] for smiles in dataset_smiles))

        with open(os.path.join(args.save_dir, 'split_indices.pckl'), 'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.symmetric:
        train_data = flip_data(train_data)

    if args.features_scaling:
        # Scalers are fitted on the training set and then applied to val/test
        drug_scaler, cmpd_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(drug_scaler, cmpd_scaler)
        test_data.normalize_features(drug_scaler, cmpd_scaler)
    else:
        drug_scaler, cmpd_scaler = None, None

    args.train_data_size = len(train_data)

    debug(f'Total size = {len(data):,} | '
          f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation
    # (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_targets = train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # Fall back for SummaryWriter implementations that name the keyword `logdir`;
            # an unknown keyword argument surfaces as TypeError, other errors should propagate
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx], current_args=args, logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, drug_scaler, cmpd_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data=train_data,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores, val_loss = evaluate(model=model,
                                            data=val_data,
                                            loss_func=loss_func,
                                            num_tasks=args.num_tasks,
                                            metric_func=metric_func,
                                            batch_size=args.batch_size,
                                            dataset_type=args.dataset_type,
                                            scaler=scaler,
                                            logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score, n_iter)

            debug(f'Validation loss = {val_loss:.6f}')
            writer.add_scalar('validation_loss', val_loss, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
                    writer.add_scalar(f'validation_{task_name}_{args.metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score
            if ((args.minimize_score and avg_val_score < best_score)
                    or (not args.minimize_score and avg_val_score > best_score)):
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, drug_scaler, cmpd_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda, logger=logger)

        test_preds = predict(model=model,
                             data=test_data,
                             batch_size=args.batch_size,
                             scaler=scaler)

        if args.save_preds:
            val_preds = predict(model=model, data=val_data, batch_size=args.batch_size, scaler=scaler)
            train_preds = predict(model=model, data=train_data, batch_size=args.batch_size, scaler=scaler)
            save_predictions(save_dir, train_data, val_data, test_data,
                             train_preds, val_preds, test_preds, scaler)

        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        # Skip accumulation when there are no predictions (empty test set)
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
                writer.add_scalar(f'test_{task_name}_{args.metric}', test_score, n_iter)

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    # (the scalar below goes to the writer of the last ensemble member)
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score, 0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')

    return ensemble_scores
def run_training(args: TrainArgs, logger: Logger = None) -> List[float]:
    """
    Loads data, trains an ensemble of Chemprop models, and returns the ensemble test scores.

    For every ensemble member the checkpoint with the best validation score is reloaded and
    evaluated on the test set; the per-model test predictions are averaged to score the ensemble.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param logger: A logger to record output. If None, messages are printed.
    :return: A list of ensemble test scores, one for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Get data
    debug('Loading data')
    data = get_data(path=args.data_path, args=args, logger=logger)
    validate_dataset_type(data, dataset_type=args.dataset_type)
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    # Only the splits that were not supplied as separate files are carved out of `data`
    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=args.split_sizes,
                                                     seed=args.seed,
                                                     args=args,
                                                     logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

    if args.save_smiles_splits:
        save_smiles_splits(train_data=train_data,
                           val_data=val_data,
                           test_data=test_data,
                           data_path=args.data_path,
                           save_dir=args.save_dir,
                           smiles_column=args.smiles_column)

    # The features scaler is fit on the training split and then applied to val/test
    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(f'Total size = {len(data):,} | '
          f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_targets = train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers,
                                         cache=cache)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers,
                                          cache=cache)

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # Fall back to the alternative keyword spelling when `log_dir` is rejected.
            # Only the unknown-keyword TypeError is caught so that real failures
            # (and KeyboardInterrupt/SystemExit) are not silently swallowed.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            # ExponentialLR is stepped once per epoch here; other schedulers are passed to train()
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
                    writer.add_scalar(f'validation_{task_name}_{args.metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), device=args.device, logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        # Accumulate predictions for the ensemble average (skipped when the test set is empty)
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
                writer.add_scalar(f'test_{task_name}_{args.metric}', test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')

    return ensemble_scores
def visualize_encoding_property_space(args: Namespace):
    """
    Saves ternary heatmaps of predicted property values interpolated between molecule encodings.

    For each of :code:`args.num_examples` examples, three consecutive molecules are taken from the
    (sorted or shuffled) dataset, their encodings are linearly interpolated, and the model's
    prediction at every interpolation point is drawn as a heatmap saved to
    :code:`args.save_dir/<i>.png`.

    :param args: Arguments. Reads :code:`data_path`, :code:`similarity_measure`, :code:`task_index`,
                 :code:`seed`, :code:`checkpoint_path`, :code:`num_examples` and :code:`save_dir`.
    :raises ValueError: If :code:`args.similarity_measure` is not supported, or if examples are
                        requested from a dataset with fewer than three molecules.
    """
    # Load data
    data = get_data(path=args.data_path)

    # Sort according to similarity measure
    if args.similarity_measure == 'property':
        data.sort(key=lambda d: d.targets[args.task_index])
    elif args.similarity_measure == 'random':
        data.shuffle(args.seed)
    else:
        raise ValueError(f'similarity_measure "{args.similarity_measure}" not supported or not implemented yet.')

    # Load model and scalers
    model = load_checkpoint(args.checkpoint_path)
    scaler, features_scaler = load_scalers(args.checkpoint_path)
    data.normalize_features(features_scaler)

    # Random seed
    if args.seed is not None:
        random.seed(args.seed)

    # Each example needs a molecule plus both of its neighbours; fail with a clear message
    # instead of the opaque "empty range" error random.randint would raise below.
    if args.num_examples > 0 and len(data) < 3:
        raise ValueError(f'At least 3 molecules are required to build a visualization but only '
                         f'{len(data)} were loaded.')

    # Generate visualizations
    for i in trange(args.num_examples):
        # Get random three molecules with similar properties
        index = random.randint(1, len(data) - 2)
        molecules = MoleculeDataset(data[index - 1:index + 2])
        molecule_targets = [t[args.task_index] for t in molecules.targets()]

        # Encode three molecules
        molecule_encodings = model.encoder(molecules.smiles())

        # Define interpolation
        def predict_property(point: List[int]) -> float:
            # Return true value on endpoints of triangle
            argmax = np.argmax(point)
            if point[argmax] == 1:
                return molecule_targets[argmax]

            # Interpolate and predict task value
            encoding = sum(point[j] * molecule_encodings[j] for j in range(len(molecule_encodings)))
            pred = model.ffn(encoding).data.cpu().numpy()
            # NOTE(review): presumably the checkpoint stores no target scaler for non-regression
            # models; in that case the raw model output is used - confirm against load_scalers.
            if scaler is not None:
                pred = scaler.inverse_transform(pred)
            pred = pred.item()

            return pred

        # Create visualization
        scale = 20
        fontsize = 6

        figure, tax = ternary.figure(scale=scale)
        try:
            tax.heatmapf(predict_property, boundary=True, style="hexagonal")
            tax.set_title("Property Prediction")
            tax.right_axis_label(f'{molecules[0].smiles} ({molecules[0].targets[args.task_index]:.6f}) -->',
                                 fontsize=fontsize)
            tax.left_axis_label(f'{molecules[1].smiles} ({molecules[1].targets[args.task_index]:.6f}) -->',
                                fontsize=fontsize)
            tax.bottom_axis_label(f'<-- {molecules[2].smiles} ({molecules[2].targets[args.task_index]:.6f})',
                                  fontsize=fontsize)

            tax.savefig(os.path.join(args.save_dir, f'{i}.png'))
        finally:
            # Release the figure so that figures do not accumulate across examples
            tax.close()
for i, l in enumerate(f): pass return i + 1 def counter(f): acc = 0 if os.path.isdir(f): for file in os.listdir(f): if file.endswith('.csv') or file.endswith(".txt"): l = file_len(os.path.join(f, file)) acc += l - 1 return acc if os.path.isfile(f): if f.endswith('.csv') or f.endswith(".txt"): l = file_len(f) acc += l return (acc - 1) if __name__ == "__main__": model = load_checkpoint( "../multi_task_subfamily_dmpnn_25/fold_0/model_0/model.pt") df = pd.read_csv("../chembl27/chembl27-all.tsv", sep="\t", header=None).dropna() for i in range(6): featurize_file(input_df=df[i * 220000:(i + 1) * 220000], output_path="../data/chembl27-all-features_" + str(i) + ".csv", pretrained_model=model)
def run_training(args: TrainArgs,
                 data: MoleculeDataset,
                 logger: Logger = None) -> Dict[str, List[float]]:
    """
    Trains an ensemble of Chemprop models on the given data and returns the ensemble test scores.

    For every ensemble member the checkpoint with the best validation score (according to
    :code:`args.metric`) is reloaded and evaluated on the test set; per-model test predictions are
    averaged to score the ensemble. The ensemble scores are written to :code:`test_scores.json`
    in :code:`args.save_dir`, and the averaged test predictions to :code:`test_preds.csv` when
    :code:`args.save_preds` is set.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output. If None, messages are printed.
    :return: A dictionary mapping each metric in :code:`args.metrics` to a list of values for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             atom_descriptors_path=args.separate_test_atom_descriptors_path,
                             bond_features_path=args.separate_test_bond_features_path,
                             phase_features_path=args.separate_test_phase_features_path,
                             smiles_columns=args.smiles_columns,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            atom_descriptors_path=args.separate_val_atom_descriptors_path,
                            bond_features_path=args.separate_val_bond_features_path,
                            phase_features_path=args.separate_val_phase_features_path,
                            smiles_columns=args.smiles_columns,
                            logger=logger)

    # Only the splits that were not supplied as separate files are carved out of `data`
    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=args.split_sizes,
                                              key_molecule_index=args.split_key_molecule,
                                              seed=args.seed,
                                              num_folds=args.num_folds,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=args.split_sizes,
                                             key_molecule_index=args.split_key_molecule,
                                             seed=args.seed,
                                             num_folds=args.num_folds,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data,
                                                     split_type=args.split_type,
                                                     sizes=args.split_sizes,
                                                     key_molecule_index=args.split_key_molecule,
                                                     seed=args.seed,
                                                     num_folds=args.num_folds,
                                                     args=args,
                                                     logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

    if args.save_smiles_splits:
        save_smiles_splits(
            data_path=args.data_path,
            save_dir=args.save_dir,
            task_names=args.task_names,
            features_path=args.features_path,
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
            smiles_columns=args.smiles_columns,
            logger=logger,
        )

    # Each scaler is fit on the training split and then applied to val/test
    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    if args.atom_descriptor_scaling and args.atom_descriptors is not None:
        atom_descriptor_scaler = train_data.normalize_features(replace_nan_token=0, scale_atom_descriptors=True)
        val_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
        test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
    else:
        atom_descriptor_scaler = None

    if args.bond_feature_scaling and args.bond_features_size > 0:
        bond_feature_scaler = train_data.normalize_features(replace_nan_token=0, scale_bond_features=True)
        val_data.normalize_features(bond_feature_scaler, scale_bond_features=True)
        test_data.normalize_features(bond_feature_scaler, scale_bond_features=True)
    else:
        bond_feature_scaler = None

    args.train_data_size = len(train_data)

    debug(f'Total size = {len(data):,} | '
          f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = train_data.normalize_targets()
    elif args.dataset_type == 'spectra':
        debug('Normalizing spectra and excluding spectra regions based on phase')
        args.spectra_phase_mask = load_phase_mask(args.spectra_phase_mask_path)
        for dataset in [train_data, test_data, val_data]:
            data_targets = normalize_spectra(
                spectra=dataset.targets(),
                phase_features=dataset.phase_features(),
                phase_mask=args.spectra_phase_mask,
                excluded_sub_value=None,
                threshold=args.spectra_target_floor,
            )
            dataset.set_targets(data_targets)
        scaler = None
    else:
        scaler = None

    # Get loss function
    loss_func = get_loss_func(args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        set_cache_graph(True)
        num_workers = 0
    else:
        set_cache_graph(False)
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(
        dataset=train_data,
        batch_size=args.batch_size,
        num_workers=num_workers,
        class_balance=args.class_balance,
        shuffle=True,
        seed=args.seed
    )
    val_data_loader = MoleculeDataLoader(
        dataset=val_data,
        batch_size=args.batch_size,
        num_workers=num_workers
    )
    test_data_loader = MoleculeDataLoader(
        dataset=test_data,
        batch_size=args.batch_size,
        num_workers=num_workers
    )

    if args.class_balance:
        debug(f'With class_balance, effective train size = {train_data_loader.iter_size:,}')

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # Fall back to the alternative keyword spelling when `log_dir` is rejected.
            # Only the unknown-keyword TypeError is caught so that real failures
            # (and KeyboardInterrupt/SystemExit) are not silently swallowed.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        # Optionally, overwrite weights:
        if args.checkpoint_frzn is not None:
            debug(f'Loading and freezing parameters from {args.checkpoint_frzn}.')
            model = load_frzn_model(model=model, path=args.checkpoint_frzn, current_args=args, logger=logger)

        debug(model)

        if args.checkpoint_frzn is not None:
            debug(f'Number of unfrozen parameters = {param_count(model):,}')
            debug(f'Total number of parameters = {param_count_all(model):,}')
        else:
            debug(f'Number of parameters = {param_count_all(model):,}')

        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler,
                        features_scaler, atom_descriptor_scaler, bond_feature_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')
            n_iter = train(
                model=model,
                data_loader=train_data_loader,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer
            )
            # ExponentialLR is stepped once per epoch here; other schedulers are passed to train()
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(
                model=model,
                data_loader=val_data_loader,
                num_tasks=args.num_tasks,
                metrics=args.metrics,
                dataset_type=args.dataset_type,
                scaler=scaler,
                logger=logger
            )

            for metric, scores in val_scores.items():
                # Average validation score
                avg_val_score = np.nanmean(scores)
                debug(f'Validation {metric} = {avg_val_score:.6f}')
                writer.add_scalar(f'validation_{metric}', avg_val_score, n_iter)

                if args.show_individual_scores:
                    # Individual validation scores
                    for task_name, val_score in zip(args.task_names, scores):
                        debug(f'Validation {task_name} {metric} = {val_score:.6f}')
                        writer.add_scalar(f'validation_{task_name}_{metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score
            # (model selection uses args.metric only, not the other logged metrics)
            avg_val_score = np.nanmean(val_scores[args.metric])
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler, features_scaler,
                                atom_descriptor_scaler, bond_feature_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), device=args.device, logger=logger)

        test_preds = predict(
            model=model,
            data_loader=test_data_loader,
            scaler=scaler
        )
        test_scores = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metrics=args.metrics,
            dataset_type=args.dataset_type,
            logger=logger
        )

        # Accumulate predictions for the ensemble average (skipped when the test set is empty)
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        for metric, scores in test_scores.items():
            avg_test_score = np.nanmean(scores)
            info(f'Model {model_idx} test {metric} = {avg_test_score:.6f}')
            writer.add_scalar(f'test_{metric}', avg_test_score, 0)

            if args.show_individual_scores and args.dataset_type != 'spectra':
                # Individual test scores
                for task_name, test_score in zip(args.task_names, scores):
                    info(f'Model {model_idx} test {task_name} {metric} = {test_score:.6f}')
                    writer.add_scalar(f'test_{task_name}_{metric}', test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(
        preds=avg_test_preds,
        targets=test_targets,
        num_tasks=args.num_tasks,
        metrics=args.metrics,
        dataset_type=args.dataset_type,
        logger=logger
    )

    for metric, scores in ensemble_scores.items():
        # Average ensemble score
        avg_ensemble_test_score = np.nanmean(scores)
        info(f'Ensemble test {metric} = {avg_ensemble_test_score:.6f}')

        # Individual ensemble scores
        if args.show_individual_scores:
            for task_name, ensemble_score in zip(args.task_names, scores):
                info(f'Ensemble test {task_name} {metric} = {ensemble_score:.6f}')

    # Save scores
    with open(os.path.join(args.save_dir, 'test_scores.json'), 'w') as f:
        json.dump(ensemble_scores, f, indent=4, sort_keys=True)

    # Optionally save test preds
    if args.save_preds:
        test_preds_dataframe = pd.DataFrame(data={'smiles': test_data.smiles()})

        for i, task_name in enumerate(args.task_names):
            test_preds_dataframe[task_name] = [pred[i] for pred in avg_test_preds]

        test_preds_dataframe.to_csv(os.path.join(args.save_dir, 'test_preds.csv'), index=False)

    return ensemble_scores
def make_predictions(args: PredictArgs, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions with an ensemble of trained models and writes them to :code:`args.preds_path`.

    If smiles is provided, makes predictions on smiles. Otherwise makes predictions on the data
    in :code:`args.test_path`. When :code:`args.uncertainty` is set, an uncertainty estimator is
    used instead of plain ensemble averaging and uncertainty columns are written as well.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions for the valid molecules, or a list of None
             (one per input molecule) if no molecule is valid.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError('Features were used during training so they must be specified again during prediction '
                         'using the same type of features as before (with either --features_generator or '
                         '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator
        )
    else:
        full_data = get_data(path=args.test_path, args=args, target_columns=[], skip_invalid_smiles=False)

    print('Validating SMILES')
    # Maps the position of each valid molecule in full_data to its position in test_data
    full_to_valid_indices = {}
    valid_index = 0
    for full_index, datapoint in enumerate(full_data):
        if datapoint.mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Initialize uncertainty estimator
    if args.uncertainty:
        uncertainty_estimator = uncertainty_estimator_builder(args.uncertainty)(args, test_data, scaler)

    # Predict with each model individually and sum predictions
    if not args.uncertainty:
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(
        dataset=test_data,
        batch_size=args.batch_size,
        num_workers=args.num_workers
    )

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for N, checkpoint_path in tqdm(enumerate(args.checkpoint_paths), total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, device=args.device)
        # NOTE(review): this only flips the flag on the top-level module (submodules are left
        # untouched, unlike model.eval()) - presumably intentional for the uncertainty
        # estimators; confirm before changing.
        model.training = False

        if not args.uncertainty:
            model_preds = predict(
                model=model,
                data_loader=test_data_loader,
                scaler=scaler
            )
            sum_preds += np.array(model_preds)
        else:
            uncertainty_estimator.process_model(model, N)

    # Ensemble predictions
    if not args.uncertainty:
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()
    else:
        avg_preds, avg_UQ = uncertainty_estimator.calculate_UQ()
        # A tuple carries the two uncertainty components separately
        if isinstance(avg_UQ, tuple):
            aleatoric, epistemic = avg_UQ

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    # NOTE(review): for multiclass the column names are flattened per class while each row of
    # avg_preds is still nested per task - verify the written columns are as intended.
    if args.dataset_type == 'multiclass':
        task_names = [f'{name}_class_{i}' for name in task_names for i in range(args.multiclass_num_classes)]

    # Copy predictions over to full_data
    invalid_row = ['Invalid SMILES'] * len(task_names)
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else list(invalid_row)

        if args.uncertainty:
            if not args.split_UQ:
                cur_UQ = avg_UQ[valid_index] if valid_index is not None else list(invalid_row)
                datapoint.row['Uncertainty'] = cur_UQ
            else:
                cur_al = aleatoric[valid_index] if valid_index is not None else list(invalid_row)
                cur_ep = epistemic[valid_index] if valid_index is not None else list(invalid_row)
                datapoint.row['Aleatoric'] = cur_al
                datapoint.row['Epistemic'] = cur_ep

        # A plain list holds one prediction per column; anything else is stored under the first column
        if type(preds) is list:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred
        else:
            datapoint.row[task_names[0]] = preds

    # Save
    # newline='' is required by the csv module; without it extra blank rows are written on
    # platforms that translate '\n' to '\r\n'.
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def run_training(args: TrainArgs, logger: Logger = None) -> List[float]:
    """
    Trains an ensemble of models, samples predictions from each member and
    returns the test scores of the averaged predictions.

    For every ensemble member the checkpoint with the best validation score is
    reloaded and, depending on the flags on ``args`` (``swag``, ``sgld``,
    ``gp``, ``bbp``, ``dun``), handed to the matching training routine.
    ``args.samples`` prediction passes are then run on the test set; each pass
    is written to ``args.results_dir`` as ``.npz`` files and accumulated for
    the final average.

    :param args: Arguments. Mutated in place: ``task_names``, ``num_tasks``,
                 ``features_size`` and ``train_data_size`` are set here.
    :param logger: Logger. Only forwarded to the helper functions; messages
                   emitted by this function itself go through :func:`print`.
    :return: A list of scores (one per task) for the predictions averaged
             over all ensemble members and samples.
    """

    def _param_groups(net):
        # Parameter groups shared by both Adam instances below; the
        # log-noise parameter is excluded from weight decay.
        return [{
            'params': net.encoder.parameters()
        }, {
            'params': net.ffn.parameters()
        }, {
            'params': net.log_noise,
            'weight_decay': 0
        }]

    debug = info = print

    # Print command line and args
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Get data
    debug('Loading data')
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    train_data, val_data, test_data = split_data(data=data,
                                                 split_type=args.split_type,
                                                 sizes=args.split_sizes,
                                                 seed=args.seed,
                                                 args=args,
                                                 logger=logger)

    # The features scaler is fitted on the training split only and then
    # applied to the validation and test splits.
    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting mean and
    # dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_targets = train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = neg_log_like
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers,
                                         cache=cache)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers,
                                          cache=cache)

    ###########################################
    ########## Outer loop over ensemble members
    ###########################################
    for model_idx in range(args.ensemble_start_idx,
                           args.ensemble_start_idx + args.ensemble_size):

        # Set pytorch seed for random initial weights
        torch.manual_seed(args.pytorch_seeds[model_idx])

        ######## set up all logging ########
        # make save_dir
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)

        # make results_dir
        results_dir = os.path.join(args.results_dir, f'model_{model_idx}')
        makedirs(results_dir)

        # initialise wandb
        os.environ['WANDB_MODE'] = 'dryrun'
        wandb.init(name=args.wandb_name + '_' + str(model_idx),
                   project=args.wandb_proj,
                   reinit=True)
        print('WANDB directory is:')
        print(wandb.run.dir)
        ####################################

        # Load/build model
        if args.checkpoint_path is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_path}')
            model = load_checkpoint(args.checkpoint_path +
                                    f'/model_{model_idx}/model.pt',
                                    device=args.device,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        # Optimizer
        optimizer = Adam(_param_groups(model),
                         lr=args.init_lr,
                         weight_decay=args.weight_decay)

        # Learning rate scheduler
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in range(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger)

            val_scores = evaluate(model=model,
                                  data_loader=val_data_loader,
                                  args=args,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            # NOTE(review): the wandb key is fixed to "Validation MAE"
            # regardless of args.metric.
            wandb.log({"Validation MAE": avg_val_score})

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

            # After the last noam epoch, switch to a fresh Adam optimizer and
            # a constant schedule at the final learning rate.
            if epoch == args.noam_epochs - 1:
                optimizer = Adam(_param_groups(model),
                                 lr=args.final_lr,
                                 weight_decay=args.weight_decay)
                scheduler = scheduler_const([args.final_lr])

        # load model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                device=args.device,
                                logger=logger)

        # SWAG training loop, returns swag_model
        if args.swag:
            model = train_swag(model, train_data, val_data, num_workers,
                               cache, loss_func, metric_func, scaler,
                               features_scaler, args, save_dir)

        # SGLD loop, which saves nets
        if args.sgld:
            model = train_sgld(model, train_data, val_data, num_workers,
                               cache, loss_func, metric_func, scaler,
                               features_scaler, args, save_dir)

        # GP loop
        if args.gp:
            model, likelihood = train_gp(model, train_data, val_data,
                                         num_workers, cache, metric_func,
                                         scaler, features_scaler, args,
                                         save_dir)

        # BBP
        if args.bbp:
            model = train_bbp(model, train_data, val_data, num_workers, cache,
                              loss_func, metric_func, scaler, features_scaler,
                              args, save_dir)

        # DUN
        if args.dun:
            model = train_dun(model, train_data, val_data, num_workers, cache,
                              loss_func, metric_func, scaler, features_scaler,
                              args, save_dir)

        ##################################
        ########## Inner loop over samples
        ##################################
        for sample_idx in range(args.samples):

            # draw model from SWAG posterior
            if args.swag:
                model.sample(scale=1.0, cov=args.cov_mat, block=args.block)

            # draw model from collected SGLD models
            if args.sgld:
                model = load_checkpoint(os.path.join(
                    save_dir, f'model_{sample_idx}.pt'),
                                        device=args.device,
                                        logger=logger)

            # make predictions
            test_preds = predict(model=model,
                                 data_loader=test_data_loader,
                                 args=args,
                                 scaler=scaler,
                                 test_data=True,
                                 bbp_sample=True)

            ##### SAVING STUFF DOWN
            if args.gp:
                # get test_preds_std (scaled back to original data)
                test_preds_std = predict_std_gp(model=model,
                                                data_loader=test_data_loader,
                                                args=args,
                                                scaler=scaler,
                                                likelihood=likelihood)

                # 1 - MEANS
                np.savez(os.path.join(results_dir, f'preds_{sample_idx}'),
                         np.array(test_preds))

                # 2 - STD, combined aleatoric and epistemic (we save down the stds, always)
                np.savez(os.path.join(results_dir, f'predsSTDEV_{sample_idx}'),
                         np.array(test_preds_std))
            else:
                # save test_preds and aleatoric uncertainties
                if args.dun:
                    # softmax of the model's log_cat vector
                    log_cat = model.log_cat.detach().cpu().numpy()
                    cat = np.exp(log_cat) / np.sum(np.exp(log_cat))
                    np.savez(os.path.join(results_dir, f'cat_{sample_idx}'),
                             cat)

                    # samples from categorical dist and saves a depth MC sample
                    depth_sample = np.random.multinomial(1, cat).nonzero()[0][0]
                    test_preds_MCdepth = predict_MCdepth(
                        model=model,
                        data_loader=test_data_loader,
                        args=args,
                        scaler=scaler,
                        d=depth_sample)
                    np.savez(
                        os.path.join(results_dir,
                                     f'predsMCDEPTH_{sample_idx}'),
                        np.array(test_preds_MCdepth))

                if args.swag:
                    log_noise = model.base.log_noise
                else:
                    log_noise = model.log_noise

                # Rescale the noise by the target standard deviations of the
                # fitted scaler. When no scaler exists (dataset_type is not
                # 'regression') the targets were never standardized above, so
                # the noise is saved unscaled instead of failing on
                # ``None.stds``.
                target_stds = np.array(
                    scaler.stds) if scaler is not None else 1.0
                noise = np.exp(
                    log_noise.detach().cpu().numpy()) * target_stds

                np.savez(os.path.join(results_dir, f'preds_{sample_idx}'),
                         np.array(test_preds))
                np.savez(os.path.join(results_dir, f'noise_{sample_idx}'),
                         noise)

            # add predictions to sum_test_preds
            if len(test_preds) != 0:
                sum_test_preds += np.array(test_preds)

            # evaluate predictions using metric function
            test_scores = evaluate_predictions(preds=test_preds,
                                               targets=test_targets,
                                               num_tasks=args.num_tasks,
                                               metric_func=metric_func,
                                               dataset_type=args.dataset_type,
                                               logger=logger)

            # compute average test score
            avg_test_score = np.nanmean(test_scores)
            info(
                f'Model {model_idx}, sample {sample_idx} test {args.metric} = {avg_test_score:.6f}'
            )

    #################################
    ########## Bayesian Model Average
    #################################
    # note: this is an average over Bayesian samples AND components in an ensemble

    # compute number of prediction iterations
    pred_iterations = args.ensemble_size * args.samples

    # average predictions across iterations
    avg_test_preds = (sum_test_preds / pred_iterations).tolist()

    # evaluate
    BMA_scores = evaluate_predictions(preds=avg_test_preds,
                                      targets=test_targets,
                                      num_tasks=args.num_tasks,
                                      metric_func=metric_func,
                                      dataset_type=args.dataset_type,
                                      logger=logger)

    # average scores across tasks
    avg_BMA_test_score = np.nanmean(BMA_scores)
    info(f'BMA test {args.metric} = {avg_BMA_test_score:.6f}')

    return BMA_scores
def run_meta_training(args: TrainArgs, logger: Logger = None) -> List[float]:
    """
    Builds meta-learning task splits and trains an ensemble of models.

    The tasks in ``args.task_names`` are partitioned into train / validation /
    test task sets using the assay-type metadata pickles found under
    ``args.chembl_assay_metadata_pickle_path``; one ``MetaTaskDataLoader`` is
    created per task set. Afterwards an ensemble of ``args.ensemble_size``
    models is trained and evaluated.

    NOTE(review): the ensemble section still refers to names whose definitions
    are commented out in this function (see the note above that section).

    :param args: Arguments. Mutated in place: ``task_names``, ``num_tasks``
                 and ``features_size`` are set here.
    :param logger: Logger. When ``None``, messages are printed instead.
    :return: A list of ensemble scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Get data
    debug('Loading data')
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    # debug(f'Splitting data with seed {args.seed}')
    # if args.separate_test_path:
    #     test_data = get_data(path=args.separate_test_path, args=args, features_path=args.separate_test_features_path, logger=logger)
    # if args.separate_val_path:
    #     val_data = get_data(path=args.separate_val_path, args=args, features_path=args.separate_val_features_path, logger=logger)

    # if args.separate_val_path and args.separate_test_path:
    #     train_data = data
    # elif args.separate_val_path:
    #     train_data, _, test_data = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.0, 0.2), seed=args.seed, args=args, logger=logger)
    # elif args.separate_test_path:
    #     train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger)
    # else:
    #     train_data, val_data, test_data = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

    # if args.save_smiles_splits:
    #     save_smiles_splits(
    #         train_data=train_data,
    #         val_data=val_data,
    #         test_data=test_data,
    #         data_path=args.data_path,
    #         save_dir=args.save_dir
    #     )

    # If this happens, then need to move this logic into the task data loader
    # when it creates the datasets!
    # if args.features_scaling:
    #     features_scaler = train_data.normalize_features(replace_nan_token=0)
    #     val_data.normalize_features(features_scaler)
    #     test_data.normalize_features(features_scaler)
    # else:
    #     features_scaler = None

    # args.train_data_size = len(train_data)

    # debug(f'Total size = {len(data):,} | '
    #       f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    # if args.dataset_type == 'regression':
    #     debug('Fitting scaler')
    #     train_smiles, train_targets = train_data.smiles(), train_data.targets()
    #     scaler = StandardScaler().fit(train_targets)
    #     scaled_targets = scaler.transform(train_targets).tolist()
    #     train_data.set_targets(scaled_targets)
    # else:
    #     scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    # test_smiles, test_targets = test_data.smiles(), test_data.targets()
    # if args.dataset_type == 'multiclass':
    #     sum_test_preds = np.zeros((len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    # else:
    #     sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Set up MetaTaskDataLoaders, which takes care of task splits under the hood
    # Set up task splits into T_tr, T_val, T_test
    assert args.chembl_assay_metadata_pickle_path is not None
    # SECURITY NOTE: pickle.load can execute arbitrary code contained in the
    # file; only point this path at trusted metadata files.
    # The file names are appended to the path by plain string concatenation.
    with open(args.chembl_assay_metadata_pickle_path +
              'chembl_128_assay_type_to_names.pickle', 'rb') as handle:
        chembl_128_assay_type_to_names = pickle.load(handle)
    # NOTE(review): chembl_128_assay_name_to_type is loaded but not used below.
    with open(args.chembl_assay_metadata_pickle_path +
              'chembl_128_assay_name_to_type.pickle', 'rb') as handle:
        chembl_128_assay_name_to_type = pickle.load(handle)

    # Copy GSK implementation of task split
    # We have 5 Task types remaining
    #   ADME (A)
    #   Toxicity (T)
    #   Unassigned (U)
    #   Binding (B)
    #   Functional (F)
    # resulting in 902 tasks.
    #
    # For T_val, randomly select 10 B and F tasks
    # For T_test, select another 10 B and F tasks and allocate all A, T, and U
    # tasks to the test split.
    # For T_train, allocate the remaining B and F tasks.
    T_val_num_BF_tasks = args.meta_split_sizes_BF[0]
    T_test_num_BF_tasks = args.meta_split_sizes_BF[1]
    T_val_idx = T_val_num_BF_tasks
    T_test_idx = T_val_num_BF_tasks + T_test_num_BF_tasks

    chembl_id_to_idx = {
        chembl_id: idx
        for idx, chembl_id in enumerate(args.task_names)
    }

    # Shuffle B and F tasks
    randomized_B_tasks = np.copy(chembl_128_assay_type_to_names['B'])
    np.random.shuffle(randomized_B_tasks)
    randomized_B_task_indices = [
        chembl_id_to_idx[assay] for assay in randomized_B_tasks
    ]

    randomized_F_tasks = np.copy(chembl_128_assay_type_to_names['F'])
    np.random.shuffle(randomized_F_tasks)
    randomized_F_task_indices = [
        chembl_id_to_idx[assay] for assay in randomized_F_tasks
    ]

    # Grab B and F indices for T_val
    T_val_B_task_indices = randomized_B_task_indices[:T_val_idx]
    T_val_F_task_indices = randomized_F_task_indices[:T_val_idx]

    # Grab B and F indices for T_test
    T_test_B_task_indices = randomized_B_task_indices[T_val_idx:T_test_idx]
    T_test_F_task_indices = randomized_F_task_indices[T_val_idx:T_test_idx]

    # Grab all A, T and U indices for T_test
    T_test_A_task_indices = [
        chembl_id_to_idx[assay]
        for assay in chembl_128_assay_type_to_names['A']
    ]
    T_test_T_task_indices = [
        chembl_id_to_idx[assay]
        for assay in chembl_128_assay_type_to_names['T']
    ]
    T_test_U_task_indices = [
        chembl_id_to_idx[assay]
        for assay in chembl_128_assay_type_to_names['U']
    ]

    # Slot remaining BF tasks into T_tr
    T_tr_B_task_indices = randomized_B_task_indices[T_test_idx:]
    T_tr_F_task_indices = randomized_F_task_indices[T_test_idx:]

    T_tr = [0] * len(args.task_names)
    T_val = [0] * len(args.task_names)
    T_test = [0] * len(args.task_names)

    # Now make task bit vectors (1 marks a task that belongs to the split)
    for idx_list in (T_tr_B_task_indices, T_tr_F_task_indices):
        for idx in idx_list:
            T_tr[idx] = 1

    for idx_list in (T_val_B_task_indices, T_val_F_task_indices):
        for idx in idx_list:
            T_val[idx] = 1

    for idx_list in (T_test_B_task_indices, T_test_F_task_indices,
                     T_test_A_task_indices, T_test_T_task_indices,
                     T_test_U_task_indices):
        for idx in idx_list:
            T_test[idx] = 1

    # Random task split for testing
    # task_indices = list(range(len(args.task_names)))
    # np.random.shuffle(task_indices)
    # train_task_split, val_task_split, test_task_split = 0.9, 0, 0.1
    # train_task_cutoff = int(len(task_indices) * train_task_split)
    # train_task_idxs, test_task_idxs = [0] * len(task_indices), [0] * len(task_indices)
    # for idx in task_indices[:train_task_cutoff]:
    #     train_task_idxs[idx] = 1
    # for idx in task_indices[train_task_cutoff:]:
    #     test_task_idxs[idx] = 1

    train_meta_task_data_loader = MetaTaskDataLoader(
        dataset=data,
        tasks=T_tr,
        sizes=args.meta_train_split_sizes,
        args=args,
        logger=logger)

    val_meta_task_data_loader = MetaTaskDataLoader(
        dataset=data,
        tasks=T_val,
        sizes=args.meta_test_split_sizes,
        args=args,
        logger=logger)

    test_meta_task_data_loader = MetaTaskDataLoader(
        dataset=data,
        tasks=T_test,
        sizes=args.meta_test_split_sizes,
        args=args,
        logger=logger)

    # Placeholder iteration over the meta-training tasks; it only prints.
    for meta_train_batch in train_meta_task_data_loader.tasks():
        for train_task in meta_train_batch:
            print('In inner loop')
            continue

    # NOTE(review): the section below reads `scaler`, `features_scaler`,
    # `train_data_loader`, `val_data_loader`, `test_data_loader`,
    # `test_targets` and `sum_test_preds`. Their definitions are commented out
    # above, so unless they exist at module level this raises NameError. They
    # need to be provided (presumably via the meta task data loaders) before
    # this function can run end to end.

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # Fall back to the alternative keyword spelling when `log_dir`
            # is rejected as an unexpected keyword argument.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(
                model=model,
                data_loader=train_data_loader,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer
            )
            # An ExponentialLR scheduler is stepped once per epoch here.
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(
                model=model,
                data_loader=val_data_loader,
                num_tasks=args.num_tasks,
                metric_func=metric_func,
                dataset_type=args.dataset_type,
                scaler=scaler,
                logger=logger
            )

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
                    writer.add_scalar(f'validation_{task_name}_{args.metric}',
                                      val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                device=args.device,
                                logger=logger)

        test_preds = predict(
            model=model,
            data_loader=test_data_loader,
            scaler=scaler
        )
        test_scores = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            logger=logger
        )

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
                writer.add_scalar(f'test_{task_name}_{args.metric}',
                                  test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(
        preds=avg_test_preds,
        targets=test_targets,
        num_tasks=args.num_tasks,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        logger=logger
    )

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names,
                                             ensemble_scores):
            info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')

    return ensemble_scores
def _pdts_top_overlap(top_smiles, train_data):
    """
    Returns the number of duplicated entries in the concatenation of
    ``top_smiles`` and the training SMILES, divided by ``len(top_smiles)``.

    :param top_smiles: Array of reference SMILES (the top 1% by first target).
    :param train_data: Dataset exposing ``smiles()``.
    :return: The overlap ratio logged as "Proportion of top 1%".
    """
    stacked = np.hstack((top_smiles, np.array(train_data.smiles())))
    duplicates = len(stacked) - len(np.unique(stacked))
    return duplicates / len(top_smiles)


def _pdts_select_indices(model, test_data_loader, scaler, args,
                         num_candidates, model_idx, batch_no, logger):
    """
    Runs ``args.samples`` prediction passes over the candidate pool and picks
    the pool indices to move into the training set.

    With ``args.thompson`` every sample contributes its highest-ranked (by the
    first task) candidates not chosen before, until the selection holds more
    than ``sample`` entries (``5 * sample + 4`` when ``args.sgld``). Otherwise
    the predictions are averaged and the 50 best candidates are taken.

    :param model: Current model; replaced by loaded checkpoints when ``args.sgld``.
    :param test_data_loader: Loader over the candidate pool.
    :param scaler: Target scaler passed through to ``predict``.
    :param args: Arguments.
    :param num_candidates: Number of items currently in the candidate pool.
    :param model_idx: Index of the run, used in the SGLD checkpoint path.
    :param batch_no: Acquisition batch number, used in the SGLD checkpoint path.
    :param logger: Logger forwarded to ``load_checkpoint``.
    :return: Tuple ``(model, top_idx)`` with ``top_idx`` sorted in descending order.
    """
    top_idx = []  # need for thom
    sum_test_preds = np.zeros((num_candidates, args.num_tasks))  # need for greedy

    for sample in range(args.samples):

        # draw model from SWAG posterior
        if args.swag:
            model.sample(scale=1.0, cov=args.cov_mat, block=args.block)

        # retrieve sgld sample
        if args.sgld:
            model = load_checkpoint(
                args.save_dir +
                f'/model_{model_idx}/model_{batch_no}/model_{sample}.pt',
                device=args.device,
                logger=logger)

        test_preds = np.array(
            predict(model=model,
                    data_loader=test_data_loader,
                    args=args,
                    scaler=scaler,
                    test_data=True,
                    gp_sample=args.thompson,
                    bbp_sample=True))

        # thompson bit
        if args.thompson:
            base_length = 5 * sample + 4 if args.sgld else sample
            # Rank the candidates once per sample instead of once per pick.
            ranking = np.argsort(-test_preds[:, 0])
            rank = 0
            while len(top_idx) <= base_length:
                candidate = ranking[rank]
                rank += 1
                if candidate not in top_idx:
                    top_idx.append(candidate)

        # add to sum_test_preds
        sum_test_preds += test_preds

        # print
        print('done sample ' + str(sample))

    # final top_idx
    if args.thompson:
        top_idx = np.array(top_idx)
    else:
        sum_test_preds /= args.samples
        top_idx = np.argsort(-sum_test_preds[:, 0])[:50]

    # Descending order so that popping an entry from the pool does not shift
    # the positions of the entries still to be popped.
    return model, -np.sort(-top_idx)


def _pdts_transfer_and_reload(top_idx, train_orig, test_orig, scaler,
                              features_scaler, args, num_workers, cache):
    """
    Moves the selected candidates from the pool to the training list and
    rebuilds the standardized datasets and their data loaders.

    :param top_idx: Pool indices to move, in descending order.
    :param train_orig: List of training datapoints (extended in place).
    :param test_orig: List of pool datapoints (shrunk in place).
    :param scaler: Target scaler fitted on the initial training set.
    :param features_scaler: Features scaler fitted on the initial training set.
    :param args: Arguments; ``train_data_size`` is updated in place.
    :param num_workers: Number of loader workers.
    :param cache: Whether the loaders cache.
    :return: Tuple ``(train_data, train_data_loader, test_data_loader)``.
    """
    for idx in top_idx:
        train_orig.append(test_orig.pop(idx))

    # Work on deep copies so the *_orig lists keep unscaled values.
    train_data = copy.deepcopy(MoleculeDataset(train_orig))
    test_data = copy.deepcopy(MoleculeDataset(test_orig))
    args.train_data_size = len(train_data)
    print(args.train_data_size)

    ### standardise features (train and test; using original features_scaler)
    train_data.normalize_features(features_scaler)
    test_data.normalize_features(features_scaler)

    ### standardise targets (train only; using original scaler)
    train_targets = train_data.targets()
    scaled_targets_tr = scaler.transform(train_targets).tolist()
    train_data.set_targets(scaled_targets_tr)

    ### create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers,
                                          cache=cache)
    return train_data, train_data_loader, test_data_loader


def pdts(args: TrainArgs, model_idx):
    """
    preliminary experiment with PDTS (approximate BO)
    we use a data set size of 50k and run until we have trained with 15k data points
    our batch size is 50
    we initialise with 1000 data points

    The data are split into an initial training set and a candidate pool.
    After an initial fit, ``args.pdts_batches`` acquisition rounds follow:
    each round logs the overlap between the training SMILES and the top-1%
    SMILES, retrains, selects candidates from the pool and moves them into
    the training set. The overlap scores are saved to
    ``args.results_dir/ptds_{model_idx}``.

    :param args: Arguments. Mutated in place (``task_names``, ``num_tasks``,
                 ``features_size``, ``seed``, ``train_data_size``).
    :param model_idx: Index selecting the data seed, pytorch seed and
                      directory names of this run.
    """

    ######## set up all logging ########
    logger = None

    # make save_dir
    save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
    makedirs(save_dir)

    # make results_dir
    results_dir = args.results_dir
    makedirs(results_dir)

    # initialise wandb
    #os.environ['WANDB_MODE'] = 'dryrun'
    wandb.init(name=args.wandb_name + '_' + str(model_idx),
               project=args.wandb_proj,
               reinit=True)
    #print('WANDB directory is:')
    #print(wandb.run.dir)
    ####################################

    ########## get data
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()

    ########## SMILES of top 1%
    # Ranked by the first target column, largest first.
    top1p = np.array(MoleculeDataset(data).targets())
    top1p_idx = np.argsort(-top1p[:, 0])[:int(args.max_data_size * 0.01)]
    SMILES = np.array(MoleculeDataset(data).smiles())[top1p_idx]

    ########## initial data splits
    args.seed = args.data_seeds[model_idx]
    data.shuffle(seed=args.seed)
    sizes = args.split_sizes
    train_size = int(sizes[0] * len(data))
    train_orig = data[:train_size]
    test_orig = data[train_size:]
    train_data, test_data = copy.deepcopy(
        MoleculeDataset(train_orig)), copy.deepcopy(MoleculeDataset(test_orig))

    args.train_data_size = len(train_data)

    ########## standardising
    # features (train and test)
    features_scaler = train_data.normalize_features(replace_nan_token=0)
    test_data.normalize_features(features_scaler)

    # targets (train)
    train_targets = train_data.targets()
    scaler = StandardScaler().fit(train_targets)
    scaled_targets = scaler.transform(train_targets).tolist()
    train_data.set_targets(scaled_targets)

    ########## loss, metric functions
    loss_func = neg_log_like
    # NOTE(review): metric_func is not used further in this function.
    metric_func = get_metric_func(metric=args.metric)

    ########## data loaders
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers,
                                          cache=cache)

    ########## instantiating model, optimiser, scheduler (MAP)

    # set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seeds[model_idx])

    # build model
    print(f'Building model {model_idx}')
    model = MoleculeModel(args)
    print(model)
    print(f'Number of parameters = {param_count(model):,}')
    if args.cuda:
        print('Moving model to cuda')
    model = model.to(args.device)

    # optimizer
    optimizer = Adam([{
        'params': model.encoder.parameters()
    }, {
        'params': model.ffn.parameters()
    }, {
        'params': model.log_noise,
        'weight_decay': 0
    }],
                     lr=args.lr,
                     weight_decay=args.weight_decay)

    # learning rate scheduler
    scheduler = scheduler_const([args.lr])

    ####################################################################
    # FIRST THOMPSON ITERATION

    ### scores array
    ptds_scores = np.ones(args.pdts_batches + 1)
    batch_no = 0

    ### fill for batch 0
    prop = _pdts_top_overlap(SMILES, train_data)
    ptds_scores[batch_no] = prop
    wandb.log({
        "Proportion of top 1%": prop,
        "batch_no": batch_no
    },
              commit=False)

    ### train MAP posterior
    gp_switch = False
    likelihood = None
    bbp_switch = None
    n_iter = 0
    for epoch in range(args.epochs_init_map):
        n_iter = train(model=model,
                       data_loader=train_data_loader,
                       loss_func=loss_func,
                       optimizer=optimizer,
                       scheduler=scheduler,
                       args=args,
                       n_iter=n_iter,
                       bbp_switch=bbp_switch)
    # save to save_dir
    #if epoch == args.epochs_init_map - 1:
    #save_checkpoint(os.path.join(save_dir, f'model_{batch_no}.pt'), model, scaler, features_scaler, args)

    # if X load from checkpoint path
    if args.bbp or args.gp or args.swag or args.sgld:
        model = load_checkpoint(args.checkpoint_path +
                                f'/model_{model_idx}/model_{batch_no}.pt',
                                device=args.device,
                                logger=None)

    ########## BBP
    if args.bbp:
        model_bbp = MoleculeModelBBP(
            args)  # instantiate with bayesian linear layers
        for (_, param_bbp), (_, param_pre) in zip(model_bbp.named_parameters(),
                                                  model.named_parameters()):
            param_bbp.data = copy.deepcopy(
                param_pre.data.T)  # copy over parameters
        # instantiate rhos
        for layer in model_bbp.children():
            if isinstance(layer, BayesLinear):
                layer.init_rho(args.rho_min_bbp, args.rho_max_bbp)
        for layer in model_bbp.encoder.encoder.children():
            if isinstance(layer, BayesLinear):
                layer.init_rho(args.rho_min_bbp, args.rho_max_bbp)
        model = model_bbp  # name back
        # move to cuda
        if args.cuda:
            print('Moving bbp model to cuda')
        model = model.to(args.device)

        # optimiser and scheduler
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        scheduler = scheduler_const([args.lr])

        bbp_switch = 2
        n_iter = 0
        for epoch in range(args.epochs_init):
            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           bbp_switch=bbp_switch)

    ########## GP
    if args.gp:
        # feature_extractor
        model.featurizer = True
        feature_extractor = model

        # inducing points
        inducing_points = initial_inducing_points(train_data_loader,
                                                  feature_extractor, args)

        # GP layer
        gp_layer = GPLayer(inducing_points, args.num_tasks)

        # full DKL model
        model = copy.deepcopy(DKLMoleculeModel(feature_extractor, gp_layer))

        # likelihood (rank 0 restricts to diagonal matrix); the number of
        # tasks follows the GP layer instead of a fixed 12.
        likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
            num_tasks=args.num_tasks, rank=0)

        # model and likelihood to CUDA
        if args.cuda:
            model.cuda()
            likelihood.cuda()

        # loss object
        loss_func = gpytorch.mlls.VariationalELBO(
            likelihood, model.gp_layer, num_data=args.train_data_size)

        # optimiser and scheduler
        params_list = [
            {
                'params': model.feature_extractor.parameters(),
                'weight_decay': args.weight_decay_gp
            },
            {
                'params': model.gp_layer.hyperparameters()
            },
            {
                'params': model.gp_layer.variational_parameters()
            },
            {
                'params': likelihood.parameters()
            },
        ]
        optimizer = torch.optim.Adam(params_list, lr=args.lr)
        scheduler = scheduler_const([args.lr])

        gp_switch = True
        n_iter = 0
        for epoch in range(args.epochs_init):
            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           gp_switch=gp_switch,
                           likelihood=likelihood)

    ########## SWAG
    if args.swag:
        model_core = copy.deepcopy(model)
        model = train_swag_pdts(model_core, train_data_loader, loss_func,
                                scaler, features_scaler, args, save_dir,
                                batch_no)

    ########## SGLD
    if args.sgld:
        model = train_sgld_pdts(model, train_data_loader, loss_func, scaler,
                                features_scaler, args, save_dir, batch_no)

    ### find top_idx
    model, top_idx = _pdts_select_indices(model, test_data_loader, scaler,
                                          args, len(test_orig), model_idx,
                                          batch_no, logger)

    ### transfer from test to train
    train_data, train_data_loader, test_data_loader = _pdts_transfer_and_reload(
        top_idx, train_orig, test_orig, scaler, features_scaler, args,
        num_workers, cache)
    if args.gp:
        # The ELBO depends on the training-set size, so rebuild it.
        loss_func = gpytorch.mlls.VariationalELBO(
            likelihood, model.gp_layer, num_data=args.train_data_size)

    ##################################
    ########## thompson sampling loop
    ##################################
    for batch_no in range(1, args.pdts_batches + 1):

        ### fill in ptds_scores
        prop = _pdts_top_overlap(SMILES, train_data)
        ptds_scores[batch_no] = prop
        wandb.log({
            "Proportion of top 1%": prop,
            "batch_no": batch_no
        },
                  commit=False)

        ### train posterior
        n_iter = 0
        for epoch in range(args.epochs):
            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           gp_switch=gp_switch,
                           likelihood=likelihood,
                           bbp_switch=bbp_switch)
        # save to save_dir
        #if epoch == args.epochs - 1:
        #save_checkpoint(os.path.join(save_dir, f'model_{batch_no}.pt'), model, scaler, features_scaler, args)

        ########## SWAG
        if args.swag:
            # if swag, load checkpoint
            model_core = load_checkpoint(
                args.checkpoint_path +
                f'/model_{model_idx}/model_{batch_no}.pt',
                device=args.device,
                logger=None)
            model = train_swag_pdts(model_core, train_data_loader, loss_func,
                                    scaler, features_scaler, args, save_dir,
                                    batch_no)

        ########## SGLD
        if args.sgld:
            model = train_sgld_pdts(model, train_data_loader, loss_func,
                                    scaler, features_scaler, args, save_dir,
                                    batch_no)

        ### find top_idx
        model, top_idx = _pdts_select_indices(model, test_data_loader, scaler,
                                              args, len(test_orig), model_idx,
                                              batch_no, logger)

        ### transfer from test to train
        train_data, train_data_loader, test_data_loader = _pdts_transfer_and_reload(
            top_idx, train_orig, test_orig, scaler, features_scaler, args,
            num_workers, cache)
        if args.gp:
            loss_func = gpytorch.mlls.VariationalELBO(
                likelihood, model.gp_layer, num_data=args.train_data_size)

    # save scores
    np.savez(os.path.join(results_dir, f'ptds_{model_idx}'), ptds_scores)
def molecule_fingerprint(
        args: FingerprintArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and trained models and uses the models to encode fingerprint vectors for the data.

    If SMILES are provided, fingerprints are computed for them. Otherwise the data at
    :code:`args.test_path` is loaded. The fingerprints are also written as a CSV file to
    :code:`args.preds_path`, with the text :code:`'Invalid SMILES'` in every fingerprint
    column of rows whose molecules could not be parsed.

    :param args: A :class:`FingerprintArgs` object containing arguments for loading data
                 and models and for encoding fingerprints.
    :param smiles: List of list of SMILES to encode.
    :return: A numpy array of shape :code:`(num_valid_datapoints, fingerprint_size, num_models)`.
             If no datapoint is valid, a list with one :code:`None` per input datapoint is
             returned instead and nothing is written.
    :raises ValueError: If the requested fingerprint type is unknown or is not available
                        for the loaded model configuration.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments.
    # Input features only need to be supplied when using the FFN latent representation
    # (and the model calls for them), so source validation is skipped for 'MPN'.
    validate_feature_sources = args.fingerprint_type != 'MPN'
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=validate_feature_sources)
    args: Union[FingerprintArgs, TrainArgs]

    # Set explicit H option and reaction option
    reset_featurization_parameters()
    set_explicit_h(train_args.explicit_h)
    set_adding_hs(args.adding_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    # Maps a position in full_data to its position in the filtered (valid-only) dataset.
    full_to_valid_indices = {}
    valid_index = 0
    for full_index, datapoint in enumerate(full_data):
        if all(mol is not None for mol in datapoint.mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices)])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Set fingerprint size
    if args.fingerprint_type == 'MPN':
        total_fp_size = args.hidden_size * args.number_of_molecules
        if args.features_only:
            raise ValueError(
                'With features_only models, there is no latent MPN representation. Use last_FFN fingerprint type instead.'
            )
    elif args.fingerprint_type == 'last_FFN':
        if args.ffn_num_layers != 1:
            total_fp_size = args.ffn_hidden_size
        else:
            raise ValueError(
                'With a ffn_num_layers of 1, there is no latent FFN representation. Use MPN fingerprint type instead.'
            )
    else:
        raise ValueError(
            f'Fingerprint type {args.fingerprint_type} not supported')

    num_models = len(args.checkpoint_paths)
    all_fingerprints = np.zeros((len(test_data), total_fp_size, num_models))

    # Load model
    print(
        f'Encoding smiles into a fingerprint vector from {len(args.checkpoint_paths)} models.'
    )

    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=num_models)):
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
            args.checkpoint_paths[index])

        # Normalize features; the dataset is reset first so that each model's own
        # scalers are applied to the raw features rather than stacked on top of the
        # previous model's scaling.
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make fingerprints
        model_fp = model_fingerprint(model=model,
                                     data_loader=test_data_loader,
                                     fingerprint_type=args.fingerprint_type)
        if args.fingerprint_type == 'MPN' and (
                args.features_path is not None or args.features_generator):
            # truncate any features from MPN fingerprint
            model_fp = np.array(model_fp)[:, :total_fp_size]
        all_fingerprints[:, :, index] = model_fp

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(all_fingerprints)
    makedirs(args.preds_path, isfile=True)

    # Set column names. With several models the order is fingerprint-position major,
    # model minor, which matches the row-major flattening of one datapoint's
    # (fingerprint_size, num_models) slice below.
    if num_models == 1:
        fingerprint_columns = [f'fp_{j}' for j in range(total_fp_size)]
    else:
        fingerprint_columns = [
            f'fp_{j}_model_{i}'
            for j in range(total_fp_size)
            for i in range(num_models)
        ]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        if valid_index is not None:
            preds = all_fingerprints[valid_index].reshape(
                (num_models * total_fp_size))
        else:
            preds = ['Invalid SMILES'] * num_models * total_fp_size

        for column, value in zip(fingerprint_columns, preds):
            datapoint.row[column] = value

    # Write predictions.
    # newline='' is required by the csv module; without it rows are separated by
    # blank lines on platforms that translate '\n' on write.
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns +
                                fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return all_fingerprints
def train_gp(
        model,
        train_data,
        val_data,
        num_workers,
        cache,
        metric_func,
        scaler,
        features_scaler,
        args,
        save_dir):
    """
    Wraps a model in a deep-kernel GP (DKLMoleculeModel), trains it and returns the best checkpoint.

    The incoming model is used as feature extractor; a GP layer is built on inducing
    points obtained from the training loader, and both are optimised jointly with a
    variational ELBO. After each epoch the model is evaluated on the validation data, and
    the model and likelihood of the best-scoring epoch are kept.

    :param model: Model used as the feature extractor. Its ``featurizer`` attribute is set to True.
    :param train_data: Training dataset.
    :param val_data: Validation dataset.
    :param num_workers: Number of workers for the data loaders.
    :param cache: Cache flag forwarded to the data loaders.
    :param metric_func: Metric function used for validation.
    :param scaler: Target scaler, forwarded to evaluation and stored in the checkpoint.
    :param features_scaler: Features scaler stored in the checkpoint.
    :param args: Arguments (GP batch size, learning rates, epochs, number of tasks, ...).
    :param save_dir: Directory in which ``DKN_model.pt`` is saved.
    :return: A tuple of the model reloaded from the best checkpoint and a deep copy of the
             likelihood taken at that same epoch.
    """
    # create data loaders for gp (allows different batch size)
    train_data_loader = MoleculeDataLoader(
        dataset=train_data,
        batch_size=args.batch_size_gp,
        num_workers=num_workers,
        cache=cache,
        class_balance=args.class_balance,
        shuffle=True,
        seed=args.seed
    )
    val_data_loader = MoleculeDataLoader(
        dataset=val_data,
        batch_size=args.batch_size_gp,
        num_workers=num_workers,
        cache=cache
    )

    # feature_extractor (note: this mutates the model object passed in by the caller)
    model.featurizer = True
    feature_extractor = model

    # inducing points
    inducing_points = initial_inducing_points(
        train_data_loader,
        feature_extractor,
        args
    )

    # GP layer
    gp_layer = GPLayer(inducing_points, args.num_tasks)

    # full DKL model
    model = copy.deepcopy(DKLMoleculeModel(feature_extractor, gp_layer))

    # likelihood
    # rank 0 restricts to diagonal matrix
    # The number of tasks follows args.num_tasks so it always agrees with the GP layer
    # above (it was previously fixed at 12).
    likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
        num_tasks=args.num_tasks, rank=0)

    # model and likelihood to CUDA
    if args.cuda:
        model.cuda()
        likelihood.cuda()

    # loss object
    mll = gpytorch.mlls.VariationalELBO(
        likelihood, model.gp_layer, num_data=args.train_data_size)

    # optimizer: weight decay is applied to the feature extractor only
    params_list = [
        {'params': model.feature_extractor.parameters(),
         'weight_decay': args.weight_decay_gp},
        {'params': model.gp_layer.hyperparameters()},
        {'params': model.gp_layer.variational_parameters()},
        {'params': likelihood.parameters()},
    ]
    optimizer = torch.optim.Adam(params_list, lr=args.init_lr_gp)

    # scheduler: one entry per parameter group
    num_params = len(params_list)
    scheduler = NoamLR(
        optimizer=optimizer,
        warmup_epochs=[args.warmup_epochs_gp] * num_params,
        total_epochs=[args.noam_epochs_gp] * num_params,
        steps_per_epoch=args.train_data_size // args.batch_size_gp,
        init_lr=[args.init_lr_gp] * num_params,
        max_lr=[args.max_lr_gp] * num_params,
        final_lr=[args.final_lr_gp] * num_params)

    print("----------GP training----------")

    checkpoint_path = os.path.join(save_dir, 'DKN_model.pt')

    # Ensure a checkpoint and a likelihood exist even if no epoch improves on the
    # initial score (e.g. zero GP epochs); otherwise the reload and the return below
    # would fail on a missing file / unbound name.
    save_checkpoint(checkpoint_path, model, scaler, features_scaler, args)
    best_likelihood = copy.deepcopy(likelihood)

    # training loop
    best_score = float('inf') if args.minimize_score else -float('inf')
    best_epoch, n_iter = 0, 0
    for epoch in range(args.epochs_gp):
        print(f'GP epoch {epoch}')

        # once the Noam schedule is over, switch to a constant learning rate
        if epoch == args.noam_epochs_gp:
            scheduler = scheduler_const([args.final_lr_gp])

        n_iter = train(
            model=model,
            data_loader=train_data_loader,
            loss_func=mll,
            optimizer=optimizer,
            scheduler=scheduler,
            args=args,
            n_iter=n_iter,
            gp_switch=True,
            likelihood=likelihood
        )

        val_scores = evaluate(
            model=model,
            data_loader=val_data_loader,
            args=args,
            num_tasks=args.num_tasks,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            scaler=scaler
        )

        # Average validation score
        avg_val_score = np.nanmean(val_scores)
        print(f'Validation {args.metric} = {avg_val_score:.6f}')
        wandb.log({"Validation MAE": avg_val_score})

        # Save model AND LIKELIHOOD checkpoint if improved validation score
        if args.minimize_score and avg_val_score < best_score or \
                not args.minimize_score and avg_val_score > best_score:
            best_score, best_epoch = avg_val_score, epoch
            save_checkpoint(checkpoint_path, model, scaler, features_scaler, args)
            best_likelihood = copy.deepcopy(likelihood)

    # load model with best validation score
    # NOTE: TEMPLATE MUST BE NEWLY INSTANTIATED MODEL
    print(f'Loading model with best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
    model = load_checkpoint(
        checkpoint_path,
        device=args.device,
        logger=None,
        template=DKLMoleculeModel(MoleculeModel(args, featurizer=True), gp_layer))

    return model, best_likelihood
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains an ensemble of models and returns test scores of the ensemble.

    For every ensemble member the checkpoint with the best validation score is kept and
    evaluated on the test set; the averaged ensemble predictions are then scored and a
    set of diagnostic plots is written to the current working directory.

    :param args: Arguments.
    :param logger: Logger. When omitted, messages are printed.
    :return: A list of ensemble scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path, args=args,
                             features_path=args.separate_test_features_path, logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path, args=args,
                            features_path=args.separate_val_features_path, logger=logger)

    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        # Validation data is supplied separately, so the held-out 20% must land in the
        # third (test) slot. The sizes were previously (0.8, 0.2, 0.0), which left the
        # test set empty.
        train_data, _, test_data = split_data(data=data, split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2), seed=args.seed,
                                              args=args, logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data, split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0), seed=args.seed,
                                             args=args, logger=logger)
    else:
        train_data, val_data, test_data = split_data(data=data, split_type=args.split_type,
                                                     sizes=args.split_sizes, seed=args.seed,
                                                     args=args, logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(f'{args.task_names[i]} '
                  f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}')

    if args.save_smiles_splits:
        # newline='' is what the csv module expects for both reading and writing.
        with open(args.data_path, 'r', newline='') as f:
            reader = csv.reader(f)
            header = next(reader)

            # The SMILES string is taken from the first column of each row.
            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = line[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'), (test_data, 'test')]:
            dataset_smiles = dataset.smiles()

            with open(os.path.join(args.save_dir, name + '_smiles.csv'), 'w', newline='') as f:
                csv_writer = csv.writer(f)
                csv_writer.writerow(['smiles'])
                for smiles in dataset_smiles:
                    csv_writer.writerow([smiles])

            with open(os.path.join(args.save_dir, name + '_full.csv'), 'w', newline='') as f:
                csv_writer = csv.writer(f)
                csv_writer.writerow(header)
                for smiles in dataset_smiles:
                    csv_writer.writerow(lines_by_smiles[smiles])

            # Sort once instead of re-sorting after every append.
            all_split_indices.append(
                sorted(indices_by_smiles[smiles] for smiles in dataset_smiles))

        with open(os.path.join(args.save_dir, 'split_indices.pckl'), 'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(f'Total size = {len(data):,} | '
          f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}')

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_targets = train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Set up val set evaluation
    val_smiles, val_targets = val_data.smiles(), val_data.targets()
    if args.dataset_type == 'multiclass':
        sum_val_preds = np.zeros((len(val_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_val_preds = np.zeros((len(val_smiles), args.num_tasks))

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}')
            model = load_checkpoint(args.checkpoint_paths[model_idx], current_args=args, logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(
                model=model,
                data=train_data,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer
            )
            # ExponentialLR is stepped once per epoch here
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(
                model=model,
                data=val_data,
                num_tasks=args.num_tasks,
                metric_func=metric_func,
                batch_size=args.batch_size,
                dataset_type=args.dataset_type,
                scaler=scaler,
                logger=logger
            )

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score, n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(f'Validation {task_name} {args.metric} = {val_score:.6f}')
                    writer.add_scalar(f'validation_{task_name}_{args.metric}', val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}')
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda, logger=logger)

        # TODO: Perhaps change code here in order to analyze the model on the trained data
        val_preds = predict(
            model=model,
            data=val_data,
            batch_size=args.batch_size,
            scaler=scaler
        )
        test_preds = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler
        )
        test_scores = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            logger=logger
        )

        # Empty prediction lists cannot be broadcast onto the accumulators, so skip them
        if len(val_preds) != 0:
            sum_val_preds += np.array(val_preds)
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}')
                writer.add_scalar(f'test_{task_name}_{args.metric}', test_score, n_iter)

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()
    avg_val_preds = (sum_val_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(
        preds=avg_test_preds,
        targets=test_targets,
        num_tasks=args.num_tasks,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        logger=logger
    )

    print("Test Prediction Shape:- ", np.array(avg_test_preds).shape)

    # Flatten everything into single-row arrays for the diagnostics below
    avg_test_preds = np.array(avg_test_preds).reshape(1, -1)
    test_targets = np.array(test_targets).reshape(1, -1)
    avg_val_preds = np.array(avg_val_preds).reshape(1, -1)
    # Built from the validation targets (it was previously built from test_targets)
    val_targets = np.array(val_targets).reshape(1, -1)

    # Fraction of test predictions that fall below their target
    smaller_count = np.sum(avg_test_preds < test_targets)
    smaller_frac = smaller_count / (avg_test_preds.shape[1])
    print("Smaller_Fraction: ", smaller_frac)

    # Predictions vs. targets, once per marker style, each with the y = x reference line
    for marker in ('ro', 'yo', 'rx', 'yx'):
        plt.plot(avg_test_preds, test_targets, marker)
        x = np.linspace(-7, 3, 100)
        y = x
        plt.plot(x, y, '-g')
        plt.xlabel("Test Predictions")
        plt.ylabel("Test Targets")
        plt.title("Prediction Distribution")
        plt.savefig(f"Prediction_Distriution_{marker}.png")
        plt.clf()

    # Prediction error vs. target, with a zero-error reference line
    x = np.linspace(-7, 3, 100)
    y = x - x
    plt.plot(x, y, '-g')
    plt.plot(test_targets, avg_test_preds - test_targets, 'rx')
    plt.xlabel("Test Targets")
    plt.ylabel("Test Errors")
    plt.title("Prediction Errors")
    plt.savefig("Prediction_Errors.png")
    plt.clf()

    # Average ensemble score (logged to the writer of the last ensemble member)
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score, 0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}')

    return ensemble_scores
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on the data at :code:`args.test_path`.
    The predictions are also written as a CSV file to :code:`args.preds_path`, with the
    text :code:`'Invalid SMILES'` in the prediction columns of rows that could not be parsed.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions for the valid datapoints. If no
             datapoint is valid, a list with one :code:`None` per input datapoint is
             returned instead and nothing is written.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    # Set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=not args.drop_extra_columns)

    print('Validating SMILES')
    # Maps a position in full_data to its position in the filtered (valid-only) dataset.
    full_to_valid_indices = {}
    valid_index = 0
    for full_index, datapoint in enumerate(full_data):
        if all(mol is not None for mol in datapoint.mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices)])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Partial results for variance robust calculation.
    # NOTE(review): this buffer has no class axis, so it looks incompatible with
    # multiclass predictions - confirm before combining the two options.
    if args.ensemble_variance:
        all_preds = np.zeros(
            (len(test_data), num_tasks, len(args.checkpoint_paths)))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
            checkpoint_path)

        # Normalize features; the dataset is reset first so that each model's own
        # scalers are applied to the raw features.
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)
        if args.ensemble_variance:
            all_preds[:, :, index] = model_preds

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    if args.ensemble_variance:
        # Variance across the ensemble members, per datapoint and task
        all_epi_uncs = np.var(all_preds, axis=2)
        all_epi_uncs = all_epi_uncs.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    if args.ensemble_variance:
        assert len(test_data) == len(all_epi_uncs)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    is_multiclass = args.dataset_type == 'multiclass'
    if is_multiclass:
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        if valid_index is not None:
            preds = avg_preds[valid_index]
            if is_multiclass:
                # One column exists per (task, class) pair, so the nested per-task
                # probability lists are flattened in the same task-major order.
                # Only the CSV row is affected; the returned avg_preds stays nested.
                preds = [prob for task_probs in preds for prob in task_probs]
        else:
            preds = ['Invalid SMILES'] * len(task_names)

        if args.ensemble_variance:
            if valid_index is not None:
                epi_uncs = all_epi_uncs[valid_index]
            else:
                epi_uncs = ['Invalid SMILES'] * len(task_names)

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()

            smiles_columns = args.smiles_columns

            for column, smiles_string in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles_string

        # Add predictions columns
        if args.ensemble_variance:
            for pred_name, pred, epi_unc in zip(task_names, preds, epi_uncs):
                datapoint.row[pred_name] = pred
                datapoint.row[pred_name + '_epi_unc'] = epi_unc
        else:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

    # Save.
    # newline='' is required by the csv module; without it rows are separated by
    # blank lines on platforms that translate '\n' on write.
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions and writes them, together with per-molecule debug values, to a CSV file.

    If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on the data at args.test_path.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list with one entry per input datapoint: the list of target predictions,
             or None where the SMILES could not be parsed.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments (values already present on args win)
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        if args.write_true_val:
            test_data, true_vals = get_data(path=args.test_path, args=args,
                                            use_compound_names=args.use_compound_names,
                                            skip_invalid_smiles=False)
        else:
            test_data = get_data(path=args.test_path, args=args,
                                 use_compound_names=args.use_compound_names,
                                 skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        # Taken from the unfiltered data: rows below are written per full-data index,
        # so names from the filtered dataset would shift after the first invalid SMILES.
        compound_names = full_data.compound_names()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        # check_fp holds the extra per-molecule values written out below; only the
        # values of the last model in the ensemble are kept.
        model_preds, check_fp = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles, and remember where each full-data index sits in
    # the filtered dataset so values computed on the filtered data can be looked up.
    full_preds = [None] * len(full_data)
    valid_position = {}
    for position, full_index in enumerate(valid_indices):
        full_preds[full_index] = avg_preds[position]
        valid_position[full_index] = position
    avg_preds = full_preds
    test_smiles = full_data.smiles()

    is_multiclass = args.dataset_type == 'multiclass'

    # Write predictions.
    # newline='' is required by the csv module; without it rows are separated by
    # blank lines on platforms that translate '\n' on write.
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.writer(f)

        header = []

        if args.use_compound_names:
            header.append('compound_names')

        header.append('smiles')

        if is_multiclass:
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            # NOTE(review): only the first task is named in the header although every
            # task's prediction is written per row - confirm this is single-task only.
            if args.write_true_val:
                header.append('true_' + args.task_names[0])
            header.append('preds_' + args.task_names[0])
            header.append('atomic_d0')  # check depth
            header.append('atomic_d1')  # check depth
            header.append('atomic_d2')  # check depth
            header.append('atomic_final')  # check depth
            header.append('mol')  # check depth
        writer.writerow(header)

        for i, preds in enumerate(avg_preds):
            row = []

            if args.use_compound_names:
                row.append(compound_names[i])

            row.append(test_smiles[i])
            if args.write_true_val:
                row.append(true_vals[i])

            if preds is not None:
                if is_multiclass:
                    for task_probs in preds:
                        row.extend(task_probs)
                else:
                    row.extend(preds)
                    # check_fp was computed on the filtered dataset, so it is indexed
                    # by the molecule's position there, not by the full-data index.
                    fp_index = valid_position[i]
                    for component in range(5):  # atomic d0, d1, d2, final, mol
                        row.append(check_fp[component][fp_index])
            else:
                if is_multiclass:
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)

            writer.writerow(row)

    return avg_preds
def new_noise(args: TrainArgs, logger: Logger = None) -> List[float]:
    """
    Fits a t-distribution to the training residuals of every ensemble member and saves the parameters.

    For each ensemble member the trained model selected by :code:`args.method` is loaded,
    predictions are made on the training data, and for every task the residuals
    (prediction minus target) are fitted with :code:`stats.t.fit` with the location fixed
    at zero. The fitted parameters are saved once per sample index as
    :code:`tstats_{sample_idx}.npz` under :code:`args.results_dir/model_{model_idx}`.

    :param args: Arguments.
    :param logger: Logger, forwarded to the data and checkpoint loading helpers.
    :return: Nothing is returned explicitly; results are written to disk.
    :raises ValueError: If :code:`args.method` is not one of the supported methods.
    """
    debug = print

    # Get data
    args.task_names = args.target_columns or get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()

    # Split data (only the training part is used here)
    debug(f'Splitting data with seed {args.seed}')
    train_data, _, _ = split_data(data=data, split_type=args.split_type,
                                  sizes=args.split_sizes, seed=args.seed,
                                  args=args, logger=logger)

    if args.features_scaling:
        train_data.normalize_features(replace_nan_token=0)

    args.train_data_size = len(train_data)

    # Keep the unscaled targets for the residuals; the dataset itself is given scaled
    # targets (regression only) by subtracting mean and dividing standard deviation.
    train_targets = train_data.targets()
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None
    train_targets = np.array(train_targets)

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Create data loader
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache)

    ###########################################
    ########## Outer loop over ensemble members
    ###########################################

    for model_idx in range(args.ensemble_start_idx, args.ensemble_start_idx + args.ensemble_size):

        # load the model
        if args.method in ('map', 'swag', 'sgld', 'dropR', 'dropA'):
            model = load_checkpoint(args.checkpoint_path + f'/model_{model_idx}/model.pt',
                                    device=args.device, logger=logger)

        elif args.method == 'gp':
            # A fresh featurizer and GP layer are built to serve as loading template
            args.num_inducing_points = 1200
            fake_model = MoleculeModel(args)
            fake_model.featurizer = True
            feature_extractor = fake_model
            inducing_points = initial_inducing_points(train_data_loader, feature_extractor, args)
            gp_layer = GPLayer(inducing_points, args.num_tasks)
            model = load_checkpoint(
                args.checkpoint_path + f'/model_{model_idx}/DKN_model.pt',
                device=args.device,
                logger=None,
                template=DKLMoleculeModel(MoleculeModel(args, featurizer=True), gp_layer))

        elif args.method == 'bbp':
            template = MoleculeModelBBP(args)
            for layer in template.children():
                if isinstance(layer, BayesLinear):
                    layer.init_rho(args.rho_min_bbp, args.rho_max_bbp)
            for layer in template.encoder.encoder.children():
                if isinstance(layer, BayesLinear):
                    layer.init_rho(args.rho_min_bbp, args.rho_max_bbp)
            model = load_checkpoint(args.checkpoint_path + f'/model_{model_idx}/model_bbp.pt',
                                    device=args.device, logger=None, template=template)

        elif args.method == 'dun':
            args.prior_sig_dun = 0.05
            args.depth_min = 1
            args.depth_max = 5
            args.rho_min_dun = -5.5
            args.rho_max_dun = -5
            args.log_cat_init = 0
            template = MoleculeModelDUN(args)
            for layer in template.children():
                if isinstance(layer, BayesLinear):
                    layer.init_rho(args.rho_min_dun, args.rho_max_dun)
            for layer in template.encoder.encoder.children():
                if isinstance(layer, BayesLinear):
                    layer.init_rho(args.rho_min_dun, args.rho_max_dun)
            template.create_log_cat(args)
            model = load_checkpoint(args.checkpoint_path + f'/model_{model_idx}/model_dun.pt',
                                    device=args.device, logger=None, template=template)

        else:
            # Without this the code below would fail on an undefined model
            raise ValueError(f'Unsupported method "{args.method}"')

        # make results_dir
        results_dir = os.path.join(args.results_dir, f'model_{model_idx}')
        makedirs(results_dir)

        # train_preds
        train_preds = predict(model=model,
                              data_loader=train_data_loader,
                              args=args,
                              scaler=scaler,
                              test_data=False,
                              bbp_sample=False)
        train_preds = np.array(train_preds)

        # compute tstats: one row of three fitted parameters per task. The number of
        # tasks follows args.num_tasks (it was previously fixed at 12).
        tstats = np.ones((args.num_tasks, 3))
        for task in range(args.num_tasks):
            resid = train_preds[:, task] - train_targets[:, task]
            tstats[task] = np.array(stats.t.fit(resid, floc=0.0))

        ##################################
        ########## Inner loop over samples
        ##################################

        # The same fitted parameters are saved under every sample index
        for sample_idx in range(args.samples):

            # save down
            np.savez(os.path.join(results_dir, f'tstats_{sample_idx}'), tstats)

            print('done one')
def run_training(args: TrainArgs,
                 data: MoleculeDataset,
                 logger: Logger = None) -> Dict[str, List[float]]:
    """
    Loads data, trains a Chemprop model, and returns test scores for the model checkpoint with the highest validation score.

    For each ensemble member a model is built (or loaded from :code:`args.checkpoint_paths`), trained for
    :code:`args.epochs` epochs, checkpointed whenever the averaged validation score for :code:`args.metric`
    improves, and finally reloaded and evaluated on the test split. Test predictions are averaged over the
    ensemble and scored again.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output. When omitted, messages are printed.
    :return: A dictionary mapping each metric in :code:`args.metrics` to a list of values for each task
             (the ensemble scores on the test split).
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Split data
    debug(f"Splitting data with seed {args.seed}")
    # NOTE: separate validation/test files (args.separate_val_path / args.separate_test_path)
    # are not handled here; all three splits always come from split_data.
    train_data, val_data, test_data = split_data(
        data=data,
        split_type=args.split_type,
        sizes=args.split_sizes,
        seed=args.seed,
        num_folds=args.num_folds,
        args=args,
        logger=logger,
    )

    if args.dataset_type == "classification":
        class_sizes = get_class_sizes(data)
        debug("Class sizes")
        for i, task_class_sizes in enumerate(class_sizes):
            sizes_text = ", ".join(
                f"{cls}: {size * 100:.2f}%"
                for cls, size in enumerate(task_class_sizes))
            debug(f"{args.task_names[i]} {sizes_text}")

    if args.save_smiles_splits:
        save_smiles_splits(
            data_path=args.data_path,
            save_dir=args.save_dir,
            task_names=args.task_names,
            features_path=args.features_path,
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
            smiles_columns=args.smiles_columns,
        )

    # Each scaler is fitted on the training split and then applied to the other two splits
    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    if args.atom_descriptor_scaling and args.atom_descriptors is not None:
        atom_descriptor_scaler = train_data.normalize_features(
            replace_nan_token=0, scale_atom_descriptors=True)
        val_data.normalize_features(atom_descriptor_scaler,
                                    scale_atom_descriptors=True)
        test_data.normalize_features(atom_descriptor_scaler,
                                     scale_atom_descriptors=True)
    else:
        atom_descriptor_scaler = None

    if args.bond_feature_scaling and args.bond_features_size > 0:
        bond_feature_scaler = train_data.normalize_features(
            replace_nan_token=0, scale_bond_features=True)
        val_data.normalize_features(bond_feature_scaler,
                                    scale_bond_features=True)
        test_data.normalize_features(bond_feature_scaler,
                                     scale_bond_features=True)
    else:
        bond_feature_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f"Total size = {len(data):,} | "
        f"train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}"
    )

    # Initialize scaler and scale training targets by subtracting mean and
    # dividing standard deviation (regression only)
    if args.dataset_type == "regression":
        debug("Fitting scaler")
        scaler = train_data.normalize_targets()
    else:
        scaler = None

    # Get loss function
    loss_func = get_loss_func(args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == "multiclass":
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        set_cache_graph(True)
        num_workers = 0
    else:
        set_cache_graph(False)
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(
        dataset=train_data,
        batch_size=args.batch_size,
        num_workers=num_workers,
        class_balance=args.class_balance,
        shuffle=True,
        seed=args.seed,
    )
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers)

    if args.class_balance:
        debug(
            f"With class_balance, effective train size = {train_data_loader.iter_size:,}"
        )

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f"model_{model_idx}")
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # SummaryWriter implementations differ in the keyword name
            # (log_dir vs logdir); an unsupported keyword raises TypeError.
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f"Loading model {model_idx} from {args.checkpoint_paths[model_idx]}"
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    logger=logger)
        else:
            debug(f"Building model {model_idx}")
            model = MoleculeModel(args)

        debug(model)
        debug(f"Number of parameters = {param_count(model):,}")
        if args.cuda:
            debug("Moving model to cuda")
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(
            os.path.join(save_dir, MODEL_FILE_NAME),
            model,
            scaler,
            features_scaler,
            atom_descriptor_scaler,
            bond_feature_scaler,
            args,
        )

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float("inf") if args.minimize_score else -float("inf")
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f"Epoch {epoch}")

            n_iter = train(
                model=model,
                data_loader=train_data_loader,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                args=args,
                n_iter=n_iter,
                logger=logger,
                writer=writer,
            )
            # Only an ExponentialLR scheduler is stepped here, once per epoch
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(
                model=model,
                data_loader=val_data_loader,
                num_tasks=args.num_tasks,
                metrics=args.metrics,
                dataset_type=args.dataset_type,
                scaler=scaler,
                logger=logger,
            )

            for metric, scores in val_scores.items():
                # Average validation score
                avg_val_score = np.nanmean(scores)
                debug(f"Validation {metric} = {avg_val_score:.6f}")
                writer.add_scalar(f"validation_{metric}", avg_val_score,
                                  n_iter)

                if args.show_individual_scores:
                    # Individual validation scores
                    for task_name, val_score in zip(args.task_names, scores):
                        debug(
                            f"Validation {task_name} {metric} = {val_score:.6f}"
                        )
                        writer.add_scalar(f"validation_{task_name}_{metric}",
                                          val_score, n_iter)

            # Save model checkpoint if improved validation score
            avg_val_score = np.nanmean(val_scores[args.metric])
            if (args.minimize_score and avg_val_score < best_score
                    or not args.minimize_score and avg_val_score > best_score):
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(
                    os.path.join(save_dir, MODEL_FILE_NAME),
                    model,
                    scaler,
                    features_scaler,
                    atom_descriptor_scaler,
                    bond_feature_scaler,
                    args,
                )

        # Evaluate on test set using model with best validation score
        info(
            f"Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}"
        )
        model = load_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME),
                                device=args.device,
                                logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)
        test_scores = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metrics=args.metrics,
            dataset_type=args.dataset_type,
            logger=logger,
        )

        # An empty prediction list cannot be broadcast into the accumulator
        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        for metric, scores in test_scores.items():
            avg_test_score = np.nanmean(scores)
            info(f"Model {model_idx} test {metric} = {avg_test_score:.6f}")
            writer.add_scalar(f"test_{metric}", avg_test_score, 0)

            if args.show_individual_scores:
                # Individual test scores
                for task_name, test_score in zip(args.task_names, scores):
                    info(
                        f"Model {model_idx} test {task_name} {metric} = {test_score:.6f}"
                    )
                    writer.add_scalar(f"test_{task_name}_{metric}",
                                      test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(
        preds=avg_test_preds,
        targets=test_targets,
        num_tasks=args.num_tasks,
        metrics=args.metrics,
        dataset_type=args.dataset_type,
        logger=logger,
    )

    for metric, scores in ensemble_scores.items():
        # Average ensemble score
        avg_ensemble_test_score = np.nanmean(scores)
        info(f"Ensemble test {metric} = {avg_ensemble_test_score:.6f}")

        # Individual ensemble scores
        if args.show_individual_scores:
            for task_name, ensemble_score in zip(args.task_names, scores):
                info(
                    f"Ensemble test {task_name} {metric} = {ensemble_score:.6f}"
                )

    # Optionally save test preds
    if args.save_preds:
        test_preds_dataframe = pd.DataFrame(
            data={"smiles": test_data.smiles()})

        for i, task_name in enumerate(args.task_names):
            test_preds_dataframe[task_name] = [
                pred[i] for pred in avg_test_preds
            ]

        test_preds_dataframe.to_csv(os.path.join(args.save_dir,
                                                 "test_preds.csv"),
                                    index=False)

    return ensemble_scores
def _init_bbp_rho(bbp_model, args):
    """Call ``init_rho`` on every ``BayesLinear`` child of the model and of its inner encoder."""
    for layer in bbp_model.children():
        if isinstance(layer, BayesLinear):
            layer.init_rho(args.rho_min_bbp, args.rho_max_bbp)

    for layer in bbp_model.encoder.encoder.children():
        if isinstance(layer, BayesLinear):
            layer.init_rho(args.rho_min_bbp, args.rho_max_bbp)


def train_bbp(model, train_data, val_data, num_workers, cache, loss_func,
              metric_func, scaler, features_scaler, args, save_dir):
    """
    Trains a BBP model initialised from a pretrained model and returns the best checkpoint.

    A fresh :code:`MoleculeModelBBP` receives the (transposed) parameters of :code:`model`,
    is trained for :code:`args.epochs_bbp` epochs, and is checkpointed to
    :code:`<save_dir>/model_bbp.pt` whenever the averaged validation score improves at or
    after epoch :code:`args.presave_bbp`. The checkpoint is then reloaded and returned.

    :param model: Pretrained model whose parameters seed the BBP model.
    :param train_data: Training dataset.
    :param val_data: Validation dataset.
    :param num_workers: Number of workers for the data loaders.
    :param cache: Cache flag forwarded to the data loaders.
    :param loss_func: Loss function forwarded to :code:`train`.
    :param metric_func: Metric function forwarded to :code:`evaluate`.
    :param scaler: Target scaler, forwarded to :code:`evaluate` and stored in the checkpoint.
    :param features_scaler: Features scaler stored in the checkpoint.
    :param args: Arguments object providing the :code:`*_bbp` hyperparameters.
    :param save_dir: Directory in which :code:`model_bbp.pt` is written.
    :return: The BBP model loaded from :code:`<save_dir>/model_bbp.pt`.
    """
    checkpoint_path = os.path.join(save_dir, 'model_bbp.pt')

    # data loaders for bbp
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size_bbp,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size_bbp,
                                         num_workers=num_workers,
                                         cache=cache)

    # instantiate BBP model with Bayesian linear layers (includes log noise)
    model_bbp = MoleculeModelBBP(args)

    # copy over parameters from pretrained to BBP model
    # we take the transpose because the Bayes linear layers have transpose shapes
    for (_, param_bbp), (_, param_pre) in zip(model_bbp.named_parameters(),
                                              model.named_parameters()):
        param_bbp.data = copy.deepcopy(param_pre.data.T)

    # instantiate rho for each weight
    _init_bbp_rho(model_bbp, args)

    # move bbp model to cuda
    if args.cuda:
        print('Moving bbp model to cuda')
    model_bbp = model_bbp.to(args.device)

    # optimiser
    optimizer = torch.optim.Adam(model_bbp.parameters(), lr=args.lr_bbp)

    # scheduler
    scheduler = scheduler_const([args.lr_bbp])

    print("----------BBP training----------")

    # training loop
    best_score = float('inf') if args.minimize_score else -float('inf')
    best_epoch, n_iter = 0, 0
    for epoch in range(args.epochs_bbp):
        print(f'BBP epoch {epoch}')

        n_iter = train(model=model_bbp,
                       data_loader=train_data_loader,
                       loss_func=loss_func,
                       optimizer=optimizer,
                       scheduler=scheduler,
                       args=args,
                       n_iter=n_iter,
                       bbp_switch=2)

        val_scores = evaluate(model=model_bbp,
                              data_loader=val_data_loader,
                              args=args,
                              num_tasks=args.num_tasks,
                              metric_func=metric_func,
                              dataset_type=args.dataset_type,
                              scaler=scaler)

        # Average validation score
        avg_val_score = np.nanmean(val_scores)
        print(f'Validation {args.metric} = {avg_val_score:.6f}')
        wandb.log({"Validation MAE": avg_val_score})

        # Save model checkpoint if improved validation score
        # (checkpoints are only written from epoch args.presave_bbp onwards)
        improved = (args.minimize_score and avg_val_score < best_score
                    or not args.minimize_score and avg_val_score > best_score)
        if improved and epoch >= args.presave_bbp:
            best_score, best_epoch = avg_val_score, epoch
            save_checkpoint(checkpoint_path, model_bbp, scaler,
                            features_scaler, args)

    # If no epoch produced a checkpoint (zero epochs, presave_bbp beyond the last
    # epoch, or NaN scores) and none exists on disk, the reload below would fail.
    # Save the final weights in that case; an existing file is left untouched.
    if not os.path.isfile(checkpoint_path):
        print('No BBP checkpoint was saved during training; saving final model')
        save_checkpoint(checkpoint_path, model_bbp, scaler, features_scaler,
                        args)

    # load model with best validation score
    template = MoleculeModelBBP(args)
    _init_bbp_rho(template, args)

    print(
        f'Best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
    )
    model_bbp = load_checkpoint(checkpoint_path,
                                device=args.device,
                                logger=None,
                                template=template)

    return model_bbp
def make_predictions(args: Namespace,
                     smiles: List[str] = None,
                     invalid_smiles_warning: str = None) -> List[List[float]]:
    """
    Makes predictions with an ensemble of checkpoints and writes them to :code:`args.preds_path`.

    If smiles is provided, makes predictions on smiles. Otherwise loads data from
    :code:`args.test_path`.

    :param args: Arguments. Attributes missing on it are filled in from the training
                 arguments stored with the first checkpoint.
    :param smiles: Smiles to make predictions on.
    :param invalid_smiles_warning: When given together with :code:`smiles`, SMILES that RDKit
                                   cannot parse are excluded from prediction and their entry in
                                   the returned list is :code:`[invalid_smiles_warning]`.
    :return: A list of lists of target predictions, averaged over the checkpoints.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Filtering needs an explicit SMILES list; without one there is nothing to pre-validate.
    filter_invalid = invalid_smiles_warning is not None and smiles is not None
    if filter_invalid:
        full_smiles = smiles
        success_indices = [
            i for i, s in enumerate(full_smiles)
            if Chem.MolFromSmiles(s) is not None
        ]
        smiles = [full_smiles[i] for i in success_indices]

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles)
    else:
        test_data = get_data(args.test_path,
                             args,
                             use_compound_names=args.compound_names)
    test_smiles = test_data.smiles()
    if args.compound_names:
        compound_names = test_data.compound_names()
    print('Test size = {:,}'.format(len(test_data)))

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    num_models = len(args.checkpoint_paths)
    sum_preds = np.zeros((len(test_data), args.num_tasks))
    print('Predicting with an ensemble of {} models'.format(num_models))
    for checkpoint_path in tqdm(args.checkpoint_paths, total=num_models):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model,
                              data=test_data,
                              args=args,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions: divide by the number of models actually summed.
    # args.ensemble_size may have been copied from the training args and differ.
    avg_preds = (sum_preds / num_models).tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print('Saving predictions to {}'.format(args.preds_path))

    with open(args.preds_path, 'w') as f:
        if args.write_smiles:
            f.write('smiles,')
        if args.compound_names:
            f.write('compound_name,')
        f.write(','.join(args.task_names) + '\n')

        for i, preds in enumerate(avg_preds):
            if args.write_smiles:
                f.write(test_smiles[i] + ',')
            if args.compound_names:
                f.write(compound_names[i] + ',')
            f.write(','.join(str(p) for p in preds) + '\n')

    if filter_invalid:
        # Re-expand to the caller's original ordering, marking unparsable SMILES
        full_preds = [[invalid_smiles_warning] for _ in range(len(full_smiles))]
        for i, si in enumerate(success_indices):
            full_preds[si] = avg_preds[i]
        return full_preds

    return avg_preds
def make_predictions(args: Namespace,
                     smiles: List[str] = None
                     ) -> List[Optional[List[float]]]:
    """
    Makes predictions with uncertainty estimates and writes them to :code:`args.preds_path`.

    If smiles is provided, makes predictions on smiles. Otherwise makes predictions on
    :code:`args.test_path`. Predictions and aleatoric uncertainties are averaged over the
    checkpoints. The epistemic uncertainty is the variance of the predictions across
    checkpoints when :code:`args.estimate_variance` is set, and otherwise the mean of the
    per-model epistemic uncertainties.

    :param args: Arguments. Attributes missing on it are filled in from the training
                 arguments stored with the first checkpoint.
    :param smiles: Smiles to make predictions on.
    :return: A list with one entry per input molecule: the list of target predictions,
             or :code:`None` for an invalid SMILES.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        # Rows are written for every entry of full_data (invalid SMILES included),
        # so the names must come from full_data to stay aligned with those rows.
        compound_names = full_data.compound_names()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    num_models = len(args.checkpoint_paths)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_shape = (len(test_data), args.num_tasks,
                     args.multiclass_num_classes)
    else:
        sum_shape = (len(test_data), args.num_tasks)
    sum_preds = np.zeros(sum_shape)
    sum_ale_uncs = np.zeros(sum_shape)
    sum_epi_uncs = np.zeros(sum_shape)

    # Partial results for variance robust calculation.
    all_preds = np.zeros((len(test_data), args.num_tasks, num_models))

    print(f'Predicting with an ensemble of {num_models} models')
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=num_models)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds, ale_uncs, epi_uncs = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler,
            sampling_size=args.sampling_size)
        sum_preds += np.array(model_preds)
        if ale_uncs is not None:
            sum_ale_uncs += np.array(ale_uncs)
        if epi_uncs is not None:
            sum_epi_uncs += np.array(epi_uncs)
        if args.estimate_variance:
            all_preds[:, :, index] = model_preds

    # Ensemble predictions
    # preds <- mean(preds), ale_uncs <- mean(ale_uncs)
    avg_preds = (sum_preds / num_models).tolist()
    avg_ale_uncs = (sum_ale_uncs / num_models).tolist()
    if args.estimate_variance:
        # Use ensemble variance to estimate uncertainty. This overwrites
        # existing uncertainty estimates: epi_uncs <- var(preds)
        avg_epi_uncs = np.var(all_preds, axis=2).tolist()
    else:
        # Use another method to estimate uncertainty: epi_uncs <- mean(epi_uncs)
        avg_epi_uncs = (sum_epi_uncs / num_models).tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    assert len(test_data) == len(avg_ale_uncs)
    assert len(test_data) == len(avg_epi_uncs)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    full_ale_uncs = [None] * len(full_data)
    full_epi_uncs = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
        full_ale_uncs[si] = avg_ale_uncs[i]
        full_epi_uncs[si] = avg_epi_uncs[i]
    test_smiles = full_data.smiles()

    is_multiclass = args.dataset_type == 'multiclass'

    # Write predictions (newline='' is what the csv module expects of its file object)
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.writer(f)

        header = []
        if args.use_compound_names:
            header.append('compound_names')
        header.append('smiles')
        if is_multiclass:
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
            header.extend([tn + "_ale_unc" for tn in args.task_names])
            header.extend([tn + "_epi_unc" for tn in args.task_names])
        writer.writerow(header)

        for i, preds in enumerate(full_preds):
            row = []
            if args.use_compound_names:
                row.append(compound_names[i])
            row.append(test_smiles[i])

            if preds is not None:
                if is_multiclass:
                    for task_probs in preds:
                        row.extend(task_probs)
                else:
                    row.extend(preds)
                    row.extend(full_ale_uncs[i])
                    row.extend(full_epi_uncs[i])
            elif is_multiclass:
                row.extend([''] * args.num_tasks * args.multiclass_num_classes)
            else:
                # The prediction, the aleatoric uncertainty and the
                # epistemic uncertainty are all None
                row.extend([''] * 3 * args.num_tasks)

            writer.writerow(row)

    return full_preds
def molecule_fingerprint(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    The fingerprints are also written to :code:`args.preds_path` as CSV, one row per input
    datapoint; rows for invalid SMILES carry the string :code:`'Invalid SMILES'` in every
    fingerprint column.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats), one per valid datapoint.
    :raises ValueError: If more than one checkpoint path is supplied.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=False)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    # Maps an index into full_data to its index among the valid datapoints
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Load model
    print('Encoding smiles into a fingerprint vector from a single model')
    if len(args.checkpoint_paths) != 1:
        raise ValueError(
            "Fingerprint generation only supports one model, cannot use an ensemble"
        )

    model = load_checkpoint(args.checkpoint_paths[0], device=args.device)
    scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
        args.checkpoint_paths[0])

    # Normalize features
    if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
        test_data.reset_features_and_targets()
        if args.features_scaling:
            test_data.normalize_features(features_scaler)
        if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
            test_data.normalize_features(atom_descriptor_scaler,
                                         scale_atom_descriptors=True)
        if train_args.bond_feature_scaling and args.bond_features_size > 0:
            test_data.normalize_features(bond_feature_scaler,
                                         scale_bond_features=True)

    # Make fingerprints
    model_preds = model_fingerprint(model=model, data_loader=test_data_loader)

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(model_preds)
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to full_data
    total_hidden_size = args.hidden_size * args.number_of_molecules
    # The column names do not depend on the datapoint, so build them once
    fingerprint_columns = [f'fp_{i}' for i in range(total_hidden_size)]
    invalid_fingerprint = ['Invalid SMILES'] * total_hidden_size
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        if valid_index is not None:
            preds = model_preds[valid_index]
        else:
            preds = invalid_fingerprint

        for i, column in enumerate(fingerprint_columns):
            datapoint.row[column] = preds[i]

    # Write predictions (newline='' is what the csv module expects of its file object)
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns +
                                fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return model_preds
def make_predictions(args: Namespace,
                     smiles: List[str] = None
                     ) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_path`.

    NOTE: despite the return annotation, this variant returns a 2-tuple
    :code:`(predictions, smiles)`.

    :param args: Arguments. Attributes missing on it are filled in from the training
                 arguments stored with the first checkpoint.
    :param smiles: Smiles to make predictions on.
    :return: A tuple of the ensemble-averaged predictions for the valid molecules and the
             SMILES of those molecules. When no molecule is valid, a tuple of one
             :code:`None` per input molecule and the SMILES of all input molecules.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided.
    # Return a (predictions, smiles) pair like the normal path so that callers
    # can unpack the result the same way in both cases.
    if len(test_data) == 0:
        return [None] * len(full_data), full_data.smiles()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    num_models = len(args.checkpoint_paths)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(f'Predicting with an ensemble of {num_models} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=num_models):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model,
                              data=test_data,
                              batch_size=args.batch_size,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = (sum_preds / num_models).tolist()

    return avg_preds, test_data.smiles()
def make_predictions(args: Namespace,
                     smiles: List[str] = None
                     ) -> List[Optional[List[float]]]:
    """
    Makes predictions on raw SMILES strings and writes them to :code:`args.preds_path`.

    If smiles is provided, makes predictions on smiles. Otherwise the SMILES are read from
    the first comma-separated column of :code:`args.test_path`, skipping the header line.

    NOTE: predictions are not averaged over the ensemble; every checkpoint is run, but only
    the predictions of the last checkpoint in :code:`args.checkpoint_paths` are kept.

    :param args: Arguments. Attributes missing on it are filled in from the training
                 arguments stored with the first checkpoint.
    :param smiles: Smiles to make predictions on.
    :return: The predictions returned by :code:`predict` for the last checkpoint, one entry
             per SMILES.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    # Only fall back to the test file when the caller did not pass SMILES
    if smiles is None:
        with open(args.test_path, 'r') as f:
            next(f, None)  # skip the header line
            smiles = [line.split(',')[0].strip() for line in f]

    # No validation is performed in this variant; the raw strings go straight to predict
    print('Validating SMILES')

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        # Overwritten on every iteration: the last checkpoint wins
        avg_preds = predict(model=model,
                            data=smiles,
                            batch_size=args.batch_size,
                            scaler=scaler,
                            args=args)

    # Save predictions
    print(len(smiles), len(avg_preds))
    assert len(smiles) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    is_multiclass = args.dataset_type == 'multiclass'

    # Write predictions (newline='' is what the csv module expects of its file object)
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.writer(f)

        header = []
        if args.use_compound_names:
            # The column is declared in the header, but no compound names
            # are written into the rows below.
            header.append('compound_names')
        header.append('smiles')
        if is_multiclass:
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for smi, preds in zip(smiles, avg_preds):
            row = [smi]

            if preds is not None:
                if is_multiclass:
                    for task_probs in preds:
                        row.extend(task_probs)
                else:
                    row.extend(preds)
            elif is_multiclass:
                row.extend([''] * args.num_tasks * args.multiclass_num_classes)
            else:
                row.extend([''] * args.num_tasks)

            writer.writerow(row)

    return avg_preds