def evaluate(model: MoleculeModel, data_loader: MoleculeDataLoader, num_tasks: int, metric_func: Callable, dataset_type: str, scaler: StandardScaler = None, logger: logging.Logger = None) -> List[float]: """ Evaluates an ensemble of models on a dataset by making predictions and then evaluating the predictions. :param model: A :class:`~chemprop.models.model.MoleculeModel`. :param data_loader: A :class:`~chemprop.data.data.MoleculeDataLoader`. :param num_tasks: Number of tasks. :param metric_func: Metric function which takes in a list of targets and a list of predictions. :param dataset_type: Dataset type. :param scaler: A :class:`~chemprop.features.scaler.StandardScaler` object fit on the training targets. :param logger: A logger to record output. :return: A list with the score for each task based on :code:`metric_func`. """ preds = predict(model=model, data_loader=data_loader, scaler=scaler) targets = data_loader.targets() results = evaluate_predictions(preds=preds, targets=targets, num_tasks=num_tasks, metric_func=metric_func, dataset_type=dataset_type, logger=logger) return results
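# Hedged usage sketch for evaluate(): scoring a trained model on a validation
# split. The import paths and the pre-existing `model`, `val_data` (a
# MoleculeDataset) and `scaler` objects are assumptions, not part of the
# original snippet; the metric and dataset type are illustrative.
from chemprop.data import MoleculeDataLoader
from chemprop.train import evaluate
from chemprop.utils import get_metric_func

val_data_loader = MoleculeDataLoader(dataset=val_data, batch_size=50)
val_scores = evaluate(model=model,
                      data_loader=val_data_loader,
                      num_tasks=1,
                      metric_func=get_metric_func(metric='rmse'),
                      dataset_type='regression',
                      scaler=scaler)  # scaler fit on the training targets
print(f'Mean validation RMSE = {sum(val_scores) / len(val_scores):.6f}')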
def __call__(self, smiles: List[str], batch_size: int = 500) -> List[List[float]]: test_data = get_data_from_smiles( smiles=smiles, skip_invalid_smiles=False, features_generator=self.args.features_generator) valid_indices = [ i for i in range(len(test_data)) if test_data[i].mol is not None ] test_data = MoleculeDataset([test_data[i] for i in valid_indices]) if self.train_args.features_scaling: test_data.normalize_features(self.features_scaler) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=batch_size) sum_preds = [] for model in self.checkpoints: model_preds = predict(model=model, data_loader=test_data_loader, scaler=self.scaler, disable_progress_bar=True) sum_preds.append(np.array(model_preds)) # Ensemble predictions sum_preds = sum(sum_preds) avg_preds = sum_preds / len(self.checkpoints) return avg_preds
def evaluate(model: nn.Module, data_loader: MoleculeDataLoader, num_tasks: int, metric_func: Callable, dataset_type: str, scaler: StandardScaler = None, logger: logging.Logger = None) -> List[float]: """ Evaluates an ensemble of models on a dataset. :param model: A model. :param data_loader: A MoleculeDataLoader. :param num_tasks: Number of tasks. :param metric_func: Metric function which takes in a list of targets and a list of predictions. :param dataset_type: Dataset type. :param scaler: A StandardScaler object fit on the training targets. :param logger: Logger. :return: A list with the score for each task based on `metric_func`. """ preds = predict(model=model, data_loader=data_loader, scaler=scaler) targets = data_loader.targets() results = evaluate_predictions(preds=preds, targets=targets, num_tasks=num_tasks, metric_func=metric_func, dataset_type=dataset_type, logger=logger) return results
def __init__(
    self,
    args: Namespace,
    data,
    scaler: StandardScaler,
):
    """
    Constructs an UncertaintyEstimator.

    :param args: The command line arguments.
    :param data: The dataset to construct a data loader over.
    :param scaler: A scaler the model uses to transform input data.
    """
    self.args = args
    self.scaler = scaler
    self.data = data
    self.data_loader = MoleculeDataLoader(dataset=data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)
    self.data_len = len(data)
    if args.checkpoint_paths is not None:
        self.data_width = len(args.checkpoint_paths)
    self.split_UQ = args.split_UQ
    self.counter = 0
def compute_molecule_vectors(model: nn.Module, data: MoleculeDataset, batch_size: int, num_workers: int = 8) -> List[np.ndarray]: """ Computes the molecule vectors output from the last layer of a MoleculeModel. :param model: A MoleculeModel. :param data: A MoleculeDataset. :param batch_size: Batch size. :param num_workers: Number of parallel data loading workers. :return: A list of 1D numpy arrays of length hidden_size containing the molecule vectors generated by the model for each molecule provided. """ training = model.training model.eval() data_loader = MoleculeDataLoader( dataset=data, batch_size=batch_size, num_workers=num_workers ) vecs = [] for batch in tqdm(data_loader, total=len(data_loader)): # Apply model to batch with torch.no_grad(): batch_vecs = model.featurize(batch.mols(), batch.features()) # Collect vectors vecs.extend(batch_vecs.data.cpu().numpy()) if training: model.train() return vecs
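# Hedged usage sketch for compute_molecule_vectors(): extracting the learned
# molecule embeddings for downstream analysis (e.g. clustering). Assumes a
# trained `model` exposing .featurize() and a MoleculeDataset `dataset`;
# both names are illustrative.
import numpy as np

vecs = compute_molecule_vectors(model=model, data=dataset, batch_size=50)
embeddings = np.stack(vecs)  # shape: (num_molecules, hidden_size)
print(embeddings.shape)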
def gcnn_predict(self) -> Tuple[array, array]:
    """
    Function that handles graph convolutional neural network predictions,
    enters them into the predictions DataFrame and reports any errors.

    Parameters:
        rdkit_mols (array): a numpy array containing RDKit molecules
    Returns:
        predictions, prediction_labels (Tuple[array, array]): predictions and labels
    """
    smiles = self.rdkit_mols.tolist()
    full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1
    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=50,
                                          num_workers=0)
    model_preds = predict(model=rlm_gcnn_model,
                          data_loader=test_data_loader,
                          scaler=rlm_gcnn_scaler)

    predictions = np.ma.empty(len(full_data))
    predictions.mask = True
    labels = np.ma.empty(len(full_data))
    labels.mask = True
    for key in full_to_valid_indices.keys():
        full_index = int(key)
        predictions[full_index] = model_preds[full_to_valid_indices[key]][0]
        labels[full_index] = np.round(model_preds[full_to_valid_indices[key]][0], 0)

    self.predictions_df['GCNN'] = pd.Series(
        pd.Series(labels).fillna('').astype(str) + ' (' +
        pd.Series(predictions).round(2).astype(str) + ')').str.replace(
            '(nan)', '', regex=False)

    if len(self.predictions_df.index) > len(predictions) or np.ma.count_masked(predictions) > 0:
        self.model_errors.append('graph convolutional neural network')
        self.has_errors = True

    return predictions, labels
def load_data(args: PredictArgs, smiles: List[List[str]]): """ Function to load data from a list of smiles or a file. :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for loading data and a model and making predictions. :param smiles: A list of list of smiles, or None if data is to be read from file :return: A tuple of a :class:`~chemprop.data.MoleculeDataset` containing all datapoints, a :class:`~chemprop.data.MoleculeDataset` containing only valid datapoints, a :class:`~chemprop.data.MoleculeDataLoader` and a dictionary mapping full to valid indices. """ print("Loading data") if smiles is not None: full_data = get_data_from_smiles( smiles=smiles, skip_invalid_smiles=False, features_generator=args.features_generator, ) else: full_data = get_data( path=args.test_path, smiles_columns=args.smiles_columns, target_columns=[], ignore_columns=[], skip_invalid_smiles=False, args=args, store_row=not args.drop_extra_columns, ) print("Validating SMILES") full_to_valid_indices = {} valid_index = 0 for full_index in range(len(full_data)): if all(mol is not None for mol in full_data[full_index].mol): full_to_valid_indices[full_index] = valid_index valid_index += 1 test_data = MoleculeDataset( [full_data[i] for i in sorted(full_to_valid_indices.keys())]) print(f"Test size = {len(test_data):,}") # Create data loader test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=args.num_workers) return full_data, test_data, test_data_loader, full_to_valid_indices
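# Hedged usage sketch for load_data(): building a prediction set directly from
# SMILES. The PredictArgs flags shown are assumptions about the calling
# script's setup (checkpoint and output paths are illustrative); when `smiles`
# is passed, the --test_path file is not read.
from chemprop.args import PredictArgs

args = PredictArgs().parse_args([
    '--test_path', 'test.csv',
    '--checkpoint_dir', 'ckpts',
    '--preds_path', 'preds.csv',
])
full_data, test_data, test_data_loader, full_to_valid_indices = load_data(
    args, smiles=[['CCO'], ['c1ccccc1']])
print(f'{len(test_data)} of {len(full_data)} molecules are valid')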
def __call__(self, smiles: List[str], batch_size: int = 500) -> List[List[float]]: """ Makes predictions on a list of SMILES. :param smiles: A list of SMILES to make predictions on. :param batch_size: The batch size. :return: A list of lists of floats containing the predicted values. """ test_data = get_data_from_smiles( smiles=smiles, skip_invalid_smiles=False, features_generator=self.args.features_generator) valid_indices = [ i for i in range(len(test_data)) if test_data[i].mol is not None ] test_data = MoleculeDataset([test_data[i] for i in valid_indices]) if self.train_args.features_scaling: test_data.normalize_features(self.features_scaler) if self.train_args.atom_descriptor_scaling and self.args.atom_descriptors is not None: test_data.normalize_features(self.atom_descriptor_scaler, scale_atom_descriptors=True) if self.train_args.bond_feature_scaling and self.args.bond_features_size > 0: test_data.normalize_features(self.bond_feature_scaler, scale_bond_features=True) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=batch_size) sum_preds = [] for model in self.checkpoints: model_preds = predict(model=model, data_loader=test_data_loader, scaler=self.scaler, disable_progress_bar=True) sum_preds.append(np.array(model_preds)) # Ensemble predictions sum_preds = sum(sum_preds) avg_preds = sum_preds / len(self.checkpoints) return avg_preds
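# Hedged usage sketch for the __call__ predictor above. The surrounding class
# (here called Predictor) and its constructor are hypothetical assumptions;
# the snippet only illustrates that the object is called directly on a list of
# SMILES and returns one row of task predictions per valid molecule (invalid
# SMILES are silently dropped by the valid_indices filter, so indices may shift).
predictor = Predictor(checkpoint_dir='ckpts')  # hypothetical constructor
preds = predictor(['CCO', 'CC(=O)Nc1ccc(O)cc1'], batch_size=500)
print(preds.shape)  # (num_valid_smiles, num_tasks)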
def __call__(self, smiles, batch_size=500): test_data = get_data_from_smiles( smiles=[[s] for s in smiles], skip_invalid_smiles=False, features_generator=self.features_generator) valid_indices = [ i for i in range(len(test_data)) if test_data[i].mol[0] is not None ] full_data = test_data test_data = MoleculeDataset([test_data[i] for i in valid_indices]) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=batch_size) sum_preds = np.zeros((len(test_data), 1)) for model, scaler, features_scaler in zip(self.checkpoints, self.scalers, self.features_scalers): test_data.reset_features_and_targets() if features_scaler is not None: test_data.normalize_features(features_scaler) model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler) sum_preds += np.array(model_preds) # Ensemble predictions avg_preds = sum_preds / len(self.checkpoints) avg_preds = avg_preds.squeeze(-1).tolist() # Put zero for invalid smiles full_preds = [0.0] * len(full_data) for i, si in enumerate(valid_indices): full_preds[si] = avg_preds[i] return np.array(full_preds, dtype=np.float32)
def train_dun( model, train_data, val_data, num_workers, cache, loss_func, metric_func, scaler, features_scaler, args, save_dir ): # data loaders for dun train_data_loader = MoleculeDataLoader( dataset=train_data, batch_size=args.batch_size_dun, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed ) val_data_loader = MoleculeDataLoader( dataset=val_data, batch_size=args.batch_size_dun, num_workers=num_workers, cache=cache ) # instantiate DUN model with Bayesian linear layers (includes log noise) model_dun = MoleculeModelDUN(args) # copy over parameters from pretrained to DUN model # we take the transpose because the Bayes linear layers have transpose shapes for (_, param_dun), (_, param_pre) in zip(model_dun.named_parameters(), model.named_parameters()): param_dun.data = copy.deepcopy(param_pre.data.T) # instantiate rho for each weight for layer in model_dun.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_dun, args.rho_max_dun) for layer in model_dun.encoder.encoder.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_dun, args.rho_max_dun) # instantiate variational categorical distribution model_dun.create_log_cat(args) # move dun model to cuda if args.cuda: print('Moving dun model to cuda') model_dun = model_dun.to(args.device) # loss_func loss_func = neg_log_likeDUN # optimiser optimizer = torch.optim.Adam(model_dun.parameters(), lr=args.lr_dun_min) # scheduler scheduler = NoamLR( optimizer=optimizer, warmup_epochs=[2], total_epochs=[100], steps_per_epoch=args.train_data_size // args.batch_size_dun, init_lr=[args.lr_dun_min], max_lr=[args.lr_dun_max], final_lr=[args.lr_dun_min] ) # non sampling mode for first 100 epochs bbp_switch = 3 # freeze log_cat for first 100 epochs for name, parameter in model_dun.named_parameters(): if name == 'log_cat': parameter.requires_grad = False else: parameter.requires_grad = True print("----------DUN training----------") # training loop best_score = float('inf') if args.minimize_score else -float('inf') best_epoch, n_iter = 0, 0 for epoch in range(args.epochs_dun): print(f'DUN epoch {epoch}') # start second phase if epoch == 100: scheduler = scheduler_const([args.lr_dun_min]) bbp_switch = 4 for name, parameter in model_dun.named_parameters(): parameter.requires_grad = True n_iter = train( model=model_dun, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, bbp_switch=bbp_switch ) val_scores = evaluate( model=model_dun, data_loader=val_data_loader, args=args, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, scaler=scaler ) # Average validation score avg_val_score = np.nanmean(val_scores) print(f'Validation {args.metric} = {avg_val_score:.6f}') wandb.log({"Validation MAE": avg_val_score}) print('variational categorical:') print(torch.exp(model_dun.log_cat) / torch.sum(torch.exp(model_dun.log_cat))) # Save model checkpoint if improved validation score if (args.minimize_score and avg_val_score < best_score or \ not args.minimize_score and avg_val_score > best_score) and (epoch >= args.presave_dun): best_score, best_epoch = avg_val_score, epoch save_checkpoint(os.path.join(save_dir, 'model_dun.pt'), model_dun, scaler, features_scaler, args) # load model with best validation score template = MoleculeModelDUN(args) for layer in template.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_dun, args.rho_max_dun) for layer in 
template.encoder.encoder.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_dun, args.rho_max_dun) template.create_log_cat(args) print(f'Best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}') model_dun = load_checkpoint(os.path.join(save_dir, 'model_dun.pt'), device=args.device, logger=None, template = template) return model_dun
def train_swag(model, train_data, val_data, num_workers, cache, loss_func, metric_func, scaler, features_scaler, args, save_dir): # create data loaders for swag (allows different batch size) train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size_swag, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed) val_data_loader = MoleculeDataLoader(dataset=val_data, batch_size=args.batch_size_swag, num_workers=num_workers, cache=cache) # define no_cov_mat from cov_mat if args.cov_mat: no_cov_mat = False else: no_cov_mat = True # instantiate SWAG model (wrapper) swag_model = SWAG(model, args, no_cov_mat, args.max_num_models, var_clamp=1e-30) ############## DEFINE COSINE OPTIMISER AND SCHEDULER ############## # define optimiser optimizer = torch.optim.SGD([{ 'params': model.encoder.parameters() }, { 'params': model.ffn.parameters() }, { 'params': model.log_noise, 'lr': args.lr_swag / 5 / 25, 'weight_decay': 0 }], lr=args.lr_swag / 25, weight_decay=args.weight_decay_swag, momentum=args.momentum_swag) # define scheduler num_param_groups = len(optimizer.param_groups) scheduler = OneCycleLR( optimizer, max_lr=[args.lr_swag, args.lr_swag, args.lr_swag / 5], epochs=args.epochs_swag, steps_per_epoch=-(-args.train_data_size // args.batch_size_swag), pct_start=5 / args.epochs_swag, anneal_strategy='cos', cycle_momentum=False, div_factor=25.0, final_div_factor=1 / 25) ################################################################### print("----------SWAG training----------") # training loop n_iter = 0 for epoch in range(args.epochs_swag): print(f'SWAG epoch {epoch}') n_iter = train(model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter) val_scores = evaluate(model=model, data_loader=val_data_loader, args=args, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, scaler=scaler) # Average validation score avg_val_score = np.nanmean(val_scores) print(f'Validation {args.metric} = {avg_val_score:.6f}') wandb.log({"Validation MAE": avg_val_score}) # SWAG update if (epoch >= args.burnin_swag) and (avg_val_score < args.val_threshold): swag_model.collect_model(model) print('***collection***') # save final swag model save_checkpoint(os.path.join(save_dir, 'model.pt'), swag_model, scaler, features_scaler, args) return swag_model
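# Hedged sketch of how the SWAG wrapper returned by train_swag() is typically
# consumed: draw weight samples from the collected posterior and average the
# resulting predictions. The sample()/predict() signatures mirror the calls
# that appear in the training script further below; all variable names are
# assumed to already exist in the calling scope.
import numpy as np

swag_model = train_swag(model, train_data, val_data, num_workers, cache,
                        loss_func, metric_func, scaler, features_scaler,
                        args, save_dir)
sum_preds = np.zeros((len(test_data), args.num_tasks))
for _ in range(args.samples):
    swag_model.sample(scale=1.0, cov=args.cov_mat, block=args.block)
    sample_preds = predict(model=swag_model, data_loader=test_data_loader,
                           args=args, scaler=scaler, test_data=True,
                           bbp_sample=False)
    sum_preds += np.array(sample_preds)
avg_preds = sum_preds / args.samples  # Bayesian model average over samples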
def gcnn_predict(self, model, scaler) -> Tuple[array, array]:
    """
    Function that handles graph convolutional neural network predictions,
    enters them into the predictions DataFrame and reports any errors.

    Parameters:
        model (MoleculeModel): the trained graph convolutional model
        scaler (StandardScaler): the scaler fit on the training targets
    Returns:
        predictions, prediction_labels (Tuple[array, array]): predictions and labels
    """
    smiles = self.kekule_smiles.tolist()
    full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1
    data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # create data loader
    data_loader = MoleculeDataLoader(
        dataset=data,
        batch_size=50,
        num_workers=0
    )

    model_preds = predict(
        model=model,
        data_loader=data_loader,
        scaler=scaler
    )

    predictions = np.ma.empty(len(full_data))
    predictions.mask = True
    labels = np.ma.empty(len(full_data), dtype=np.int32)
    labels.mask = True
    for key in full_to_valid_indices.keys():
        full_index = int(key)
        predictions[full_index] = model_preds[full_to_valid_indices[key]][0]
        labels[full_index] = np.round(model_preds[full_to_valid_indices[key]][0], 0)

    if self.smiles is not None:
        dt = datetime.datetime.now(timezone.utc)
        utc_time = dt.replace(tzinfo=timezone.utc)
        utc_timestamp = utc_time.timestamp()
        self.raw_predictions_df = self.raw_predictions_df.append(
            pd.DataFrame(
                {
                    'SMILES': self.smiles,
                    'model': self.model_name,
                    'prediction': predictions,
                    'timestamp': utc_timestamp
                }
            ),
            ignore_index=True
        )

    # if self.interpret == True:
    #     intrprt_df = get_interpretation(self.smiles, self.model_name)
    # else:
    #     col_names = ['smiles', 'rationale_smiles', 'rationale_score']
    #     intrprt_df = pd.DataFrame(columns = col_names)
    # self.predictions_df['smiles'] = pd.Series(np.where(intrprt_df['rationale_scores']>0, intrprt_df['smiles'] + '_' + intrprt_df['rationale_smiles'], intrprt_df['smiles']))

    self.predictions_df[self.column_dict_key] = pd.Series(
        pd.Series(labels).fillna('').astype(str) + ' (' +
        pd.Series(np.where(predictions >= 0.5, predictions, (1 - predictions))).round(2).astype(str) + ')'
    ).str.replace('(nan)', '', regex=False)

    if len(self.predictions_df.index) > len(predictions) or np.ma.count_masked(predictions) > 0:
        self.model_errors.append('graph convolutional neural network')
        self.has_errors = True

    return predictions, labels
def run_training(args: TrainArgs, data: MoleculeDataset, logger: Logger = None) -> Dict[str, List[float]]: """ Loads data, trains a Chemprop model, and returns test scores for the model checkpoint with the highest validation score. :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for loading data and training the Chemprop model. :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data. :param logger: A logger to record output. :return: A dictionary mapping each metric in :code:`args.metrics` to a list of values for each task. """ if logger is not None: debug, info = logger.debug, logger.info else: debug = info = print # Set pytorch seed for random initial weights torch.manual_seed(args.pytorch_seed) # Split data debug(f"Splitting data with seed {args.seed}") # if args.separate_test_path: # test_data = get_data( # path=args.separate_test_path, # args=args, # features_path=args.separate_test_features_path, # atom_descriptors_path=args.separate_test_atom_descriptors_path, # bond_features_path=args.separate_test_bond_features_path, # smiles_columns=args.smiles_columns, # logger=logger, # ) # if args.separate_val_path: # val_data = get_data( # path=args.separate_val_path, # args=args, # features_path=args.separate_val_features_path, # atom_descriptors_path=args.separate_val_atom_descriptors_path, # bond_features_path=args.separate_val_bond_features_path, # smiles_columns=args.smiles_columns, # logger=logger, # ) # if args.separate_val_path and args.separate_test_path: # train_data = data # elif args.separate_val_path: # train_data, _, test_data = split_data( # data=data, # split_type=args.split_type, # sizes=(0.8, 0.0, 0.2), # seed=args.seed, # num_folds=args.num_folds, # args=args, # logger=logger, # ) # elif args.separate_test_path: # train_data, val_data, _ = split_data( # data=data, # split_type=args.split_type, # sizes=(0.8, 0.2, 0.0), # seed=args.seed, # num_folds=args.num_folds, # args=args, # logger=logger, # ) # else: # Default train_data, val_data, test_data = split_data( data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, num_folds=args.num_folds, args=args, logger=logger, ) if args.dataset_type == "classification": class_sizes = get_class_sizes(data) debug("Class sizes") for i, task_class_sizes in enumerate(class_sizes): debug( f"{args.task_names[i]} " f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}' ) if args.save_smiles_splits: save_smiles_splits( data_path=args.data_path, save_dir=args.save_dir, task_names=args.task_names, features_path=args.features_path, train_data=train_data, val_data=val_data, test_data=test_data, smiles_columns=args.smiles_columns, ) if args.features_scaling: features_scaler = train_data.normalize_features(replace_nan_token=0) val_data.normalize_features(features_scaler) test_data.normalize_features(features_scaler) else: features_scaler = None if args.atom_descriptor_scaling and args.atom_descriptors is not None: atom_descriptor_scaler = train_data.normalize_features( replace_nan_token=0, scale_atom_descriptors=True) val_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True) test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True) else: atom_descriptor_scaler = None if args.bond_feature_scaling and args.bond_features_size > 0: bond_feature_scaler = train_data.normalize_features( replace_nan_token=0, scale_bond_features=True) val_data.normalize_features(bond_feature_scaler, scale_bond_features=True) 
test_data.normalize_features(bond_feature_scaler, scale_bond_features=True) else: bond_feature_scaler = None args.train_data_size = len(train_data) debug( f"Total size = {len(data):,} | " f"train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}" ) # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only) if args.dataset_type == "regression": debug("Fitting scaler") scaler = train_data.normalize_targets() else: scaler = None # Get loss function loss_func = get_loss_func(args) # Set up test set evaluation test_smiles, test_targets = test_data.smiles(), test_data.targets() if args.dataset_type == "multiclass": sum_test_preds = np.zeros( (len(test_smiles), args.num_tasks, args.multiclass_num_classes)) else: sum_test_preds = np.zeros((len(test_smiles), args.num_tasks)) # Automatically determine whether to cache if len(data) <= args.cache_cutoff: set_cache_graph(True) num_workers = 0 else: set_cache_graph(False) num_workers = args.num_workers # Create data loaders train_data_loader = MoleculeDataLoader( dataset=train_data, batch_size=args.batch_size, num_workers=num_workers, class_balance=args.class_balance, shuffle=True, seed=args.seed, ) val_data_loader = MoleculeDataLoader(dataset=val_data, batch_size=args.batch_size, num_workers=num_workers) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=num_workers) if args.class_balance: debug( f"With class_balance, effective train size = {train_data_loader.iter_size:,}" ) # Train ensemble of models for model_idx in range(args.ensemble_size): # Tensorboard writer save_dir = os.path.join(args.save_dir, f"model_{model_idx}") makedirs(save_dir) try: writer = SummaryWriter(log_dir=save_dir) except: writer = SummaryWriter(logdir=save_dir) # Load/build model if args.checkpoint_paths is not None: debug( f"Loading model {model_idx} from {args.checkpoint_paths[model_idx]}" ) model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger) else: debug(f"Building model {model_idx}") model = MoleculeModel(args) debug(model) debug(f"Number of parameters = {param_count(model):,}") if args.cuda: debug("Moving model to cuda") model = model.to(args.device) # Ensure that model is saved in correct location for evaluation if 0 epochs save_checkpoint( os.path.join(save_dir, MODEL_FILE_NAME), model, scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler, args, ) # Optimizers optimizer = build_optimizer(model, args) # Learning rate schedulers scheduler = build_lr_scheduler(optimizer, args) # Run training best_score = float("inf") if args.minimize_score else -float("inf") best_epoch, n_iter = 0, 0 for epoch in trange(args.epochs): debug(f"Epoch {epoch}") n_iter = train( model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, logger=logger, writer=writer, ) if isinstance(scheduler, ExponentialLR): scheduler.step() val_scores = evaluate( model=model, data_loader=val_data_loader, num_tasks=args.num_tasks, metrics=args.metrics, dataset_type=args.dataset_type, scaler=scaler, logger=logger, ) for metric, scores in val_scores.items(): # Average validation score avg_val_score = np.nanmean(scores) debug(f"Validation {metric} = {avg_val_score:.6f}") writer.add_scalar(f"validation_{metric}", avg_val_score, n_iter) if args.show_individual_scores: # Individual validation scores for task_name, val_score in zip(args.task_names, scores): debug( 
f"Validation {task_name} {metric} = {val_score:.6f}" ) writer.add_scalar(f"validation_{task_name}_{metric}", val_score, n_iter) # Save model checkpoint if improved validation score avg_val_score = np.nanmean(val_scores[args.metric]) if (args.minimize_score and avg_val_score < best_score or not args.minimize_score and avg_val_score > best_score): best_score, best_epoch = avg_val_score, epoch save_checkpoint( os.path.join(save_dir, MODEL_FILE_NAME), model, scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler, args, ) # Evaluate on test set using model with best validation score info( f"Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}" ) model = load_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), device=args.device, logger=logger) test_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler) test_scores = evaluate_predictions( preds=test_preds, targets=test_targets, num_tasks=args.num_tasks, metrics=args.metrics, dataset_type=args.dataset_type, logger=logger, ) if len(test_preds) != 0: sum_test_preds += np.array(test_preds) # Average test score for metric, scores in test_scores.items(): avg_test_score = np.nanmean(scores) info(f"Model {model_idx} test {metric} = {avg_test_score:.6f}") writer.add_scalar(f"test_{metric}", avg_test_score, 0) if args.show_individual_scores: # Individual test scores for task_name, test_score in zip(args.task_names, scores): info( f"Model {model_idx} test {task_name} {metric} = {test_score:.6f}" ) writer.add_scalar(f"test_{task_name}_{metric}", test_score, n_iter) writer.close() # Evaluate ensemble on test set avg_test_preds = (sum_test_preds / args.ensemble_size).tolist() ensemble_scores = evaluate_predictions( preds=avg_test_preds, targets=test_targets, num_tasks=args.num_tasks, metrics=args.metrics, dataset_type=args.dataset_type, logger=logger, ) for metric, scores in ensemble_scores.items(): # Average ensemble score avg_ensemble_test_score = np.nanmean(scores) info(f"Ensemble test {metric} = {avg_ensemble_test_score:.6f}") # Individual ensemble scores if args.show_individual_scores: for task_name, ensemble_score in zip(args.task_names, scores): info( f"Ensemble test {task_name} {metric} = {ensemble_score:.6f}" ) # Optionally save test preds if args.save_preds: test_preds_dataframe = pd.DataFrame( data={"smiles": test_data.smiles()}) for i, task_name in enumerate(args.task_names): test_preds_dataframe[task_name] = [ pred[i] for pred in avg_test_preds ] test_preds_dataframe.to_csv(os.path.join(args.save_dir, "test_preds.csv"), index=False) return ensemble_scores
def make_predictions( args: PredictArgs, smiles: List[List[str]] = None) -> List[List[Optional[float]]]: """ Loads data and a trained model and uses the model to make predictions on the data. If SMILES are provided, then makes predictions on smiles. Otherwise makes predictions on :code:`args.test_data`. :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for loading data and a model and making predictions. :param smiles: List of list of SMILES to make predictions on. :return: A list of lists of target predictions. """ print('Loading training args') train_args = load_args(args.checkpoint_paths[0]) num_tasks, task_names = train_args.num_tasks, train_args.task_names # If features were used during training, they must be used when predicting if ((train_args.features_path is not None or train_args.features_generator is not None) and args.features_path is None and args.features_generator is None): raise ValueError( 'Features were used during training so they must be specified again during prediction ' 'using the same type of features as before (with either --features_generator or ' '--features_path and using --no_features_scaling if applicable).') # Update predict args with training arguments to create a merged args object for key, value in vars(train_args).items(): if not hasattr(args, key): setattr(args, key, value) args: Union[PredictArgs, TrainArgs] print('Loading data') if smiles is not None: full_data = get_data_from_smiles( smiles=smiles, skip_invalid_smiles=False, features_generator=args.features_generator) else: full_data = get_data(path=args.test_path, target_columns=[], ignore_columns=[], skip_invalid_smiles=False, args=args, store_row=True) print('Validating SMILES') full_to_valid_indices = {} valid_index = 0 for full_index in range(len(full_data)): if all(mol is not None for mol in full_data[full_index].mol): full_to_valid_indices[full_index] = valid_index valid_index += 1 test_data = MoleculeDataset( [full_data[i] for i in sorted(full_to_valid_indices.keys())]) # Edge case if empty list of smiles is provided if len(test_data) == 0: return [None] * len(full_data) print(f'Test size = {len(test_data):,}') # Predict with each model individually and sum predictions if args.dataset_type == 'multiclass': sum_preds = np.zeros( (len(test_data), num_tasks, args.multiclass_num_classes)) else: sum_preds = np.zeros((len(test_data), num_tasks)) # Create data loader test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=args.num_workers) print( f'Predicting with an ensemble of {len(args.checkpoint_paths)} models') for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)): # Load model and scalers model = load_checkpoint(checkpoint_path, device=args.device) scaler, features_scaler = load_scalers(checkpoint_path) # Normalize features if args.features_scaling: test_data.reset_features_and_targets() test_data.normalize_features(features_scaler) # Make predictions model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler) sum_preds += np.array(model_preds) # Ensemble predictions avg_preds = sum_preds / len(args.checkpoint_paths) avg_preds = avg_preds.tolist() # Save predictions print(f'Saving predictions to {args.preds_path}') assert len(test_data) == len(avg_preds) makedirs(args.preds_path, isfile=True) # Get prediction column names if args.dataset_type == 'multiclass': task_names = [ f'{name}_class_{i}' for name in task_names for i in range(args.multiclass_num_classes) ] else: 
task_names = task_names # Copy predictions over to full_data for full_index, datapoint in enumerate(full_data): valid_index = full_to_valid_indices.get(full_index, None) preds = avg_preds[valid_index] if valid_index is not None else [ 'Invalid SMILES' ] * len(task_names) for pred_name, pred in zip(task_names, preds): datapoint.row[pred_name] = pred # Save with open(args.preds_path, 'w') as f: writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys()) writer.writeheader() for datapoint in full_data: writer.writerow(datapoint.row) return avg_preds
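# Hedged usage sketch for make_predictions(): scoring a few SMILES with a
# trained checkpoint directory. The flags are assumptions about a typical
# chemprop setup; --test_path is supplied only to satisfy argument parsing and
# is not read when `smiles` is provided. The function returns one list of task
# predictions per valid molecule and writes 'Invalid SMILES' entries to
# preds.csv for the rest.
from chemprop.args import PredictArgs

args = PredictArgs().parse_args([
    '--test_path', 'test.csv',
    '--checkpoint_dir', 'ckpts',
    '--preds_path', 'preds.csv',
])
avg_preds = make_predictions(args, smiles=[['CCO'], ['c1ccccc1O']])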
def predict_feature(args, logger, model, external_test_path): if logger is not None: debug, info = logger.debug, logger.info else: debug = info = print # Print command line debug('Command line') debug(f'python {" ".join(sys.argv)}') # Print args debug('Args') debug(args) # Save args args.save(os.path.join(args.save_dir, 'args.json')) # Set pytorch seed for random initial weights torch.manual_seed(args.pytorch_seed) # Get data debug('Loading data') args.task_names = args.target_columns or get_task_names(args.data_path) data = get_data(path=args.data_path, args=args, logger=logger) args.num_tasks = data.num_tasks() args.features_size = data.features_size() debug(f'Number of tasks = {args.num_tasks}') external_test_data = get_data(path=external_test_path, args=args, logger=logger) # Split data debug(f'Splitting data with seed {args.seed}') if args.separate_test_path: test_data = get_data(path=args.separate_test_path, args=args, features_path=args.separate_test_features_path, logger=logger) if args.separate_val_path: val_data = get_data(path=args.separate_val_path, args=args, features_path=args.separate_val_features_path, logger=logger) if args.separate_val_path and args.separate_test_path: train_data = data elif args.separate_val_path: train_data, _, test_data = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.0, 0.2), seed=args.seed, args=args, logger=logger) elif args.separate_test_path: train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger) else: train_data, val_data, test_data = split_data( data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger) if args.dataset_type == 'classification': class_sizes = get_class_sizes(data) debug('Class sizes') for i, task_class_sizes in enumerate(class_sizes): debug( f'{args.task_names[i]} ' f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}' ) if args.save_smiles_splits: save_smiles_splits(train_data=train_data, val_data=val_data, test_data=test_data, data_path=args.data_path, save_dir=args.save_dir) if args.features_scaling: features_scaler = train_data.normalize_features(replace_nan_token=0) external_test_data.normalize_features(features_scaler) else: features_scaler = None args.train_data_size = len(train_data) debug( f'Total size = {len(data):,} | ' f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}' ) scaler = None # Automatically determine whether to cache if len(data) <= args.cache_cutoff: cache = True num_workers = 0 else: cache = False num_workers = args.num_workers # Create data loaders train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=False, seed=args.seed) external_test_loader = MoleculeDataLoader(dataset=external_test_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=False, seed=args.seed) external_test_preds, external_test_feature = predict( model=model, data_loader=external_test_loader, scaler=scaler) external_test_smiles, external_test_targets = external_test_data.smiles( ), external_test_data.targets() return external_test_smiles, external_test_feature, external_test_preds, external_test_targets
def make_predictions( args: PredictArgs, smiles: List[List[str]] = None) -> List[List[Optional[float]]]: """ Loads data and a trained model and uses the model to make predictions on the data. If SMILES are provided, then makes predictions on smiles. Otherwise makes predictions on :code:`args.test_data`. :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for loading data and a model and making predictions. :param smiles: List of list of SMILES to make predictions on. :return: A list of lists of target predictions. """ print("Loading training args") train_args = load_args(args.checkpoint_paths[0]) num_tasks, task_names = train_args.num_tasks, train_args.task_names update_prediction_args(predict_args=args, train_args=train_args) args: Union[PredictArgs, TrainArgs] if args.atom_descriptors == "feature": set_extra_atom_fdim(train_args.atom_features_size) if args.bond_features_path is not None: set_extra_bond_fdim(train_args.bond_features_size) # set explicit H option and reaction option set_explicit_h(train_args.explicit_h) set_reaction(train_args.reaction, train_args.reaction_mode) print("Loading data") if smiles is not None: full_data = get_data_from_smiles( smiles=smiles, skip_invalid_smiles=False, features_generator=args.features_generator, ) else: full_data = get_data( path=args.test_path, smiles_columns=args.smiles_columns, target_columns=[], ignore_columns=[], skip_invalid_smiles=False, args=args, store_row=not args.drop_extra_columns, ) print("Validating SMILES") full_to_valid_indices = {} valid_index = 0 for full_index in range(len(full_data)): if all(mol is not None for mol in full_data[full_index].mol): full_to_valid_indices[full_index] = valid_index valid_index += 1 test_data = MoleculeDataset( [full_data[i] for i in sorted(full_to_valid_indices.keys())]) # Edge case if empty list of smiles is provided if len(test_data) == 0: return [None] * len(full_data) print(f"Test size = {len(test_data):,}") # Predict with each model individually and sum predictions if args.dataset_type == "multiclass": sum_preds = np.zeros( (len(test_data), num_tasks, args.multiclass_num_classes)) else: sum_preds = np.zeros((len(test_data), num_tasks)) # Create data loader test_data_loader = MoleculeDataLoader( dataset=test_data, batch_size=args.batch_size, num_workers=0 if sys.platform == "darwin" else args.num_workers, ) # Partial results for variance robust calculation. 
if args.ensemble_variance: all_preds = np.zeros( (len(test_data), num_tasks, len(args.checkpoint_paths))) print( f"Predicting with an ensemble of {len(args.checkpoint_paths)} models") for index, checkpoint_path in enumerate( tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))): # Load model and scalers model = load_checkpoint(checkpoint_path, device=args.device) ( scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler, ) = load_scalers(checkpoint_path) # Normalize features if (args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling): test_data.reset_features_and_targets() if args.features_scaling: test_data.normalize_features(features_scaler) if (train_args.atom_descriptor_scaling and args.atom_descriptors is not None): test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True) if train_args.bond_feature_scaling and args.bond_features_size > 0: test_data.normalize_features(bond_feature_scaler, scale_bond_features=True) # Make predictions model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler) sum_preds += np.array(model_preds) if args.ensemble_variance: all_preds[:, :, index] = model_preds # Ensemble predictions avg_preds = sum_preds / len(args.checkpoint_paths) avg_preds = avg_preds.tolist() if args.ensemble_variance: all_epi_uncs = np.var(all_preds, axis=2) all_epi_uncs = all_epi_uncs.tolist() # Save predictions print(f"Saving predictions to {args.preds_path}") assert len(test_data) == len(avg_preds) if args.ensemble_variance: assert len(test_data) == len(all_epi_uncs) makedirs(args.preds_path, isfile=True) # Get prediction column names if args.dataset_type == "multiclass": task_names = [ f"{name}_class_{i}" for name in task_names for i in range(args.multiclass_num_classes) ] else: task_names = task_names # Copy predictions over to full_data for full_index, datapoint in enumerate(full_data): valid_index = full_to_valid_indices.get(full_index, None) preds = (avg_preds[valid_index] if valid_index is not None else ["Invalid SMILES"] * len(task_names)) if args.ensemble_variance: epi_uncs = (all_epi_uncs[valid_index] if valid_index is not None else ["Invalid SMILES"] * len(task_names)) # If extra columns have been dropped, add back in SMILES columns if args.drop_extra_columns: datapoint.row = OrderedDict() smiles_columns = args.smiles_columns for column, smiles in zip(smiles_columns, datapoint.smiles): datapoint.row[column] = smiles # Add predictions columns if args.ensemble_variance: for pred_name, pred, epi_unc in zip(task_names, preds, epi_uncs): datapoint.row[pred_name] = pred datapoint.row[pred_name + "_epi_unc"] = epi_unc else: for pred_name, pred in zip(task_names, preds): datapoint.row[pred_name] = pred # Save with open(args.preds_path, "w") as f: writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys()) writer.writeheader() for datapoint in full_data: writer.writerow(datapoint.row) return avg_preds
def train_sgld(model, train_data, val_data, num_workers, cache, loss_func, metric_func, scaler, features_scaler, args, save_dir): # create data loaders for sgld (allows different batch size) train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size_sgld, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed) val_data_loader = MoleculeDataLoader(dataset=val_data, batch_size=args.batch_size_sgld, num_workers=num_workers, cache=cache) # number of sgld epochs epochs_sgld = args.mix_epochs * args.samples print("----------SGLD training----------") # training loop n_iter = 0 sample_idx = 0 for epoch in range(epochs_sgld): ##### DEFINE OPTIMISER AND SCHEDULER ######################## if epoch % args.mix_epochs == 0: print('\n********** resetting scheduler **********') optimizer = SGLD([{ 'params': model.encoder.parameters() }, { 'params': model.ffn.parameters() }, { 'params': model.log_noise, 'lr': args.lr_max_sgld / 5 / 25, 'addnoise': False }], args, lr=args.lr_max_sgld / 25, weight_decay=args.weight_decay_sgld, addnoise=True) num_param_groups = len(optimizer.param_groups) scheduler = OneCycleLR(optimizer, max_lr=[ args.lr_max_sgld, args.lr_max_sgld, args.lr_max_sgld / 5 ], epochs=args.mix_epochs, steps_per_epoch=-(-args.train_data_size // args.batch_size_sgld), pct_start=0.2, anneal_strategy='cos', cycle_momentum=False, div_factor=25.0, final_div_factor=10000) ############################################################# print(f'SGLD epoch {epoch}') n_iter = train(model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter) val_scores = evaluate(model=model, data_loader=val_data_loader, args=args, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, scaler=scaler) # Average validation score avg_val_score = np.nanmean(val_scores) print(f'Validation {args.metric} = {avg_val_score:.6f}') wandb.log({"Validation MAE": avg_val_score}) # collect model samples if (epoch + 1) % args.mix_epochs == 0: print( f'---------- collecting sgld sample {sample_idx} ----------\n') save_checkpoint(os.path.join(save_dir, f'model_{sample_idx}.pt'), model, scaler, features_scaler, args) sample_idx += 1 return model
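# Hedged sketch of how the SGLD checkpoints saved above are typically consumed
# at prediction time: each collected sample is reloaded and the predictions
# are averaged (Bayesian model averaging). This mirrors the load_checkpoint /
# predict calls in the training script below; variable names are assumed to
# already exist in the calling scope.
import os
import numpy as np

sum_preds = np.zeros((len(test_data), args.num_tasks))
for sample_idx in range(args.samples):
    sample_model = load_checkpoint(os.path.join(save_dir, f'model_{sample_idx}.pt'),
                                   device=args.device)
    sample_preds = predict(model=sample_model, data_loader=test_data_loader,
                           args=args, scaler=scaler, test_data=True,
                           bbp_sample=False)
    sum_preds += np.array(sample_preds)
avg_preds = sum_preds / args.samples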
def train_bbp(model, train_data, val_data, num_workers, cache, loss_func, metric_func, scaler, features_scaler, args, save_dir): # data loaders for bbp train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size_bbp, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed) val_data_loader = MoleculeDataLoader(dataset=val_data, batch_size=args.batch_size_bbp, num_workers=num_workers, cache=cache) # instantiate BBP model with Bayesian linear layers (includes log noise) model_bbp = MoleculeModelBBP(args) # copy over parameters from pretrained to BBP model # we take the transpose because the Bayes linear layers have transpose shapes for (_, param_bbp), (_, param_pre) in zip(model_bbp.named_parameters(), model.named_parameters()): param_bbp.data = copy.deepcopy(param_pre.data.T) # instantiate rho for each weight for layer in model_bbp.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_bbp, args.rho_max_bbp) for layer in model_bbp.encoder.encoder.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_bbp, args.rho_max_bbp) # move bbp model to cuda if args.cuda: print('Moving bbp model to cuda') model_bbp = model_bbp.to(args.device) # optimiser optimizer = torch.optim.Adam(model_bbp.parameters(), lr=args.lr_bbp) # scheduler scheduler = scheduler_const([args.lr_bbp]) print("----------BBP training----------") # training loop best_score = float('inf') if args.minimize_score else -float('inf') best_epoch, n_iter = 0, 0 for epoch in range(args.epochs_bbp): print(f'BBP epoch {epoch}') n_iter = train(model=model_bbp, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, bbp_switch=2) val_scores = evaluate(model=model_bbp, data_loader=val_data_loader, args=args, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, scaler=scaler) # Average validation score avg_val_score = np.nanmean(val_scores) print(f'Validation {args.metric} = {avg_val_score:.6f}') wandb.log({"Validation MAE": avg_val_score}) # Save model checkpoint if improved validation score if (args.minimize_score and avg_val_score < best_score or \ not args.minimize_score and avg_val_score > best_score) and (epoch >= args.presave_bbp): best_score, best_epoch = avg_val_score, epoch save_checkpoint(os.path.join(save_dir, 'model_bbp.pt'), model_bbp, scaler, features_scaler, args) # load model with best validation score template = MoleculeModelBBP(args) for layer in template.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_bbp, args.rho_max_bbp) for layer in template.encoder.encoder.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_bbp, args.rho_max_bbp) print( f'Best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}' ) model_bbp = load_checkpoint(os.path.join(save_dir, 'model_bbp.pt'), device=args.device, logger=None, template=template) return model_bbp
def new_noise(args: TrainArgs, logger: Logger = None) -> List[float]: """ Trains a model and returns test scores on the model checkpoint with the highest validation score. :param args: Arguments. :param logger: Logger. :return: A list of ensemble scores for each task. """ debug = info = print # Get data args.task_names = args.target_columns or get_task_names(args.data_path) data = get_data(path=args.data_path, args=args, logger=logger) args.num_tasks = data.num_tasks() args.features_size = data.features_size() # Split data debug(f'Splitting data with seed {args.seed}') train_data, val_data, test_data = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger) if args.features_scaling: features_scaler = train_data.normalize_features(replace_nan_token=0) val_data.normalize_features(features_scaler) test_data.normalize_features(features_scaler) else: features_scaler = None args.train_data_size = len(train_data) # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only) if args.dataset_type == 'regression': debug('Fitting scaler') train_smiles, train_targets = train_data.smiles(), train_data.targets() scaler = StandardScaler().fit(train_targets) scaled_targets = scaler.transform(train_targets).tolist() train_data.set_targets(scaled_targets) else: scaler = None # Get loss and metric functions loss_func = neg_log_like metric_func = get_metric_func(metric=args.metric) # Set up test set evaluation test_smiles, test_targets = test_data.smiles(), test_data.targets() sum_test_preds = np.zeros((len(test_smiles), args.num_tasks)) # Automatically determine whether to cache if len(data) <= args.cache_cutoff: cache = True num_workers = 0 else: cache = False num_workers = args.num_workers # Create data loaders train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) val_data_loader = MoleculeDataLoader(dataset=val_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) ########################################### ########## Outer loop over ensemble members ########################################### for model_idx in range(args.ensemble_start_idx, args.ensemble_start_idx + args.ensemble_size): # load the model if (args.method == 'map') or (args.method == 'swag') or (args.method == 'sgld'): model = load_checkpoint(args.checkpoint_path + f'/model_{model_idx}/model.pt', device=args.device, logger=logger) if args.method == 'gp': args.num_inducing_points = 1200 fake_model = MoleculeModel(args) fake_model.featurizer = True feature_extractor = fake_model inducing_points = initial_inducing_points(train_data_loader, feature_extractor, args) gp_layer = GPLayer(inducing_points, args.num_tasks) model = load_checkpoint( args.checkpoint_path + f'/model_{model_idx}/DKN_model.pt', device=args.device, logger=None, template=DKLMoleculeModel(MoleculeModel(args, featurizer=True), gp_layer)) if args.method == 'dropR' or args.method == 'dropA': model = load_checkpoint(args.checkpoint_path + f'/model_{model_idx}/model.pt', device=args.device, logger=logger) if args.method == 'bbp': template = MoleculeModelBBP(args) for layer in template.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_bbp, args.rho_max_bbp) for layer in template.encoder.encoder.children(): if isinstance(layer, 
BayesLinear): layer.init_rho(args.rho_min_bbp, args.rho_max_bbp) model = load_checkpoint(args.checkpoint_path + f'/model_{model_idx}/model_bbp.pt', device=args.device, logger=None, template=template) if args.method == 'dun': args.prior_sig_dun = 0.05 args.depth_min = 1 args.depth_max = 5 args.rho_min_dun = -5.5 args.rho_max_dun = -5 args.log_cat_init = 0 template = MoleculeModelDUN(args) for layer in template.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_dun, args.rho_max_dun) for layer in template.encoder.encoder.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_dun, args.rho_max_dun) template.create_log_cat(args) model = load_checkpoint(args.checkpoint_path + f'/model_{model_idx}/model_dun.pt', device=args.device, logger=None, template=template) # make results_dir results_dir = os.path.join(args.results_dir, f'model_{model_idx}') makedirs(results_dir) # train_preds, train_targets train_preds = predict(model=model, data_loader=train_data_loader, args=args, scaler=scaler, test_data=False, bbp_sample=False) train_preds = np.array(train_preds) train_targets = np.array(train_targets) # compute tstats tstats = np.ones((12, 3)) for task in range(12): resid = train_preds[:, task] - train_targets[:, task] tstats[task] = np.array(stats.t.fit(resid, floc=0.0)) ################################## ########## Inner loop over samples ################################## for sample_idx in range(args.samples): # save down np.savez(os.path.join(results_dir, f'tstats_{sample_idx}'), tstats) print('done one')
def run_training(args: TrainArgs, logger: Logger = None) -> List[float]: """ Trains a model and returns test scores on the model checkpoint with the highest validation score. :param args: Arguments. :param logger: Logger. :return: A list of ensemble scores for each task. """ debug = info = print # Print command line and args debug('Command line') debug(f'python {" ".join(sys.argv)}') debug('Args') debug(args) # Save args args.save(os.path.join(args.save_dir, 'args.json')) # Get data debug('Loading data') args.task_names = args.target_columns or get_task_names(args.data_path) data = get_data(path=args.data_path, args=args, logger=logger) args.num_tasks = data.num_tasks() args.features_size = data.features_size() debug(f'Number of tasks = {args.num_tasks}') # Split data debug(f'Splitting data with seed {args.seed}') train_data, val_data, test_data = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger) if args.features_scaling: features_scaler = train_data.normalize_features(replace_nan_token=0) val_data.normalize_features(features_scaler) test_data.normalize_features(features_scaler) else: features_scaler = None args.train_data_size = len(train_data) debug( f'Total size = {len(data):,} | ' f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}' ) # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only) if args.dataset_type == 'regression': debug('Fitting scaler') train_smiles, train_targets = train_data.smiles(), train_data.targets() scaler = StandardScaler().fit(train_targets) scaled_targets = scaler.transform(train_targets).tolist() train_data.set_targets(scaled_targets) else: scaler = None # Get loss and metric functions loss_func = neg_log_like metric_func = get_metric_func(metric=args.metric) # Set up test set evaluation test_smiles, test_targets = test_data.smiles(), test_data.targets() sum_test_preds = np.zeros((len(test_smiles), args.num_tasks)) # Automatically determine whether to cache if len(data) <= args.cache_cutoff: cache = True num_workers = 0 else: cache = False num_workers = args.num_workers # Create data loaders train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed) val_data_loader = MoleculeDataLoader(dataset=val_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) ########################################### ########## Outer loop over ensemble members ########################################### for model_idx in range(args.ensemble_start_idx, args.ensemble_start_idx + args.ensemble_size): # Set pytorch seed for random initial weights torch.manual_seed(args.pytorch_seeds[model_idx]) ######## set up all logging ######## # make save_dir save_dir = os.path.join(args.save_dir, f'model_{model_idx}') makedirs(save_dir) # make results_dir results_dir = os.path.join(args.results_dir, f'model_{model_idx}') makedirs(results_dir) # initialise wandb os.environ['WANDB_MODE'] = 'dryrun' wandb.init(name=args.wandb_name + '_' + str(model_idx), project=args.wandb_proj, reinit=True) print('WANDB directory is:') print(wandb.run.dir) #################################### # Load/build model if args.checkpoint_path is not None: debug(f'Loading model {model_idx} 
from {args.checkpoint_path}') model = load_checkpoint(args.checkpoint_path + f'/model_{model_idx}/model.pt', device=args.device, logger=logger) else: debug(f'Building model {model_idx}') model = MoleculeModel(args) debug(model) debug(f'Number of parameters = {param_count(model):,}') if args.cuda: debug('Moving model to cuda') model = model.to(args.device) # Ensure that model is saved in correct location for evaluation if 0 epochs save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args) # Optimizer optimizer = Adam([{ 'params': model.encoder.parameters() }, { 'params': model.ffn.parameters() }, { 'params': model.log_noise, 'weight_decay': 0 }], lr=args.init_lr, weight_decay=args.weight_decay) # Learning rate scheduler scheduler = build_lr_scheduler(optimizer, args) # Run training best_score = float('inf') if args.minimize_score else -float('inf') best_epoch, n_iter = 0, 0 for epoch in range(args.epochs): debug(f'Epoch {epoch}') n_iter = train(model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, logger=logger) val_scores = evaluate(model=model, data_loader=val_data_loader, args=args, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, scaler=scaler, logger=logger) # Average validation score avg_val_score = np.nanmean(val_scores) debug(f'Validation {args.metric} = {avg_val_score:.6f}') wandb.log({"Validation MAE": avg_val_score}) # Save model checkpoint if improved validation score if args.minimize_score and avg_val_score < best_score or \ not args.minimize_score and avg_val_score > best_score: best_score, best_epoch = avg_val_score, epoch save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args) if epoch == args.noam_epochs - 1: optimizer = Adam([{ 'params': model.encoder.parameters() }, { 'params': model.ffn.parameters() }, { 'params': model.log_noise, 'weight_decay': 0 }], lr=args.final_lr, weight_decay=args.weight_decay) scheduler = scheduler_const([args.final_lr]) # load model with best validation score info( f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}' ) model = load_checkpoint(os.path.join(save_dir, 'model.pt'), device=args.device, logger=logger) # SWAG training loop, returns swag_model if args.swag: model = train_swag(model, train_data, val_data, num_workers, cache, loss_func, metric_func, scaler, features_scaler, args, save_dir) # SGLD loop, which saves nets if args.sgld: model = train_sgld(model, train_data, val_data, num_workers, cache, loss_func, metric_func, scaler, features_scaler, args, save_dir) # GP loop if args.gp: model, likelihood = train_gp(model, train_data, val_data, num_workers, cache, metric_func, scaler, features_scaler, args, save_dir) # BBP if args.bbp: model = train_bbp(model, train_data, val_data, num_workers, cache, loss_func, metric_func, scaler, features_scaler, args, save_dir) # DUN if args.dun: model = train_dun(model, train_data, val_data, num_workers, cache, loss_func, metric_func, scaler, features_scaler, args, save_dir) ################################## ########## Inner loop over samples ################################## for sample_idx in range(args.samples): # draw model from SWAG posterior if args.swag: model.sample(scale=1.0, cov=args.cov_mat, block=args.block) # draw model from collected SGLD models if args.sgld: model = load_checkpoint(os.path.join(save_dir, f'model_{sample_idx}.pt'), device=args.device, logger=logger) # make 
predictions test_preds = predict(model=model, data_loader=test_data_loader, args=args, scaler=scaler, test_data=True, bbp_sample=True) ####################################################################### ####################################################################### ##### SAVING STUFF DOWN if args.gp: # get test_preds_std (scaled back to original data) test_preds_std = predict_std_gp(model=model, data_loader=test_data_loader, args=args, scaler=scaler, likelihood=likelihood) # 1 - MEANS np.savez(os.path.join(results_dir, f'preds_{sample_idx}'), np.array(test_preds)) # 2 - STD, combined aleatoric and epistemic (we save down the stds, always) np.savez(os.path.join(results_dir, f'predsSTDEV_{sample_idx}'), np.array(test_preds_std)) else: # save test_preds and aleatoric uncertainties if args.dun: log_cat = model.log_cat.detach().cpu().numpy() cat = np.exp(log_cat) / np.sum(np.exp(log_cat)) np.savez(os.path.join(results_dir, f'cat_{sample_idx}'), cat) # samples from categorical dist and saves a depth MC sample depth_sample = np.random.multinomial(1, cat).nonzero()[0][0] test_preds_MCdepth = predict_MCdepth( model=model, data_loader=test_data_loader, args=args, scaler=scaler, d=depth_sample) np.savez( os.path.join(results_dir, f'predsMCDEPTH_{sample_idx}'), np.array(test_preds_MCdepth)) if args.swag: log_noise = model.base.log_noise else: log_noise = model.log_noise noise = np.exp(log_noise.detach().cpu().numpy()) * np.array( scaler.stds) np.savez(os.path.join(results_dir, f'preds_{sample_idx}'), np.array(test_preds)) np.savez(os.path.join(results_dir, f'noise_{sample_idx}'), noise) ####################################################################### ####################################################################### # add predictions to sum_test_preds if len(test_preds) != 0: sum_test_preds += np.array(test_preds) # evaluate predictions using metric function test_scores = evaluate_predictions(preds=test_preds, targets=test_targets, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, logger=logger) # compute average test score avg_test_score = np.nanmean(test_scores) info( f'Model {model_idx}, sample {sample_idx} test {args.metric} = {avg_test_score:.6f}' ) ################################# ########## Bayesian Model Average ################################# # note: this is an average over Bayesian samples AND components in an ensemble # compute number of prediction iterations pred_iterations = args.ensemble_size * args.samples # average predictions across iterations avg_test_preds = (sum_test_preds / pred_iterations).tolist() # evaluate BMA_scores = evaluate_predictions(preds=avg_test_preds, targets=test_targets, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, logger=logger) # average scores across tasks avg_BMA_test_score = np.nanmean(BMA_scores) info(f'BMA test {args.metric} = {avg_BMA_test_score:.6f}') return BMA_scores
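# --- Illustrative sketch (not part of the pipeline above) ---
# The Bayesian model average computed at the end of run_training sums test predictions over
# every (ensemble member, posterior sample) draw and divides by ensemble_size * samples.
# A minimal numpy version of that averaging; the random arrays stand in for predict() outputs
# obtained from freshly drawn SWAG/SGLD/BBP/DUN samples, and all sizes are made up.
import numpy as np

ensemble_size, samples = 2, 3                         # hypothetical settings
num_molecules, num_tasks = 5, 1
rng = np.random.default_rng(0)

sum_test_preds = np.zeros((num_molecules, num_tasks))
for _ in range(ensemble_size * samples):              # one draw per (member, sample) pair
    test_preds = rng.normal(size=(num_molecules, num_tasks))  # stand-in for predict(...)
    sum_test_preds += test_preds

pred_iterations = ensemble_size * samples
avg_test_preds = sum_test_preds / pred_iterations     # Bayesian model average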
def find_similar_mols(test_smiles: List[str], train_smiles: List[str], distance_measure: str, model: MoleculeModel = None, num_neighbors: int = None, batch_size: int = 50) -> List[OrderedDict]: """ For each test molecule, finds the N most similar training molecules according to some distance measure. :param test_smiles: A list of test SMILES strings. :param train_smiles: A list of train SMILES strings. :param model: A trained MoleculeModel (only needed for distance_measure == 'embedding'). :param distance_measure: The distance measure to use to determine nearest neighbors. :param num_neighbors: The number of nearest training molecules to find for each test molecule. :param batch_size: Batch size. :return: A list of OrderedDicts containing the test smiles, the num_neighbors nearest training smiles, and other relevant distance info. """ test_data = get_data_from_smiles(smiles=[[smiles] for smiles in test_smiles]) train_data = get_data_from_smiles(smiles=[[smiles] for smiles in train_smiles]) train_smiles_set = set(train_smiles) # Create data loader test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=batch_size, num_workers=args.num_workers) train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=batch_size, num_workers=args.num_workers) print(f'Computing {distance_measure} vectors') if distance_measure == 'embedding': assert model is not None test_vecs = np.array( model_fingerprint(model=model, data_loader=test_data_loader, fingerprint_type='last_FFN')) train_vecs = np.array( model_fingerprint(model=model, data_loader=train_data_loader, fingerprint_type='last_FFN')) metric = 'cosine' elif distance_measure == 'morgan': test_vecs = np.array([ morgan_binary_features_generator(smiles) for smiles in tqdm(test_smiles, total=len(test_smiles)) ]) train_vecs = np.array([ morgan_binary_features_generator(smiles) for smiles in tqdm(train_smiles, total=len(train_smiles)) ]) metric = 'jaccard' elif distance_measure == 'tanimoto': # Generate RDKit topological fingerprints test_fps = [Chem.RDKFingerprint(m.mol[0]) for m in tqdm(test_data)] train_fps = [Chem.RDKFingerprint(m.mol[0]) for m in tqdm(train_data)] # Compute pairwise similarity print('Computing distances') similarity = np.zeros([len(test_fps), len(train_fps)]) for (x, y), _ in np.ndenumerate(similarity): similarity[x, y] = DataStructs.FingerprintSimilarity( test_fps[x], train_fps[y]) # Convert the tanimoto similarity to a distance distances = 1 - similarity metric = 'tanimoto' else: raise ValueError( f'Distance measure "{distance_measure}" not supported.') if distance_measure in ('embedding', 'morgan'): print('Computing distances') distances = cdist(test_vecs, train_vecs, metric=metric) print('Finding neighbors') neighbors = [] for test_index, test_smile in enumerate(test_smiles): # Find the num_neighbors molecules in the training set which are most similar to the test molecule nearest_train_indices = np.argsort( distances[test_index])[:num_neighbors] # Build dictionary with distance info neighbor = OrderedDict() neighbor['test_smiles'] = test_smile neighbor['test_in_train'] = test_smile in train_smiles_set for i, train_index in enumerate(nearest_train_indices): neighbor[f'train_{i + 1}_smiles'] = train_smiles[train_index] neighbor[ f'train_{i + 1}_{distance_measure}_{metric}_distance'] = distances[ test_index][train_index] neighbors.append(neighbor) return neighbors
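# --- Illustrative sketch (not part of the pipeline above) ---
# A minimal version of the embedding/Morgan branch of find_similar_mols: pairwise distances via
# scipy's cdist, then an argsort per test row to pick the num_neighbors closest training molecules.
# The vectors are random stand-ins for the learned embeddings or Morgan fingerprints.
import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
test_vecs = rng.random((4, 300))     # e.g. embeddings for 4 test molecules
train_vecs = rng.random((100, 300))  # embeddings for 100 training molecules

distances = cdist(test_vecs, train_vecs, metric='cosine')    # (4, 100)
num_neighbors = 5
nearest = np.argsort(distances, axis=1)[:, :num_neighbors]   # train indices per test molecule
nearest_dists = np.take_along_axis(distances, nearest, axis=1)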
def train_gp( model, train_data, val_data, num_workers, cache, metric_func, scaler, features_scaler, args, save_dir): """ Trains a deep kernel learning (DKL) model: a sparse variational GP layer on top of the message passing feature extractor. Returns the model and likelihood from the epoch with the best validation score. """ # create data loaders for gp (allows different batch size) train_data_loader = MoleculeDataLoader( dataset=train_data, batch_size=args.batch_size_gp, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed ) val_data_loader = MoleculeDataLoader( dataset=val_data, batch_size=args.batch_size_gp, num_workers=num_workers, cache=cache ) # feature_extractor model.featurizer = True feature_extractor = model # inducing points inducing_points = initial_inducing_points( train_data_loader, feature_extractor, args ) # GP layer gp_layer = GPLayer(inducing_points, args.num_tasks) # full DKL model model = copy.deepcopy(DKLMoleculeModel(feature_extractor, gp_layer)) # likelihood # rank 0 restricts to diagonal matrix likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=args.num_tasks, rank=0) # model and likelihood to CUDA if args.cuda: model.cuda() likelihood.cuda() # loss object mll = gpytorch.mlls.VariationalELBO(likelihood, model.gp_layer, num_data=args.train_data_size) # optimizer params_list = [ {'params': model.feature_extractor.parameters(), 'weight_decay': args.weight_decay_gp}, {'params': model.gp_layer.hyperparameters()}, {'params': model.gp_layer.variational_parameters()}, {'params': likelihood.parameters()}, ] optimizer = torch.optim.Adam(params_list, lr=args.init_lr_gp) # scheduler num_params = len(params_list) scheduler = NoamLR( optimizer=optimizer, warmup_epochs=[args.warmup_epochs_gp]*num_params, total_epochs=[args.noam_epochs_gp]*num_params, steps_per_epoch=args.train_data_size // args.batch_size_gp, init_lr=[args.init_lr_gp]*num_params, max_lr=[args.max_lr_gp]*num_params, final_lr=[args.final_lr_gp]*num_params) print("----------GP training----------") # training loop best_score = float('inf') if args.minimize_score else -float('inf') best_epoch, n_iter = 0, 0 for epoch in range(args.epochs_gp): print(f'GP epoch {epoch}') if epoch == args.noam_epochs_gp: scheduler = scheduler_const([args.final_lr_gp]) n_iter = train( model=model, data_loader=train_data_loader, loss_func=mll, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, gp_switch=True, likelihood=likelihood ) val_scores = evaluate( model=model, data_loader=val_data_loader, args=args, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, scaler=scaler ) # Average validation score avg_val_score = np.nanmean(val_scores) print(f'Validation {args.metric} = {avg_val_score:.6f}') wandb.log({"Validation MAE": avg_val_score}) # Save model AND LIKELIHOOD checkpoint if improved validation score if args.minimize_score and avg_val_score < best_score or \ not args.minimize_score and avg_val_score > best_score: best_score, best_epoch = avg_val_score, epoch save_checkpoint(os.path.join(save_dir, 'DKN_model.pt'), model, scaler, features_scaler, args) best_likelihood = copy.deepcopy(likelihood) # load model with best validation score # NOTE: TEMPLATE MUST BE NEWLY INSTANTIATED MODEL print(f'Loading model with best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}') model = load_checkpoint(os.path.join(save_dir, 'DKN_model.pt'), device=args.device, logger=None, template=DKLMoleculeModel(MoleculeModel(args, featurizer=True), gp_layer)) return model, best_likelihood
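# --- Illustrative sketch (not part of the pipeline above) ---
# A simplified, single-task, stand-alone sketch of the kind of sparse variational GP head that
# train_gp builds with gpytorch. The real GPLayer / DKLMoleculeModel (defined elsewhere in this
# repo) are multitask and sit on top of the MPN featuriser; here random features stand in for
# MPN embeddings, and SketchGPLayer is a hypothetical name.
import torch
import gpytorch


class SketchGPLayer(gpytorch.models.ApproximateGP):
    """Single-task sparse variational GP (a simplified stand-in for GPLayer)."""

    def __init__(self, inducing_points: torch.Tensor):
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
            inducing_points.size(0))
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True)
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x))


# One illustrative training step on random features standing in for MPN embeddings.
features = torch.randn(64, 16)
targets = torch.randn(64)
gp = SketchGPLayer(inducing_points=features[:10])
likelihood = gpytorch.likelihoods.GaussianLikelihood()
mll = gpytorch.mlls.VariationalELBO(likelihood, gp, num_data=features.size(0))
optimizer = torch.optim.Adam([{'params': gp.parameters()},
                              {'params': likelihood.parameters()}], lr=1e-2)
optimizer.zero_grad()
loss = -mll(gp(features), targets)   # negative ELBO, analogous to the mll loss above
loss.backward()
optimizer.step()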
def make_predictions(args: PredictArgs, smiles: List[str] = None) -> List[Optional[List[float]]]: """ Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data. :param args: Arguments. :param smiles: Smiles to make predictions on. :return: A list of lists of target predictions. """ print('Loading training args') scaler, features_scaler = load_scalers(args.checkpoint_paths[0]) train_args = load_args(args.checkpoint_paths[0]) num_tasks, task_names = train_args.num_tasks, train_args.task_names # If features were used during training, they must be used when predicting if ((train_args.features_path is not None or train_args.features_generator is not None) and args.features_path is None and args.features_generator is None): raise ValueError( 'Features were used during training so they must be specified again during prediction ' 'using the same type of features as before (with either --features_generator or ' '--features_path and using --no_features_scaling if applicable).') # Update predict args with training arguments to create a merged args object for key, value in vars(train_args).items(): if not hasattr(args, key): setattr(args, key, value) args: Union[PredictArgs, TrainArgs] print('Loading data') if smiles is not None: full_data = get_data_from_smiles( smiles=smiles, skip_invalid_smiles=False, features_generator=args.features_generator) else: full_data = get_data(path=args.test_path, args=args, target_columns=[], skip_invalid_smiles=False) print('Validating SMILES') full_to_valid_indices = {} valid_index = 0 for full_index in range(len(full_data)): if full_data[full_index].mol is not None: full_to_valid_indices[full_index] = valid_index valid_index += 1 test_data = MoleculeDataset( [full_data[i] for i in sorted(full_to_valid_indices.keys())]) # Edge case if empty list of smiles is provided if len(test_data) == 0: return [None] * len(full_data) print(f'Test size = {len(test_data):,}') # Normalize features if args.features_scaling: test_data.normalize_features(features_scaler) # Initialize uncertainty estimator if args.uncertainty: uncertainty_estimator = uncertainty_estimator_builder( args.uncertainty)(args, test_data, scaler) # Predict with each model individually and sum predictions if not args.uncertainty: if args.dataset_type == 'multiclass': sum_preds = np.zeros( (len(test_data), num_tasks, args.multiclass_num_classes)) else: sum_preds = np.zeros((len(test_data), num_tasks)) # Create data loader test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=args.num_workers) print( f'Predicting with an ensemble of {len(args.checkpoint_paths)} models') for N, checkpoint_path in tqdm(enumerate(args.checkpoint_paths), total=len(args.checkpoint_paths)): # Load model model = load_checkpoint(checkpoint_path, device=args.device) model.training = False if not args.uncertainty: model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler) sum_preds += np.array(model_preds) else: uncertainty_estimator.process_model(model, N) # Ensemble predictions if not args.uncertainty: avg_preds = sum_preds / len(args.checkpoint_paths) avg_preds = avg_preds.tolist() else: avg_preds, avg_UQ = uncertainty_estimator.calculate_UQ() if type(avg_UQ) is tuple: aleatoric, epistemic = avg_UQ # Save predictions print(f'Saving predictions to {args.preds_path}') assert len(test_data) == len(avg_preds) makedirs(args.preds_path, isfile=True) # Get prediction column names if args.dataset_type == 'multiclass': 
task_names = [ f'{name}_class_{i}' for name in task_names for i in range(args.multiclass_num_classes) ] else: task_names = task_names # Copy predictions over to full_data for full_index, datapoint in enumerate(full_data): valid_index = full_to_valid_indices.get(full_index, None) preds = avg_preds[valid_index] if valid_index is not None else [ 'Invalid SMILES' ] * len(task_names) if args.uncertainty: if not args.split_UQ: cur_UQ = avg_UQ[valid_index] if valid_index is not None else [ 'Invalid SMILES' ] * len(task_names) datapoint.row['Uncertainty'] = cur_UQ elif args.split_UQ: cur_al = aleatoric[ valid_index] if valid_index is not None else [ 'Invalid SMILES' ] * len(task_names) cur_ep = epistemic[ valid_index] if valid_index is not None else [ 'Invalid SMILES' ] * len(task_names) datapoint.row['Aleatoric'] = cur_al datapoint.row['Epistemic'] = cur_ep if type(preds) is list: for pred_name, pred in zip(task_names, preds): datapoint.row[pred_name] = pred else: datapoint.row[task_names[0]] = preds # Save with open(args.preds_path, 'w') as f: writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys()) writer.writeheader() for datapoint in full_data: writer.writerow(datapoint.row) return avg_preds
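# --- Illustrative sketch (not part of the pipeline above) ---
# The bookkeeping at the end of make_predictions maps valid predictions back to the full input
# order via full_to_valid_indices, gives invalid rows an 'Invalid SMILES' placeholder, attaches
# the uncertainty column, and writes everything with csv.DictWriter. A toy, self-contained
# version of that step; all molecules, predictions, and uncertainties are made up.
import csv

full_smiles = ['CCO', 'not_a_smiles', 'c1ccccc1']
full_to_valid_indices = {0: 0, 2: 1}                 # maps full index -> index into avg_preds
task_names = ['logP']
avg_preds = [[0.12], [0.87]]                         # one row per *valid* molecule
avg_UQ = [[0.05], [0.20]]                            # total uncertainty per valid molecule

rows = []
for full_index, smiles in enumerate(full_smiles):
    valid_index = full_to_valid_indices.get(full_index)
    if valid_index is not None:
        preds, uq = avg_preds[valid_index], avg_UQ[valid_index]
    else:
        preds = uq = ['Invalid SMILES'] * len(task_names)
    row = {'smiles': smiles, 'Uncertainty': uq[0]}
    row.update(zip(task_names, preds))
    rows.append(row)

with open('preds_sketch.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['smiles'] + task_names + ['Uncertainty'])
    writer.writeheader()
    writer.writerows(rows)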
def molecule_fingerprint( args: PredictArgs, smiles: List[List[str]] = None) -> List[List[Optional[float]]]: """ Loads data and a trained model and uses the model to encode fingerprint vectors for the data. :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for loading data and a model and making predictions. :param smiles: List of list of SMILES to make predictions on. :return: A list of fingerprint vectors (list of floats) """ print('Loading training args') train_args = load_args(args.checkpoint_paths[0]) # Update args with training arguments update_prediction_args(predict_args=args, train_args=train_args, validate_feature_sources=False) args: Union[PredictArgs, TrainArgs] #set explicit H option and reaction option set_explicit_h(train_args.explicit_h) set_reaction(train_args.reaction, train_args.reaction_mode) print('Loading data') if smiles is not None: full_data = get_data_from_smiles( smiles=smiles, skip_invalid_smiles=False, features_generator=args.features_generator) else: full_data = get_data(path=args.test_path, smiles_columns=args.smiles_columns, target_columns=[], ignore_columns=[], skip_invalid_smiles=False, args=args, store_row=True) print('Validating SMILES') full_to_valid_indices = {} valid_index = 0 for full_index in range(len(full_data)): if all(mol is not None for mol in full_data[full_index].mol): full_to_valid_indices[full_index] = valid_index valid_index += 1 test_data = MoleculeDataset( [full_data[i] for i in sorted(full_to_valid_indices.keys())]) # Edge case if empty list of smiles is provided if len(test_data) == 0: return [None] * len(full_data) print(f'Test size = {len(test_data):,}') # Create data loader test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=args.num_workers) # Load model print(f'Encoding smiles into a fingerprint vector from a single model') if len(args.checkpoint_paths) != 1: raise ValueError( "Fingerprint generation only supports one model, cannot use an ensemble" ) model = load_checkpoint(args.checkpoint_paths[0], device=args.device) scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers( args.checkpoint_paths[0]) # Normalize features if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling: test_data.reset_features_and_targets() if args.features_scaling: test_data.normalize_features(features_scaler) if train_args.atom_descriptor_scaling and args.atom_descriptors is not None: test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True) if train_args.bond_feature_scaling and args.bond_features_size > 0: test_data.normalize_features(bond_feature_scaler, scale_bond_features=True) # Make fingerprints model_preds = model_fingerprint(model=model, data_loader=test_data_loader) # Save predictions print(f'Saving predictions to {args.preds_path}') assert len(test_data) == len(model_preds) makedirs(args.preds_path, isfile=True) # Copy predictions over to full_data total_hidden_size = args.hidden_size * args.number_of_molecules for full_index, datapoint in enumerate(full_data): valid_index = full_to_valid_indices.get(full_index, None) preds = model_preds[valid_index] if valid_index is not None else [ 'Invalid SMILES' ] * total_hidden_size fingerprint_columns = [f'fp_{i}' for i in range(total_hidden_size)] for i in range(len(fingerprint_columns)): datapoint.row[fingerprint_columns[i]] = preds[i] # Write predictions with open(args.preds_path, 'w') as f: writer = csv.DictWriter(f, 
fieldnames=args.smiles_columns + fingerprint_columns, extrasaction='ignore') writer.writeheader() for datapoint in full_data: writer.writerow(datapoint.row) return model_preds
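# --- Illustrative sketch (not part of the pipeline above) ---
# molecule_fingerprint relies on the model exposing a "featurizer" mode that returns the latent
# representation instead of the final prediction. A toy stand-alone illustration of that idea;
# TinyRegressor is hypothetical and much smaller than the chemprop MoleculeModel.
import torch
import torch.nn as nn


class TinyRegressor(nn.Module):
    """Toy model with a featurizer flag that switches the forward pass to return the
    last hidden representation (the analogue of the fp_0 ... fp_{hidden-1} columns)."""

    def __init__(self, in_dim: int = 8, hidden: int = 16):
        super().__init__()
        self.hidden_layer = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU())
        self.output_layer = nn.Linear(hidden, 1)
        self.featurizer = False

    def forward(self, x):
        h = self.hidden_layer(x)
        return h if self.featurizer else self.output_layer(h)


model = TinyRegressor()
x = torch.randn(4, 8)
model.featurizer = True
fingerprints = model(x)   # (4, 16) latent vectors instead of (4, 1) predictions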
def pdts(args: TrainArgs, model_idx): """ preliminary experiment with PDTS (approximate BO) we use a data set size of 50k and run until we have trained with 15k data points our batch size is 50 we initialise with 1000 data points """ ######## set up all logging ######## logger = None # make save_dir save_dir = os.path.join(args.save_dir, f'model_{model_idx}') makedirs(save_dir) # make results_dir results_dir = args.results_dir makedirs(results_dir) # initialise wandb #os.environ['WANDB_MODE'] = 'dryrun' wandb.init(name=args.wandb_name + '_' + str(model_idx), project=args.wandb_proj, reinit=True) #print('WANDB directory is:') #print(wandb.run.dir) #################################### ########## get data args.task_names = args.target_columns or get_task_names(args.data_path) data = get_data(path=args.data_path, args=args, logger=logger) args.num_tasks = data.num_tasks() args.features_size = data.features_size() ########## SMILES of top 1% top1p = np.array(MoleculeDataset(data).targets()) top1p_idx = np.argsort(-top1p[:, 0])[:int(args.max_data_size * 0.01)] SMILES = np.array(MoleculeDataset(data).smiles())[top1p_idx] ########## initial data splits args.seed = args.data_seeds[model_idx] data.shuffle(seed=args.seed) sizes = args.split_sizes train_size = int(sizes[0] * len(data)) train_orig = data[:train_size] test_orig = data[train_size:] train_data, test_data = copy.deepcopy( MoleculeDataset(train_orig)), copy.deepcopy(MoleculeDataset(test_orig)) args.train_data_size = len(train_data) ########## standardising # features (train and test) features_scaler = train_data.normalize_features(replace_nan_token=0) test_data.normalize_features(features_scaler) # targets (train) train_targets = train_data.targets() test_targets = test_data.targets() scaler = StandardScaler().fit(train_targets) scaled_targets = scaler.transform(train_targets).tolist() train_data.set_targets(scaled_targets) ########## loss, metric functions loss_func = neg_log_like metric_func = get_metric_func(metric=args.metric) ########## data loaders if len(data) <= args.cache_cutoff: cache = True num_workers = 0 else: cache = False num_workers = args.num_workers train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) ########## instantiating model, optimiser, scheduler (MAP) # set pytorch seed for random initial weights torch.manual_seed(args.pytorch_seeds[model_idx]) # build model print(f'Building model {model_idx}') model = MoleculeModel(args) print(model) print(f'Number of parameters = {param_count(model):,}') if args.cuda: print('Moving model to cuda') model = model.to(args.device) # optimizer optimizer = Adam([{ 'params': model.encoder.parameters() }, { 'params': model.ffn.parameters() }, { 'params': model.log_noise, 'weight_decay': 0 }], lr=args.lr, weight_decay=args.weight_decay) # learning rate scheduler scheduler = scheduler_const([args.lr]) #################################################################### #################################################################### # FIRST THOMPSON ITERATION ### scores array ptds_scores = np.ones(args.pdts_batches + 1) batch_no = 0 ### fill for batch 0 SMILES_train = np.array(train_data.smiles()) SMILES_stack = np.hstack((SMILES, SMILES_train)) overlap = len(SMILES_stack) - len(np.unique(SMILES_stack)) prop = overlap / len(SMILES) 
ptds_scores[batch_no] = prop wandb.log({ "Proportion of top 1%": prop, "batch_no": batch_no }, commit=False) ### train MAP posterior gp_switch = False likelihood = None bbp_switch = None n_iter = 0 for epoch in range(args.epochs_init_map): n_iter = train(model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, bbp_switch=bbp_switch) # save to save_dir #if epoch == args.epochs_init_map - 1: #save_checkpoint(os.path.join(save_dir, f'model_{batch_no}.pt'), model, scaler, features_scaler, args) # if X load from checkpoint path if args.bbp or args.gp or args.swag or args.sgld: model = load_checkpoint(args.checkpoint_path + f'/model_{model_idx}/model_{batch_no}.pt', device=args.device, logger=None) ########## BBP if args.bbp: model_bbp = MoleculeModelBBP( args) # instantiate with bayesian linear layers for (_, param_bbp), (_, param_pre) in zip(model_bbp.named_parameters(), model.named_parameters()): param_bbp.data = copy.deepcopy( param_pre.data.T) # copy over parameters # instantiate rhos for layer in model_bbp.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_bbp, args.rho_max_bbp) for layer in model_bbp.encoder.encoder.children(): if isinstance(layer, BayesLinear): layer.init_rho(args.rho_min_bbp, args.rho_max_bbp) model = model_bbp # name back # move to cuda if args.cuda: print('Moving bbp model to cuda') model = model.to(args.device) # optimiser and scheduler optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) scheduler = scheduler_const([args.lr]) bbp_switch = 2 n_iter = 0 for epoch in range(args.epochs_init): n_iter = train(model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, bbp_switch=bbp_switch) ########## GP if args.gp: # feature_extractor model.featurizer = True feature_extractor = model # inducing points inducing_points = initial_inducing_points(train_data_loader, feature_extractor, args) # GP layer gp_layer = GPLayer(inducing_points, args.num_tasks) # full DKL model model = copy.deepcopy(DKLMoleculeModel(feature_extractor, gp_layer)) # likelihood (rank 0 restricts to diagonal matrix) likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood( num_tasks=12, rank=0) # model and likelihood to CUDA if args.cuda: model.cuda() likelihood.cuda() # loss object loss_func = gpytorch.mlls.VariationalELBO( likelihood, model.gp_layer, num_data=args.train_data_size) # optimiser and scheduler params_list = [ { 'params': model.feature_extractor.parameters(), 'weight_decay': args.weight_decay_gp }, { 'params': model.gp_layer.hyperparameters() }, { 'params': model.gp_layer.variational_parameters() }, { 'params': likelihood.parameters() }, ] optimizer = torch.optim.Adam(params_list, lr=args.lr) scheduler = scheduler_const([args.lr]) gp_switch = True n_iter = 0 for epoch in range(args.epochs_init): n_iter = train(model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, gp_switch=gp_switch, likelihood=likelihood) ########## SWAG if args.swag: model_core = copy.deepcopy(model) model = train_swag_pdts(model_core, train_data_loader, loss_func, scaler, features_scaler, args, save_dir, batch_no) ########## SGLD if args.sgld: model = train_sgld_pdts(model, train_data_loader, loss_func, scaler, features_scaler, args, save_dir, batch_no) ### find top_idx top_idx = [] # need for thom sum_test_preds = np.zeros( (len(test_orig), args.num_tasks)) # 
need for greedy for sample in range(args.samples): # draw model from SWAG posterior if args.swag: model.sample(scale=1.0, cov=args.cov_mat, block=args.block) # retrieve sgld sample if args.sgld: model = load_checkpoint( args.save_dir + f'/model_{model_idx}/model_{batch_no}/model_{sample}.pt', device=args.device, logger=logger) test_preds = predict(model=model, data_loader=test_data_loader, args=args, scaler=scaler, test_data=True, gp_sample=args.thompson, bbp_sample=True) test_preds = np.array(test_preds) # thompson bit rank = 0 # base length if args.sgld: base_length = 5 * sample + 4 else: base_length = sample while args.thompson and (len(top_idx) <= base_length): top_unique_molecule = np.argsort(-test_preds[:, 0])[rank] rank += 1 if top_unique_molecule not in top_idx: top_idx.append(top_unique_molecule) # add to sum_test_preds sum_test_preds += test_preds # print print('done sample ' + str(sample)) # final top_idx if args.thompson: top_idx = np.array(top_idx) else: sum_test_preds /= args.samples top_idx = np.argsort(-sum_test_preds[:, 0])[:50] ### transfer from test to train top_idx = -np.sort(-top_idx) for idx in top_idx: train_orig.append(test_orig.pop(idx)) train_data, test_data = copy.deepcopy( MoleculeDataset(train_orig)), copy.deepcopy(MoleculeDataset(test_orig)) args.train_data_size = len(train_data) if args.gp: loss_func = gpytorch.mlls.VariationalELBO( likelihood, model.gp_layer, num_data=args.train_data_size) print(args.train_data_size) ### standardise features (train and test; using original features_scaler) train_data.normalize_features(features_scaler) test_data.normalize_features(features_scaler) ### standardise targets (train only; using original scaler) train_targets = train_data.targets() scaled_targets_tr = scaler.transform(train_targets).tolist() train_data.set_targets(scaled_targets_tr) ### create data loaders train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) #################################################################### #################################################################### ################################## ########## thompson sampling loop ################################## for batch_no in range(1, args.pdts_batches + 1): ### fill in ptds_scores SMILES_train = np.array(train_data.smiles()) SMILES_stack = np.hstack((SMILES, SMILES_train)) overlap = len(SMILES_stack) - len(np.unique(SMILES_stack)) prop = overlap / len(SMILES) ptds_scores[batch_no] = prop wandb.log({ "Proportion of top 1%": prop, "batch_no": batch_no }, commit=False) ### train posterior n_iter = 0 for epoch in range(args.epochs): n_iter = train(model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, gp_switch=gp_switch, likelihood=likelihood, bbp_switch=bbp_switch) # save to save_dir #if epoch == args.epochs - 1: #save_checkpoint(os.path.join(save_dir, f'model_{batch_no}.pt'), model, scaler, features_scaler, args) # if swag, load checkpoint if args.swag: model_core = load_checkpoint( args.checkpoint_path + f'/model_{model_idx}/model_{batch_no}.pt', device=args.device, logger=None) ########## SWAG if args.swag: model = train_swag_pdts(model_core, train_data_loader, loss_func, scaler, features_scaler, args, save_dir, batch_no) ########## SGLD if 
args.sgld: model = train_sgld_pdts(model, train_data_loader, loss_func, scaler, features_scaler, args, save_dir, batch_no) ### find top_idx top_idx = [] # need for thom sum_test_preds = np.zeros( (len(test_orig), args.num_tasks)) # need for greedy for sample in range(args.samples): # draw model from SWAG posterior if args.swag: model.sample(scale=1.0, cov=args.cov_mat, block=args.block) # retrieve sgld sample if args.sgld: model = load_checkpoint( args.save_dir + f'/model_{model_idx}/model_{batch_no}/model_{sample}.pt', device=args.device, logger=logger) test_preds = predict(model=model, data_loader=test_data_loader, args=args, scaler=scaler, test_data=True, gp_sample=args.thompson, bbp_sample=True) test_preds = np.array(test_preds) # thompson bit rank = 0 # base length if args.sgld: base_length = 5 * sample + 4 else: base_length = sample while args.thompson and (len(top_idx) <= base_length): top_unique_molecule = np.argsort(-test_preds[:, 0])[rank] rank += 1 if top_unique_molecule not in top_idx: top_idx.append(top_unique_molecule) # add to sum_test_preds sum_test_preds += test_preds # print print('done sample ' + str(sample)) # final top_idx if args.thompson: top_idx = np.array(top_idx) else: sum_test_preds /= args.samples top_idx = np.argsort(-sum_test_preds[:, 0])[:50] ### transfer from test to train top_idx = -np.sort(-top_idx) for idx in top_idx: train_orig.append(test_orig.pop(idx)) train_data, test_data = copy.deepcopy( MoleculeDataset(train_orig)), copy.deepcopy( MoleculeDataset(test_orig)) args.train_data_size = len(train_data) if args.gp: loss_func = gpytorch.mlls.VariationalELBO( likelihood, model.gp_layer, num_data=args.train_data_size) print(args.train_data_size) ### standardise features (train and test; using original features_scaler) train_data.normalize_features(features_scaler) test_data.normalize_features(features_scaler) ### standardise targets (train only; using original scaler) train_targets = train_data.targets() scaled_targets_tr = scaler.transform(train_targets).tolist() train_data.set_targets(scaled_targets_tr) ### create data loaders train_data_loader = MoleculeDataLoader( dataset=train_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) # save scores np.savez(os.path.join(results_dir, f'ptds_{model_idx}'), ptds_scores)
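# --- Illustrative sketch (not part of the pipeline above) ---
# A minimal numpy version of the Thompson-sampling acquisition used in pdts: each posterior
# sample contributes its highest-ranked pool molecule that has not been selected yet (the SGLD
# branch adds several per sample). Pool size, number of samples, and predictions are made up.
import numpy as np

rng = np.random.default_rng(0)
num_samples, pool_size = 10, 200
sampled_preds = rng.normal(size=(num_samples, pool_size))   # one posterior sample per row

top_idx = []
for sample in range(num_samples):
    ranking = np.argsort(-sampled_preds[sample])   # best candidates first
    rank = 0
    while len(top_idx) <= sample:                  # add one new molecule per sample
        candidate = ranking[rank]
        rank += 1
        if candidate not in top_idx:
            top_idx.append(candidate)

acquired = np.array(top_idx)   # pool indices to move into the training set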
def make_predictions(args: PredictArgs, smiles: List[str] = None) -> List[Optional[List[float]]]: """ Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data. :param args: Arguments. :param smiles: Smiles to make predictions on. :return: A list of lists of target predictions. """ print('Loading training args') scaler, features_scaler = load_scalers(args.checkpoint_paths[0]) train_args = load_args(args.checkpoint_paths[0]) num_tasks, task_names = train_args.num_tasks, train_args.task_names # If features were used during training, they must be used when predicting if ((train_args.features_path is not None or train_args.features_generator is not None) and args.features_path is None and args.features_generator is None): raise ValueError( 'Features were used during training so they must be specified again during prediction ' 'using the same type of features as before (with either --features_generator or ' '--features_path and using --no_features_scaling if applicable).') # Update predict args with training arguments to create a merged args object for key, value in vars(train_args).items(): if not hasattr(args, key): setattr(args, key, value) args: Union[PredictArgs, TrainArgs] if args.parcel_size and args.max_data_size: num_iterations = math.ceil(args.max_data_size / args.parcel_size) max_data_size = args.parcel_size print('Using parcels: ' + str(num_iterations)) else: num_iterations = 1 max_data_size = args.max_data_size print('Not using parcels.') if args.parcel_offset: offset = args.parcel_offset * args.parcel_size else: offset = 0 for iteration in range(num_iterations): print('Loading data') if smiles is not None: full_data = get_data_from_smiles( smiles=smiles, skip_invalid_smiles=False, features_generator=args.features_generator) else: print("Getting without SMILES") full_data = get_data(path=args.test_path, args=args, target_columns=[], max_data_size=max_data_size, data_offset=offset, skip_invalid_smiles=False) print('Validating SMILES') full_to_valid_indices = {} valid_index = 0 for full_index in range(len(full_data)): if full_data[full_index].mol is not None: full_to_valid_indices[full_index] = valid_index valid_index += 1 test_data = MoleculeDataset( [full_data[i] for i in sorted(full_to_valid_indices.keys())]) # Edge case if empty list of smiles is provided if len(test_data) == 0: return [None] * len(full_data) print(f'Test size = {len(test_data):,}') # Normalize features if args.features_scaling: test_data.normalize_features(features_scaler) # Predict with each model individually and sum predictions if args.dataset_type == 'multiclass': sum_preds = np.zeros( (len(test_data), num_tasks, args.multiclass_num_classes)) else: sum_preds = np.zeros((len(test_data), num_tasks)) # Create data loader test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=args.num_workers) print( f'Predicting with an ensemble of {len(args.checkpoint_paths)} models' ) for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)): # Load model model = load_checkpoint(checkpoint_path, device=args.device) model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler) sum_preds += np.array(model_preds) # Ensemble predictions avg_preds = sum_preds / len(args.checkpoint_paths) avg_preds = avg_preds.tolist() # Save predictions if iteration != 0: name, ext = os.path.splitext(args.preds_path) preds_path = "{name}.{it}.csv".format(name=name, it=iteration) else: preds_path = 
args.preds_path print(f'Saving predictions to {preds_path}') assert len(test_data) == len(avg_preds) makedirs(preds_path, isfile=True) # Get prediction column names if args.dataset_type == 'multiclass': task_names = [ f'{name}_class_{i}' for name in task_names for i in range(args.multiclass_num_classes) ] else: task_names = task_names # Copy predictions over to full_data for full_index, datapoint in enumerate(full_data): valid_index = full_to_valid_indices.get(full_index, None) preds = avg_preds[valid_index] if valid_index is not None else [ 'Invalid SMILES' ] * len(task_names) for pred_name, pred in zip(task_names, preds): datapoint.row[pred_name] = pred with open(preds_path, 'w') as f: writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys()) writer.writeheader() for datapoint in full_data: writer.writerow(datapoint.row) offset = offset + args.parcel_size return avg_preds
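# --- Illustrative sketch (not part of the pipeline above) ---
# A small sketch of the parcel bookkeeping in this variant of make_predictions: the test file is
# scored in chunks of parcel_size rows, each iteration reads rows [offset, offset + parcel_size)
# and writes to a suffixed copy of preds_path. The sizes and file name are illustrative.
import math
import os

max_data_size, parcel_size, parcel_offset = 1_000_000, 250_000, 0
preds_path = 'preds.csv'

num_iterations = math.ceil(max_data_size / parcel_size)
offset = parcel_offset * parcel_size
for iteration in range(num_iterations):
    start, stop = offset, offset + parcel_size
    name, ext = os.path.splitext(preds_path)
    out_path = preds_path if iteration == 0 else f'{name}.{iteration}.csv'
    print(f'rows [{start}, {stop}) -> {out_path}')
    offset += parcel_size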
def run_training(args: TrainArgs, data: MoleculeDataset, logger: Logger = None) -> Dict[str, List[float]]: """ Loads data, trains a Chemprop model, and returns test scores for the model checkpoint with the highest validation score. :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for loading data and training the Chemprop model. :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data. :param logger: A logger to record output. :return: A dictionary mapping each metric in :code:`args.metrics` to a list of values for each task. """ if logger is not None: debug, info = logger.debug, logger.info else: debug = info = print # Set pytorch seed for random initial weights torch.manual_seed(args.pytorch_seed) # Split data debug(f'Splitting data with seed {args.seed}') if args.separate_test_path: test_data = get_data( path=args.separate_test_path, args=args, features_path=args.separate_test_features_path, atom_descriptors_path=args.separate_test_atom_descriptors_path, bond_features_path=args.separate_test_bond_features_path, phase_features_path=args.separate_test_phase_features_path, smiles_columns=args.smiles_columns, logger=logger) if args.separate_val_path: val_data = get_data( path=args.separate_val_path, args=args, features_path=args.separate_val_features_path, atom_descriptors_path=args.separate_val_atom_descriptors_path, bond_features_path=args.separate_val_bond_features_path, phase_features_path=args.separate_val_phase_features_path, smiles_columns=args.smiles_columns, logger=logger) if args.separate_val_path and args.separate_test_path: train_data = data elif args.separate_val_path: train_data, _, test_data = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.0, 0.2), seed=args.seed, num_folds=args.num_folds, args=args, logger=logger) elif args.separate_test_path: train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, num_folds=args.num_folds, args=args, logger=logger) else: train_data, val_data, test_data = split_data( data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, num_folds=args.num_folds, args=args, logger=logger) if args.dataset_type == 'classification': class_sizes = get_class_sizes(data) debug('Class sizes') for i, task_class_sizes in enumerate(class_sizes): debug( f'{args.task_names[i]} ' f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}' ) if args.save_smiles_splits: save_smiles_splits( data_path=args.data_path, save_dir=args.save_dir, task_names=args.task_names, features_path=args.features_path, train_data=train_data, val_data=val_data, test_data=test_data, smiles_columns=args.smiles_columns, logger=logger, ) if args.features_scaling: features_scaler = train_data.normalize_features(replace_nan_token=0) val_data.normalize_features(features_scaler) test_data.normalize_features(features_scaler) else: features_scaler = None if args.atom_descriptor_scaling and args.atom_descriptors is not None: atom_descriptor_scaler = train_data.normalize_features( replace_nan_token=0, scale_atom_descriptors=True) val_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True) test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True) else: atom_descriptor_scaler = None if args.bond_feature_scaling and args.bond_features_size > 0: bond_feature_scaler = train_data.normalize_features( replace_nan_token=0, scale_bond_features=True) val_data.normalize_features(bond_feature_scaler, 
scale_bond_features=True) test_data.normalize_features(bond_feature_scaler, scale_bond_features=True) else: bond_feature_scaler = None args.train_data_size = len(train_data) debug( f'Total size = {len(data):,} | ' f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}' ) # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only) if args.dataset_type == 'regression': debug('Fitting scaler') scaler = train_data.normalize_targets() elif args.dataset_type == 'spectra': debug( 'Normalizing spectra and excluding spectra regions based on phase') args.spectra_phase_mask = load_phase_mask(args.spectra_phase_mask_path) for dataset in [train_data, test_data, val_data]: data_targets = normalize_spectra( spectra=dataset.targets(), phase_features=dataset.phase_features(), phase_mask=args.spectra_phase_mask, excluded_sub_value=None, threshold=args.spectra_target_floor, ) dataset.set_targets(data_targets) scaler = None else: scaler = None # Get loss function loss_func = get_loss_func(args) # Set up test set evaluation test_smiles, test_targets = test_data.smiles(), test_data.targets() if args.dataset_type == 'multiclass': sum_test_preds = np.zeros( (len(test_smiles), args.num_tasks, args.multiclass_num_classes)) else: sum_test_preds = np.zeros((len(test_smiles), args.num_tasks)) # Automatically determine whether to cache if len(data) <= args.cache_cutoff: set_cache_graph(True) num_workers = 0 else: set_cache_graph(False) num_workers = args.num_workers # Create data loaders train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size, num_workers=num_workers, class_balance=args.class_balance, shuffle=True, seed=args.seed) val_data_loader = MoleculeDataLoader(dataset=val_data, batch_size=args.batch_size, num_workers=num_workers) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=num_workers) if args.class_balance: debug( f'With class_balance, effective train size = {train_data_loader.iter_size:,}' ) # Train ensemble of models for model_idx in range(args.ensemble_size): # Tensorboard writer save_dir = os.path.join(args.save_dir, f'model_{model_idx}') makedirs(save_dir) try: writer = SummaryWriter(log_dir=save_dir) except: writer = SummaryWriter(logdir=save_dir) # Load/build model if args.checkpoint_paths is not None: debug( f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}' ) model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger) else: debug(f'Building model {model_idx}') model = MoleculeModel(args) # Optionally, overwrite weights: if args.checkpoint_frzn is not None: debug( f'Loading and freezing parameters from {args.checkpoint_frzn}.' 
) model = load_frzn_model(model=model, path=args.checkpoint_frzn, current_args=args, logger=logger) debug(model) if args.checkpoint_frzn is not None: debug(f'Number of unfrozen parameters = {param_count(model):,}') debug(f'Total number of parameters = {param_count_all(model):,}') else: debug(f'Number of parameters = {param_count_all(model):,}') if args.cuda: debug('Moving model to cuda') model = model.to(args.device) # Ensure that model is saved in correct location for evaluation if 0 epochs save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler, args) # Optimizers optimizer = build_optimizer(model, args) # Learning rate schedulers scheduler = build_lr_scheduler(optimizer, args) # Run training best_score = float('inf') if args.minimize_score else -float('inf') best_epoch, n_iter = 0, 0 for epoch in trange(args.epochs): debug(f'Epoch {epoch}') n_iter = train(model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, logger=logger, writer=writer) if isinstance(scheduler, ExponentialLR): scheduler.step() val_scores = evaluate(model=model, data_loader=val_data_loader, num_tasks=args.num_tasks, metrics=args.metrics, dataset_type=args.dataset_type, scaler=scaler, logger=logger) for metric, scores in val_scores.items(): # Average validation score avg_val_score = np.nanmean(scores) debug(f'Validation {metric} = {avg_val_score:.6f}') writer.add_scalar(f'validation_{metric}', avg_val_score, n_iter) if args.show_individual_scores: # Individual validation scores for task_name, val_score in zip(args.task_names, scores): debug( f'Validation {task_name} {metric} = {val_score:.6f}' ) writer.add_scalar(f'validation_{task_name}_{metric}', val_score, n_iter) # Save model checkpoint if improved validation score avg_val_score = np.nanmean(val_scores[args.metric]) if args.minimize_score and avg_val_score < best_score or \ not args.minimize_score and avg_val_score > best_score: best_score, best_epoch = avg_val_score, epoch save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler, args) # Evaluate on test set using model with best validation score info( f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}' ) model = load_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), device=args.device, logger=logger) test_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler) test_scores = evaluate_predictions(preds=test_preds, targets=test_targets, num_tasks=args.num_tasks, metrics=args.metrics, dataset_type=args.dataset_type, logger=logger) if len(test_preds) != 0: sum_test_preds += np.array(test_preds) # Average test score for metric, scores in test_scores.items(): avg_test_score = np.nanmean(scores) info(f'Model {model_idx} test {metric} = {avg_test_score:.6f}') writer.add_scalar(f'test_{metric}', avg_test_score, 0) if args.show_individual_scores and args.dataset_type != 'spectra': # Individual test scores for task_name, test_score in zip(args.task_names, scores): info( f'Model {model_idx} test {task_name} {metric} = {test_score:.6f}' ) writer.add_scalar(f'test_{task_name}_{metric}', test_score, n_iter) writer.close() # Evaluate ensemble on test set avg_test_preds = (sum_test_preds / args.ensemble_size).tolist() ensemble_scores = evaluate_predictions(preds=avg_test_preds, targets=test_targets, num_tasks=args.num_tasks, metrics=args.metrics, 
dataset_type=args.dataset_type, logger=logger) for metric, scores in ensemble_scores.items(): # Average ensemble score avg_ensemble_test_score = np.nanmean(scores) info(f'Ensemble test {metric} = {avg_ensemble_test_score:.6f}') # Individual ensemble scores if args.show_individual_scores: for task_name, ensemble_score in zip(args.task_names, scores): info( f'Ensemble test {task_name} {metric} = {ensemble_score:.6f}' ) # Save scores with open(os.path.join(args.save_dir, 'test_scores.json'), 'w') as f: json.dump(ensemble_scores, f, indent=4, sort_keys=True) # Optionally save test preds if args.save_preds: test_preds_dataframe = pd.DataFrame( data={'smiles': test_data.smiles()}) for i, task_name in enumerate(args.task_names): test_preds_dataframe[task_name] = [ pred[i] for pred in avg_test_preds ] test_preds_dataframe.to_csv(os.path.join(args.save_dir, 'test_preds.csv'), index=False) return ensemble_scores
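# --- Illustrative sketch (not part of the pipeline above) ---
# A compact version of the checkpoint-selection rule shared by the training loops above: keep the
# epoch with the best average validation score, where "best" means smallest for minimized metrics
# (e.g. RMSE/MAE) and largest otherwise. The per-epoch scores here are made up.
import numpy as np


def is_improvement(score: float, best: float, minimize: bool) -> bool:
    """Return True if `score` improves on `best` under the chosen metric direction."""
    return score < best if minimize else score > best


minimize_score = True                               # e.g. an RMSE-style metric
best_score = float('inf') if minimize_score else -float('inf')
best_epoch = 0
val_scores_per_epoch = [0.91, 0.84, 0.79, 0.81]     # illustrative epoch averages

for epoch, avg_val_score in enumerate(val_scores_per_epoch):
    if is_improvement(avg_val_score, best_score, minimize_score):
        best_score, best_epoch = avg_val_score, epoch
        # save_checkpoint(...) would be called here in the real loop

print(f'best validation score = {best_score:.6f} on epoch {best_epoch}')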
def molecule_fingerprint( args: FingerprintArgs, smiles: List[List[str]] = None) -> List[List[Optional[float]]]: """ Loads data and a trained model and uses the model to encode fingerprint vectors for the data. :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for loading data and a model and making predictions. :param smiles: List of list of SMILES to make predictions on. :return: A list of fingerprint vectors (list of floats) """ print('Loading training args') train_args = load_args(args.checkpoint_paths[0]) # Update args with training arguments if args.fingerprint_type == 'MPN': # only need to supply input features if using FFN latent representation and if model calls for them. validate_feature_sources = False else: validate_feature_sources = True update_prediction_args(predict_args=args, train_args=train_args, validate_feature_sources=validate_feature_sources) args: Union[FingerprintArgs, TrainArgs] #set explicit H option and reaction option reset_featurization_parameters() if args.atom_descriptors == 'feature': set_extra_atom_fdim(train_args.atom_features_size) if args.bond_features_path is not None: set_extra_bond_fdim(train_args.bond_features_size) set_explicit_h(train_args.explicit_h) set_adding_hs(args.adding_h) if train_args.reaction: set_reaction(train_args.reaction, train_args.reaction_mode) elif train_args.reaction_solvent: set_reaction(True, train_args.reaction_mode) print('Loading data') if smiles is not None: full_data = get_data_from_smiles( smiles=smiles, skip_invalid_smiles=False, features_generator=args.features_generator) else: full_data = get_data(path=args.test_path, smiles_columns=args.smiles_columns, target_columns=[], ignore_columns=[], skip_invalid_smiles=False, args=args, store_row=True) print('Validating SMILES') full_to_valid_indices = {} valid_index = 0 for full_index in range(len(full_data)): if all(mol is not None for mol in full_data[full_index].mol): full_to_valid_indices[full_index] = valid_index valid_index += 1 test_data = MoleculeDataset( [full_data[i] for i in sorted(full_to_valid_indices.keys())]) # Edge case if empty list of smiles is provided if len(test_data) == 0: return [None] * len(full_data) print(f'Test size = {len(test_data):,}') # Create data loader test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=args.num_workers) # Set fingerprint size if args.fingerprint_type == 'MPN': if args.atom_descriptors == "descriptor": # special case when we have 'descriptor' extra dimensions need to be added total_fp_size = ( args.hidden_size + test_data.atom_descriptors_size()) * args.number_of_molecules else: if args.reaction_solvent: total_fp_size = args.hidden_size + args.hidden_size_solvent else: total_fp_size = args.hidden_size * args.number_of_molecules if args.features_only: raise ValueError( 'With features_only models, there is no latent MPN representation. Use last_FFN fingerprint type instead.' ) elif args.fingerprint_type == 'last_FFN': if args.ffn_num_layers != 1: total_fp_size = args.ffn_hidden_size else: raise ValueError( 'With a ffn_num_layers of 1, there is no latent FFN representation. Use MPN fingerprint type instead.' ) else: raise ValueError( f'Fingerprint type {args.fingerprint_type} not supported') all_fingerprints = np.zeros( (len(test_data), total_fp_size, len(args.checkpoint_paths))) # Load model print( f'Encoding smiles into a fingerprint vector from {len(args.checkpoint_paths)} models.' 
) for index, checkpoint_path in enumerate( tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))): model = load_checkpoint(checkpoint_path, device=args.device) scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers( args.checkpoint_paths[index]) # Normalize features if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling: test_data.reset_features_and_targets() if args.features_scaling: test_data.normalize_features(features_scaler) if train_args.atom_descriptor_scaling and args.atom_descriptors is not None: test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True) if train_args.bond_feature_scaling and args.bond_features_size > 0: test_data.normalize_features(bond_feature_scaler, scale_bond_features=True) # Make fingerprints model_fp = model_fingerprint(model=model, data_loader=test_data_loader, fingerprint_type=args.fingerprint_type) if args.fingerprint_type == 'MPN' and ( args.features_path is not None or args.features_generator ): # truncate any features from MPN fingerprint model_fp = np.array(model_fp)[:, :total_fp_size] all_fingerprints[:, :, index] = model_fp # Save predictions print(f'Saving predictions to {args.preds_path}') # assert len(test_data) == len(all_fingerprints) #TODO: add unit test for this makedirs(args.preds_path, isfile=True) # Set column names fingerprint_columns = [] if args.fingerprint_type == 'MPN': if len(args.checkpoint_paths) == 1: for j in range(total_fp_size // args.number_of_molecules): for k in range(args.number_of_molecules): fingerprint_columns.append(f'fp_{j}_mol_{k}') else: for j in range(total_fp_size // args.number_of_molecules): for i in range(len(args.checkpoint_paths)): for k in range(args.number_of_molecules): fingerprint_columns.append(f'fp_{j}_mol_{k}_model_{i}') else: # args == 'last_FNN' if len(args.checkpoint_paths) == 1: for j in range(total_fp_size): fingerprint_columns.append(f'fp_{j}') else: for j in range(total_fp_size): for i in range(len(args.checkpoint_paths)): fingerprint_columns.append(f'fp_{j}_model_{i}') # Copy predictions over to full_data for full_index, datapoint in enumerate(full_data): valid_index = full_to_valid_indices.get(full_index, None) preds = all_fingerprints[valid_index].reshape( (len(args.checkpoint_paths) * total_fp_size )) if valid_index is not None else ['Invalid SMILES'] * len( args.checkpoint_paths) * total_fp_size for i in range(len(fingerprint_columns)): datapoint.row[fingerprint_columns[i]] = preds[i] # Write predictions with open(args.preds_path, 'w') as f: writer = csv.DictWriter(f, fieldnames=args.smiles_columns + fingerprint_columns, extrasaction='ignore') writer.writeheader() for datapoint in full_data: writer.writerow(datapoint.row) return all_fingerprints
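# --- Illustrative sketch (not part of the pipeline above) ---
# A small numpy sketch of the multi-checkpoint fingerprint layout above: one
# (num_molecules, fp_size) matrix per model is stacked along a third axis, and each molecule's
# slice is flattened to match the fp_{j}_model_{i} column order. All shapes are illustrative.
import numpy as np

num_molecules, fp_size, num_models = 3, 4, 2
all_fingerprints = np.zeros((num_molecules, fp_size, num_models))

rng = np.random.default_rng(0)
for index in range(num_models):
    model_fp = rng.random((num_molecules, fp_size))   # stand-in for model_fingerprint(...)
    all_fingerprints[:, :, index] = model_fp

# One flat row per molecule, num_models * fp_size values each, column order fp_0_model_0,
# fp_0_model_1, fp_1_model_0, ...
row = all_fingerprints[0].reshape(num_models * fp_size)
columns = [f'fp_{j}_model_{i}' for j in range(fp_size) for i in range(num_models)]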
def run_training_gnn_xgb(args: TrainArgs, logger: Logger = None) -> List[float]: """ Trains a model and returns test scores on the model checkpoint with the highest validation score. :param args: Arguments. :param logger: Logger. :return: A list of ensemble scores for each task. """ if logger is not None: debug, info = logger.debug, logger.info else: debug = info = print # Print command line debug('Command line') debug(f'python {" ".join(sys.argv)}') # Print args debug('Args') debug(args) # Save args args.save(os.path.join(args.save_dir, 'args.json')) # Set pytorch seed for random initial weights torch.manual_seed(args.pytorch_seed) # Get data debug('Loading data') args.task_names = args.target_columns or get_task_names(args.data_path) data = get_data(path=args.data_path, args=args, logger=logger) args.num_tasks = data.num_tasks() args.features_size = data.features_size() debug(f'Number of tasks = {args.num_tasks}') # Split data debug(f'Splitting data with seed {args.seed}') if args.separate_test_path: test_data = get_data(path=args.separate_test_path, args=args, features_path=args.separate_test_features_path, logger=logger) if args.separate_val_path: val_data = get_data(path=args.separate_val_path, args=args, features_path=args.separate_val_features_path, logger=logger) if args.separate_val_path and args.separate_test_path: train_data = data elif args.separate_val_path: train_data, _, test_data = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.0, 0.2), seed=args.seed, args=args, logger=logger) elif args.separate_test_path: train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger) else: train_data, val_data, test_data = split_data( data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger) if args.dataset_type == 'classification': class_sizes = get_class_sizes(data) debug('Class sizes') for i, task_class_sizes in enumerate(class_sizes): debug( f'{args.task_names[i]} ' f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}' ) if args.save_smiles_splits: save_smiles_splits(train_data=train_data, val_data=val_data, test_data=test_data, data_path=args.data_path, save_dir=args.save_dir) if args.features_scaling: features_scaler = train_data.normalize_features(replace_nan_token=0) val_data.normalize_features(features_scaler) test_data.normalize_features(features_scaler) else: features_scaler = None args.train_data_size = len(train_data) debug( f'Total size = {len(data):,} | ' f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}' ) # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only) if args.dataset_type == 'regression': debug('Fitting scaler') train_smiles, train_targets = train_data.smiles(), train_data.targets() scaler = StandardScaler().fit(train_targets) scaled_targets = scaler.transform(train_targets).tolist() train_data.set_targets(scaled_targets) else: scaler = None # Get loss and metric functions loss_func = get_loss_func(args) metric_func = get_metric_func(metric=args.metric) # Set up test set evaluation val_smiles, val_targets = val_data.smiles(), val_data.targets() test_smiles, test_targets = test_data.smiles(), test_data.targets() if args.dataset_type == 'multiclass': sum_test_preds = np.zeros( (len(test_smiles), args.num_tasks, args.multiclass_num_classes)) else: sum_test_preds = 
np.zeros((len(test_smiles), args.num_tasks)) # Automatically determine whether to cache if len(data) <= args.cache_cutoff: cache = True num_workers = 0 else: cache = False num_workers = args.num_workers # Create data loaders train_data_loader = MoleculeDataLoader(dataset=train_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache, class_balance=args.class_balance, shuffle=True, seed=args.seed) val_data_loader = MoleculeDataLoader(dataset=val_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size, num_workers=num_workers, cache=cache) # Train ensemble of models for model_idx in range(args.ensemble_size): # Tensorboard writer save_dir = os.path.join(args.save_dir, f'model_{model_idx}') makedirs(save_dir) try: writer = SummaryWriter(log_dir=save_dir) except: writer = SummaryWriter(logdir=save_dir) # Load/build model if args.checkpoint_paths is not None: debug( f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}' ) model = load_checkpoint(args.checkpoint_paths[model_idx], logger=logger) else: debug(f'Building model {model_idx}') model = MoleculeModel(args) debug(model) debug(f'Number of parameters = {param_count(model):,}') if args.cuda: debug('Moving model to cuda') model = model.to(args.device) # Ensure that model is saved in correct location for evaluation if 0 epochs save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args) # Optimizers optimizer = build_optimizer(model, args) # Learning rate schedulers scheduler = build_lr_scheduler(optimizer, args) # Run training best_score = float('inf') if args.minimize_score else -float('inf') best_epoch, n_iter = 0, 0 for epoch in trange(args.epochs): debug(f'Epoch {epoch}') n_iter = train(model=model, data_loader=train_data_loader, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, logger=logger, writer=writer) if isinstance(scheduler, ExponentialLR): scheduler.step() val_scores = evaluate(model=model, data_loader=val_data_loader, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, scaler=scaler, logger=logger) # Average validation score avg_val_score = np.nanmean(val_scores) debug(f'Validation {args.metric} = {avg_val_score:.6f}') writer.add_scalar(f'validation_{args.metric}', avg_val_score, n_iter) if args.show_individual_scores: # Individual validation scores for task_name, val_score in zip(args.task_names, val_scores): debug( f'Validation {task_name} {args.metric} = {val_score:.6f}' ) writer.add_scalar(f'validation_{task_name}_{args.metric}', val_score, n_iter) # Save model checkpoint if improved validation score if args.minimize_score and avg_val_score < best_score or \ not args.minimize_score and avg_val_score > best_score: best_score, best_epoch = avg_val_score, epoch save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler, features_scaler, args) # Evaluate on test set using model with best validation score info( f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}' ) model = load_checkpoint(os.path.join(save_dir, 'model.pt'), device=args.device, logger=logger) test_preds, _ = predict(model=model, data_loader=test_data_loader, scaler=scaler) test_scores = evaluate_predictions(preds=test_preds, targets=test_targets, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, logger=logger) if len(test_preds) != 0: sum_test_preds += 
np.array(test_preds) # Average test score avg_test_score = np.nanmean(test_scores) info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}') writer.add_scalar(f'test_{args.metric}', avg_test_score, 0) if args.show_individual_scores: # Individual test scores for task_name, test_score in zip(args.task_names, test_scores): info( f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}' ) writer.add_scalar(f'test_{task_name}_{args.metric}', test_score, n_iter) writer.close() # Evaluate ensemble on test set avg_test_preds = (sum_test_preds / args.ensemble_size).tolist() ensemble_scores = evaluate_predictions(preds=avg_test_preds, targets=test_targets, num_tasks=args.num_tasks, metric_func=metric_func, dataset_type=args.dataset_type, logger=logger) # Average ensemble score avg_ensemble_test_score = np.nanmean(ensemble_scores) info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}') # Individual ensemble scores if args.show_individual_scores: for task_name, ensemble_score in zip(args.task_names, ensemble_scores): info( f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}' ) _, train_feature = predict(model=model, data_loader=train_data_loader, scaler=scaler) _, val_feature = predict(model=model, data_loader=val_data_loader, scaler=scaler) _, test_feature = predict(model=model, data_loader=test_data_loader, scaler=scaler) return ensemble_scores, train_feature, val_feature, test_feature, train_targets, val_targets, test_targets
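# --- Illustrative sketch (not part of the pipeline above) ---
# run_training_gnn_xgb returns the penultimate-layer features and targets for a downstream
# gradient-boosting step that is not shown in this file. A hedged sketch of what that step could
# look like with xgboost; the use of XGBRegressor and its hyperparameters are assumptions, and
# random arrays stand in for the returned feature matrices.
import numpy as np
from xgboost import XGBRegressor

rng = np.random.default_rng(0)
train_feature = rng.random((200, 300))   # stand-in for the GNN-derived train features
train_targets = rng.random((200,))
test_feature = rng.random((50, 300))     # stand-in for the GNN-derived test features

xgb = XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.1)
xgb.fit(train_feature, train_targets)
test_preds_xgb = xgb.predict(test_feature)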
def evaluate(model: nn.Module, data_loader: MoleculeDataLoader, num_tasks: int, loss_func: Callable, metric_func: Callable, args: TrainArgs, dataset_type: str, scaler: StandardScaler = None, logger: logging.Logger = None) -> Tuple[List[float], float]: """ Evaluates a model on a dataset and computes its average evaluation loss. :param model: A model. :param data_loader: A MoleculeDataLoader. :param num_tasks: Number of tasks. :param loss_func: Loss function which takes in predictions and targets. :param metric_func: Metric function which takes in a list of targets and a list of predictions. :param args: Arguments (used to check the dataset type). :param dataset_type: Dataset type. :param scaler: A StandardScaler object fit on the training targets. :param logger: Logger. :return: A tuple of the per-task scores based on `metric_func` and the average evaluation loss. """ preds, feature = predict(model=model, data_loader=data_loader, scaler=scaler) targets = data_loader.targets() results = evaluate_predictions(preds=preds, targets=targets, num_tasks=num_tasks, metric_func=metric_func, dataset_type=dataset_type, logger=logger) loss_sum, iter_count = 0, 0 for batch in tqdm(data_loader, total=len(data_loader)): # Prepare batch mol_batch, features_batch, target_batch = batch.batch_graph( ), batch.features(), batch.targets() mask = torch.Tensor([[x is not None for x in tb] for tb in target_batch]) targets = torch.Tensor([[0 if x is None else x for x in tb] for tb in target_batch]) # Run model preds, _ = model(mol_batch, features_batch) # Move tensors to correct device mask = mask.to(preds.device) targets = targets.to(preds.device) class_weights = torch.ones(targets.shape, device=preds.device) if args.dataset_type == 'multiclass': targets = targets.long() loss = torch.cat([ loss_func(preds[:, target_index, :], targets[:, target_index]).unsqueeze(1) for target_index in range(preds.size(1)) ], dim=1) * class_weights * mask else: loss = loss_func(preds, targets) * class_weights * mask loss = loss.sum() / mask.sum() loss_sum += loss.item() iter_count += len(batch) avg_loss = loss_sum / iter_count return results, avg_loss
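# --- Illustrative sketch (not part of the pipeline above) ---
# A minimal torch version of the masked loss computed inside the evaluation loop: missing task
# labels are zero-filled, a 0/1 mask drops them from the sum, and the batch loss is normalised by
# the number of observed targets. The predictions, targets, and choice of MSE are illustrative.
import torch

preds = torch.tensor([[0.2, 1.1], [0.9, 0.4]])
target_batch = [[0.0, None], [1.0, 0.5]]             # None marks a missing task label

mask = torch.tensor([[x is not None for x in tb] for tb in target_batch], dtype=torch.float)
targets = torch.tensor([[0.0 if x is None else x for x in tb] for tb in target_batch])

loss_func = torch.nn.MSELoss(reduction='none')
loss = loss_func(preds, targets) * mask              # zero out missing entries
avg_loss = loss.sum() / mask.sum()                   # average over observed targets only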