def find_similar_mols(test_smiles: List[str],
                      train_smiles: List[str],
                      distance_measure: str,
                      model: MoleculeModel = None,
                      num_neighbors: int = None,
                      batch_size: int = 50) -> List[OrderedDict]:
    """
    For each test molecule, finds the N most similar training molecules according to some distance measure.

    :param test_smiles: A list of test SMILES strings.
    :param train_smiles: A list of train SMILES strings.
    :param distance_measure: The distance measure to use to determine nearest neighbors.
    :param model: A trained MoleculeModel (only needed for distance_measure == 'embedding').
    :param num_neighbors: The number of nearest training molecules to find for each test molecule.
    :param batch_size: Batch size.
    :return: A list of OrderedDicts containing the test smiles, the num_neighbors nearest training smiles,
             and other relevant distance info.
    """
    test_data, train_data = get_data_from_smiles(test_smiles), get_data_from_smiles(train_smiles)
    train_smiles_set = set(train_smiles)

    print(f'Computing {distance_measure} vectors')
    if distance_measure == 'embedding':
        assert model is not None
        test_vecs = np.array(compute_molecule_vectors(model=model, data=test_data, batch_size=batch_size))
        train_vecs = np.array(compute_molecule_vectors(model=model, data=train_data, batch_size=batch_size))
        metric = 'cosine'
    elif distance_measure == 'morgan':
        test_vecs = np.array([morgan_binary_features_generator(smiles)
                              for smiles in tqdm(test_smiles, total=len(test_smiles))])
        train_vecs = np.array([morgan_binary_features_generator(smiles)
                               for smiles in tqdm(train_smiles, total=len(train_smiles))])
        metric = 'jaccard'
    else:
        raise ValueError(f'Distance measure "{distance_measure}" not supported.')

    print('Computing distances')
    distances = cdist(test_vecs, train_vecs, metric=metric)

    print('Finding neighbors')
    neighbors = []
    for test_index, test_smile in enumerate(test_smiles):
        # Find the num_neighbors molecules in the training set which are most similar to the test molecule
        nearest_train_indices = np.argsort(distances[test_index])[:num_neighbors]

        # Build dictionary with distance info
        neighbor = OrderedDict()
        neighbor['test_smiles'] = test_smile
        neighbor['test_in_train'] = test_smile in train_smiles_set

        for i, train_index in enumerate(nearest_train_indices):
            neighbor[f'train_{i + 1}_smiles'] = train_smiles[train_index]
            neighbor[f'train_{i + 1}_{distance_measure}_{metric}_distance'] = distances[test_index][train_index]

        neighbors.append(neighbor)

    return neighbors
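# A minimal usage sketch (an assumption, not from the source): finds the two
# nearest training molecules for each test molecule by Morgan fingerprint /
# Jaccard distance and writes the result to a hypothetical CSV path.
import csv

test = ['CCO', 'c1ccccc1']
train = ['CCN', 'CCCO', 'c1ccncc1']

neighbors = find_similar_mols(test_smiles=test,
                              train_smiles=train,
                              distance_measure='morgan',
                              num_neighbors=2)

with open('neighbors.csv', 'w') as f:  # hypothetical output path
    writer = csv.DictWriter(f, fieldnames=list(neighbors[0].keys()))
    writer.writeheader()
    writer.writerows(neighbors)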
def __call__(self, smiles, batch_size=500):
    test_data = get_data_from_smiles(smiles=smiles,
                                     skip_invalid_smiles=False,
                                     args=self.train_args)
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    sum_preds = []
    for model in self.checkpoints:
        model_preds = predict(model=model,
                              data=test_data,
                              batch_size=batch_size,
                              scaler=self.scaler,
                              disable_progress_bar=True)
        sum_preds.append(np.array(model_preds))

    # Ensemble predictions
    sum_preds = sum(sum_preds)
    avg_preds = sum_preds / len(self.checkpoints)

    return avg_preds
def __call__(self, smiles: List[str], batch_size: int = 500) -> List[List[float]]:
    test_data = get_data_from_smiles(smiles=smiles,
                                     skip_invalid_smiles=False,
                                     features_generator=self.args.features_generator)
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=batch_size)

    sum_preds = []
    for model in self.checkpoints:
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=self.scaler,
                              disable_progress_bar=True)
        sum_preds.append(np.array(model_preds))

    # Ensemble predictions
    sum_preds = sum(sum_preds)
    avg_preds = sum_preds / len(self.checkpoints)

    return avg_preds
def __call__(self, smiles, batch_size=500):
    test_data = get_data_from_smiles(smiles=smiles,
                                     skip_invalid_smiles=False,
                                     args=self.train_args)
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    sum_preds = np.zeros((len(test_data), 1))
    for model in self.checkpoints:
        model_preds = predict(model=model,
                              data=test_data,
                              batch_size=batch_size,
                              scaler=self.scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(self.checkpoints)
    avg_preds = avg_preds.squeeze(-1).tolist()

    # Put zero for invalid smiles
    full_preds = [0.0] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]

    return np.array(full_preds, dtype=np.float32)
def predict_smile(checkpoint_path: str, smile: str):
    """
    Makes a prediction for a single SMILES string using the model saved at checkpoint_path.

    :param checkpoint_path: Path to a trained model checkpoint.
    :param smile: SMILES string to make a prediction on.
    :return: The prediction for the first task, or a list of Nones if the SMILES is invalid.
    """
    smiles = [smile]
    args = Namespace()

    # print('Loading training args')
    scaler, features_scaler = load_scalers(checkpoint_path)
    train_args = load_args(checkpoint_path)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        print('Enter a valid SMILES string')
        return

    # print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with the model and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(model=model, data=test_data, batch_size=1, scaler=scaler)
    sum_preds += np.array(model_preds)

    # Return the prediction for the single molecule's first task
    return sum_preds[0][0]
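# A minimal usage sketch (an assumption, not from the source); the checkpoint
# path is hypothetical and must point to a trained chemprop-style model.
pred = predict_smile(checkpoint_path='checkpoints/model.pt', smile='CCO')
print(f'Predicted value for CCO: {pred}')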
def gcnn_predict(self) -> Tuple[array, array]:
    """
    Handles graph convolutional neural network predictions, enters them into
    the predictions DataFrame and reports any errors.

    Uses self.rdkit_mols, a numpy array of RDKit molecules.

    Returns:
        predictions, labels (Tuple[array, array]): predictions and labels
    """
    smiles = self.rdkit_mols.tolist()
    full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)

    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=50, num_workers=0)

    model_preds = predict(model=rlm_gcnn_model,
                          data_loader=test_data_loader,
                          scaler=rlm_gcnn_scaler)

    # Masked arrays keep the positions of invalid SMILES empty
    predictions = np.ma.empty(len(full_data))
    predictions.mask = True
    labels = np.ma.empty(len(full_data))
    labels.mask = True
    for key in full_to_valid_indices.keys():
        full_index = int(key)
        predictions[full_index] = model_preds[full_to_valid_indices[key]][0]
        labels[full_index] = np.round(model_preds[full_to_valid_indices[key]][0], 0)

    self.predictions_df['GCNN'] = pd.Series(
        pd.Series(labels).fillna('').astype(str) + ' (' +
        pd.Series(predictions).round(2).astype(str) + ')'
    ).str.replace('(nan)', '', regex=False)

    if len(self.predictions_df.index) > len(predictions) or np.ma.count_masked(predictions) > 0:
        self.model_errors.append('graph convolutional neural network')
        self.has_errors = True

    return predictions, labels
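# A standalone sketch (an assumption, not from the source) of the masked-array
# pattern used above: positions for invalid molecules stay masked, pandas turns
# them into NaN, and .fillna('') then renders them as empty strings.
import numpy as np
import pandas as pd

preds = np.ma.empty(3)
preds.mask = True
preds[0] = 0.87  # only index 0 had a valid molecule

print(pd.Series(preds).fillna('').tolist())  # expected: [0.87, '', '']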
def create_crossval_splits():
    assay_file = '../data/assay_matrix_discrete_270_assays.csv'
    smiles = []
    smiles_ = []
    with open(assay_file) as f:
        next(f)  # skip header
        reader = csv.reader(f)
        for row in reader:
            smiles.append([row[0]])
            smiles_.append(row[0])

    data = get_data_from_smiles(smiles)
    all_indices = list(range(len(data)))
    fold_indices = split_indices(all_indices, num_folds=5, data=data)
    array = np.array(fold_indices)
    print(array.shape)
    np.savez('../data/scaffold_based_split_jan22.npz', features=array)
def __call__(self, smiles, batch_size=500):
    test_data = get_data_from_smiles(smiles=[[s] for s in smiles],
                                     skip_invalid_smiles=False,
                                     features_generator=self.features_generator)
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol[0] is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])
    test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=batch_size)

    sum_preds = np.zeros((len(test_data), 1))
    for model, scaler, features_scaler in zip(self.checkpoints, self.scalers, self.features_scalers):
        test_data.reset_features_and_targets()
        if features_scaler is not None:
            test_data.normalize_features(features_scaler)

        model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(self.checkpoints)
    avg_preds = avg_preds.squeeze(-1).tolist()

    # Put zero for invalid smiles
    full_preds = [0.0] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]

    return np.array(full_preds, dtype=np.float32)
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, drug_scaler, cmpd_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].drug_mol is not None]
    full_data = test_data
    test_data = MolPairDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(drug_scaler, cmpd_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler  # TODO: Shouldn't this be the custom scalers if avail?
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
    avg_preds = full_preds
    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = ['drugSMILE', 'cmpdSMILE']
        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = [test_smiles[i][0], test_smiles[i][1]]

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)

            writer.writerow(row)

    return avg_preds
def __call__(self,
             smiles: List[Optional[str]] = None,
             smiles2: List[Optional[str]] = None) -> List[Optional[List[float]]]:
    if self.computed_prop:
        # Identify non-None smiles
        if len(smiles) > 0:
            valid_indices, valid_smiles = zip(*[(i, smile) for i, smile in enumerate(smiles)
                                                if smile is not None])
        else:
            valid_indices, valid_smiles = [], []

        valid_props = [self.scorer(valid_smile) for valid_smile in valid_smiles]

        # Combine properties of non-None smiles with Nones
        props = [None] * len(smiles)
        for i, prop in zip(valid_indices, valid_props):
            props[i] = prop

        return props

    test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.features_generator is not None:
        self.generate_features(test_data)
        valid_sub_indices = [i for i in range(len(test_data))
                             if test_data[i].mol is not None and test_data[i].features is not None]
        # Compose with the first filtering so valid_indices still refers to positions in full_data
        valid_indices = [valid_indices[i] for i in valid_sub_indices]
        test_data = MoleculeDataset([test_data[i] for i in valid_sub_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    # Predict with each model individually and sum predictions
    sum_preds = np.zeros((len(test_data), self.num_tasks))
    for chemprop_model in self.chemprop_models:
        model_preds = predict(model=chemprop_model,
                              data=test_data,
                              batch_size=self.batch_size,
                              scaler=self.scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(self.chemprop_models)
    avg_preds = avg_preds.tolist()

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]

    if self.neg_threshold:
        return [-p[self.prop_index] if p is not None else None for p in full_preds]
    else:
        return [p[self.prop_index] if p is not None else None for p in full_preds]
def make_predictions(args: PredictArgs, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None or train_args.features_generator is not None)
            and args.features_path is None and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path, args=args, target_columns=[], skip_invalid_smiles=False)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, device=args.device)
        model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [f'{name}_class_{i}'
                      for name in task_names
                      for i in range(args.multiclass_num_classes)]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)

        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
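# A minimal usage sketch (an assumption, not from the source). The flags are
# standard chemprop-style PredictArgs options; the checkpoint and output paths
# are hypothetical. Invalid SMILES are written to the CSV as 'Invalid SMILES',
# and the function returns predictions for the valid molecules only.
args = PredictArgs().parse_args([
    '--test_path', 'ignored.csv',      # unused when smiles is passed directly
    '--checkpoint_path', 'model.pt',   # hypothetical trained checkpoint
    '--preds_path', 'preds.csv',
])
preds = make_predictions(args, smiles=['CCO', 'c1ccccc1'])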
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False, args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_ale_uncs = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_epi_uncs = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
        sum_ale_uncs = np.zeros((len(test_data), args.num_tasks))
        sum_epi_uncs = np.zeros((len(test_data), args.num_tasks))

    # Partial results for robust variance calculation
    all_preds = np.zeros((len(test_data), args.num_tasks, len(args.checkpoint_paths)))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for index, checkpoint_path in enumerate(tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds, ale_uncs, epi_uncs = predict(model=model,
                                                  data=test_data,
                                                  batch_size=args.batch_size,
                                                  scaler=scaler,
                                                  sampling_size=args.sampling_size)
        sum_preds += np.array(model_preds)
        if ale_uncs is not None:
            sum_ale_uncs += np.array(ale_uncs)
        if epi_uncs is not None:
            sum_epi_uncs += np.array(epi_uncs)
        if args.estimate_variance:
            all_preds[:, :, index] = model_preds

        print('\nmodel_preds\n', model_preds)
        print('ale_uncs\n', ale_uncs)

    # Ensemble predictions
    if args.estimate_variance:
        # Use ensemble variance to estimate uncertainty. This overwrites existing uncertainty estimates.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- var(preds)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()

        avg_epi_uncs = np.var(all_preds, axis=2)
        avg_epi_uncs = avg_epi_uncs.tolist()
    else:
        # Use another method to estimate uncertainty.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- mean(epi_uncs)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()

        avg_epi_uncs = sum_epi_uncs / len(args.checkpoint_paths)
        avg_epi_uncs = avg_epi_uncs.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    assert len(test_data) == len(avg_ale_uncs)
    assert len(test_data) == len(avg_epi_uncs)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    full_ale_uncs = [None] * len(full_data)
    full_epi_uncs = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
        full_ale_uncs[si] = avg_ale_uncs[i]
        full_epi_uncs[si] = avg_epi_uncs[i]
    avg_preds = full_preds
    avg_ale_uncs = full_ale_uncs
    avg_epi_uncs = full_epi_uncs
    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = []
        if args.use_compound_names:
            header.append('compound_names')
        header.append('smiles')

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)

        header.extend([tn + '_ale_unc' for tn in args.task_names])
        header.extend([tn + '_epi_unc' for tn in args.task_names])
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []
            if args.use_compound_names:
                row.append(compound_names[i])
            row.append(test_smiles[i])

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
                row.extend(avg_ale_uncs[i])
                row.extend(avg_epi_uncs[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    # The prediction, the aleatoric uncertainty and the epistemic uncertainty are all None
                    row.extend([''] * 3 * args.num_tasks)

            writer.writerow(row)

    return avg_preds
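# A standalone numeric sketch (an assumption, not from the source) of the
# ensemble-variance rule used above: the epistemic uncertainty is the variance
# of the per-model predictions taken across the checkpoint axis.
import numpy as np

all_preds = np.random.rand(4, 2, 3)  # 4 molecules, 2 tasks, 3 checkpoints
avg_preds = all_preds.mean(axis=2)   # preds    <- mean(preds)
epi_uncs = all_preds.var(axis=2)     # epi_uncs <- var(preds)
print(avg_preds.shape, epi_uncs.shape)  # (4, 2) (4, 2)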
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: Predictions and the corresponding SMILES batch.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False, args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    # for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
    # Load model (single checkpoint only; the ensemble loop above is commented out)
    model = load_checkpoint(args.checkpoint_path, cuda=args.cuda)
    test_preds, test_smiles_batch = predict(model=model,
                                            data=test_data,
                                            batch_size=args.batch_size,
                                            scaler=scaler)
    return test_preds, test_smiles_batch
def gcnn_predict(self, model, scaler) -> Tuple[array, array]:
    """
    Handles graph convolutional neural network predictions, enters them into
    the predictions DataFrame and reports any errors.

    Parameters:
        model: a trained GCNN model
        scaler: the scaler fitted during training

    Returns:
        predictions, labels (Tuple[array, array]): predictions and labels
    """
    smiles = self.kekule_smiles.tolist()
    full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)

    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Create data loader
    data_loader = MoleculeDataLoader(dataset=data, batch_size=50, num_workers=0)

    model_preds = predict(model=model, data_loader=data_loader, scaler=scaler)

    # Masked arrays keep the positions of invalid SMILES empty
    predictions = np.ma.empty(len(full_data))
    predictions.mask = True
    labels = np.ma.empty(len(full_data), dtype=np.int32)
    labels.mask = True
    for key in full_to_valid_indices.keys():
        full_index = int(key)
        predictions[full_index] = model_preds[full_to_valid_indices[key]][0]
        labels[full_index] = np.round(model_preds[full_to_valid_indices[key]][0], 0)

    if self.smiles is not None:
        dt = datetime.datetime.now(timezone.utc)
        utc_time = dt.replace(tzinfo=timezone.utc)
        utc_timestamp = utc_time.timestamp()
        self.raw_predictions_df = self.raw_predictions_df.append(
            pd.DataFrame({
                'SMILES': self.smiles,
                'model': self.model_name,
                'prediction': predictions,
                'timestamp': utc_timestamp
            }),
            ignore_index=True
        )

    # if self.interpret == True:
    #     intrprt_df = get_interpretation(self.smiles, self.model_name)
    # else:
    #     col_names = ['smiles', 'rationale_smiles', 'rationale_score']
    #     intrprt_df = pd.DataFrame(columns=col_names)
    # self.predictions_df['smiles'] = pd.Series(np.where(intrprt_df['rationale_scores'] > 0,
    #     intrprt_df['smiles'] + '_' + intrprt_df['rationale_smiles'], intrprt_df['smiles']))

    self.predictions_df[self.column_dict_key] = pd.Series(
        pd.Series(labels).fillna('').astype(str) + ' (' +
        pd.Series(np.where(predictions >= 0.5, predictions, (1 - predictions))).round(2).astype(str) + ')'
    ).str.replace('(nan)', '', regex=False)

    if len(self.predictions_df.index) > len(predictions) or np.ma.count_masked(predictions) > 0:
        self.model_errors.append('graph convolutional neural network')
        self.has_errors = True

    return predictions, labels
def make_predictions(args: Namespace,
                     smiles: List[str] = None,
                     invalid_smiles_warning: str = None) -> List[List[float]]:
    """Makes predictions."""
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    if invalid_smiles_warning is not None:
        success_indices = []
        for i, s in enumerate(smiles):
            mol = Chem.MolFromSmiles(s)
            if mol is not None:
                success_indices.append(i)
        full_smiles = smiles
        smiles = [smiles[i] for i in success_indices]

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles)
    else:
        test_data = get_data(args.test_path, args, use_compound_names=args.compound_names)
    test_smiles = test_data.smiles()

    if args.compound_names:
        compound_names = test_data.compound_names()

    print('Test size = {:,}'.format(len(test_data)))

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    sum_preds = np.zeros((len(test_data), args.num_tasks))
    print('Predicting with an ensemble of {} models'.format(len(args.checkpoint_paths)))
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model, data=test_data, args=args, scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / args.ensemble_size
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print('Saving predictions to {}'.format(args.preds_path))

    with open(args.preds_path, 'w') as f:
        if args.write_smiles:
            f.write('smiles,')
        if args.compound_names:
            f.write('compound_name,')
        f.write(','.join(args.task_names) + '\n')

        for i in range(len(avg_preds)):
            if args.write_smiles:
                f.write(test_smiles[i] + ',')
            if args.compound_names:
                f.write(compound_names[i] + ',')
            f.write(','.join(str(p) for p in avg_preds[i]) + '\n')

    if invalid_smiles_warning is not None:
        full_preds = [[invalid_smiles_warning] for _ in range(len(full_smiles))]
        for i, si in enumerate(success_indices):
            full_preds[si] = avg_preds[i]
        return full_preds

    return avg_preds
with open(assay_train_file) as f:
    next(f)  # skip header
    reader = csv.reader(f)
    for row in reader:
        smiles.append([row[0]])
        smiles_.append(row[0])

with open(assay_test_file) as f:
    next(f)  # skip header
    reader = csv.reader(f)
    for row in reader:
        smiles.append([row[0]])
        smiles_.append(row[0])

fingerprints = []
data = get_data_from_smiles(smiles)
data = data.mols(flatten=True)
for i in range(len(data)):
    mf = morgan_binary_features_generator(data[i])
    fingerprints.append(mf)

fingerprints = np.array(fingerprints)
# print(fingerprints.shape)
np.savez('fingerprints.npz', features=fingerprints)

fps = []
for i in range(len(data)):
    fps.append(Chem.RDKFingerprint(data[i]))

similarity = np.zeros([16978, 16978])  # total number of molecules across both files
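# The snippet above allocates the similarity matrix but stops before filling
# it. A minimal sketch (an assumption, not from the source) of the likely next
# step: pairwise Tanimoto similarity over the RDKit fingerprints computed above.
from rdkit import DataStructs

for i in range(len(fps)):
    for j in range(i, len(fps)):
        sim = DataStructs.FingerprintSimilarity(fps[i], fps[j])  # Tanimoto by default
        similarity[i, j] = sim
        similarity[j, i] = sim  # the matrix is symmetric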