def find_similar_mols(test_smiles: List[str],
                      train_smiles: List[str],
                      distance_measure: str,
                      model: 'MoleculeModel' = None,
                      num_neighbors: int = None,
                      batch_size: int = 50) -> List[OrderedDict]:
    """
    For each test molecule, finds the N most similar training molecules according to some distance measure.

    :param test_smiles: A list of test SMILES strings.
    :param train_smiles: A list of train SMILES strings.
    :param distance_measure: The distance measure to use to determine nearest neighbors.
                             'embedding' uses the cosine distance between the vectors returned by
                             compute_molecule_vectors; 'morgan' uses the Jaccard distance between the
                             vectors returned by morgan_binary_features_generator.
    :param model: A trained MoleculeModel (only needed for distance_measure == 'embedding').
    :param num_neighbors: The number of nearest training molecules to find for each test molecule.
                          If None, every training molecule is reported, ordered by increasing distance.
    :param batch_size: Batch size passed to compute_molecule_vectors (only used for 'embedding').
    :return: A list of OrderedDicts (one per test molecule) containing the test smiles, whether it also
             appears in train_smiles, the num_neighbors nearest training smiles, and their distances.
    :raises ValueError: If distance_measure is not supported, or if distance_measure == 'embedding'
                        and no model is provided.
    """
    # Validate the arguments up front so that a bad call fails before any featurization work is done.
    if distance_measure not in ('embedding', 'morgan'):
        raise ValueError(f'Distance measure "{distance_measure}" not supported.')
    if distance_measure == 'embedding' and model is None:
        # Raised explicitly (rather than asserted) so the check survives `python -O`.
        raise ValueError('A trained model is required when distance_measure == "embedding".')

    # Without test molecules there is nothing to report (and cdist rejects empty 1-D input).
    if not test_smiles:
        return []

    train_smiles_set = set(train_smiles)

    print(f'Computing {distance_measure} vectors')
    if distance_measure == 'embedding':
        # The datasets are only needed by the model, so they are built on this path only.
        # NOTE(review): featurize_file in this file wraps every SMILES in its own one-element list
        # before calling get_data_from_smiles, whereas flat lists of strings are passed here.
        # Confirm which input shape get_data_from_smiles expects.
        test_data = get_data_from_smiles(test_smiles)
        train_data = get_data_from_smiles(train_smiles)
        test_vecs = np.array(compute_molecule_vectors(model=model, data=test_data, batch_size=batch_size))
        train_vecs = np.array(compute_molecule_vectors(model=model, data=train_data, batch_size=batch_size))
        metric = 'cosine'
    else:
        test_vecs = np.array([morgan_binary_features_generator(smiles)
                              for smiles in tqdm(test_smiles, total=len(test_smiles))])
        train_vecs = np.array([morgan_binary_features_generator(smiles)
                               for smiles in tqdm(train_smiles, total=len(train_smiles))])
        metric = 'jaccard'

    print('Computing distances')
    # distances[i][j] is the distance between test vector i and train vector j.
    distances = cdist(test_vecs, train_vecs, metric=metric)

    print('Finding neighbors')
    neighbors = []
    for test_index, test_smile in enumerate(test_smiles):
        test_distances = distances[test_index]

        # Find the num_neighbors molecules in the training set which are most similar to the test molecule
        # (slicing with num_neighbors=None keeps all of them).
        nearest_train_indices = np.argsort(test_distances)[:num_neighbors]

        # Build dictionary with distance info
        neighbor = OrderedDict()
        neighbor['test_smiles'] = test_smile
        neighbor['test_in_train'] = test_smile in train_smiles_set

        # Neighbors are numbered from 1, closest first.
        for rank, train_index in enumerate(nearest_train_indices, start=1):
            neighbor[f'train_{rank}_smiles'] = train_smiles[train_index]
            neighbor[f'train_{rank}_{distance_measure}_{metric}_distance'] = test_distances[train_index]

        neighbors.append(neighbor)

    return neighbors
def featurize_file(input_df, output_path, pretrained_model):
    """
    Computes molecule vectors for the SMILES in a DataFrame and appends them to a CSV file.

    :param input_df: A DataFrame whose first column is read as the list of SMILES strings.
    :param output_path: Path of the CSV file to append to. The header row is written only when
                        no file exists at this path yet.
    :param pretrained_model: The model passed to compute_molecule_vectors.
    """
    first_column = input_df.columns[0]
    smiles_list = input_df[first_column].tolist()
    print(len(smiles_list))

    # Each SMILES string is wrapped in its own one-element list before building the dataset.
    wrapped_smiles = [[smiles] for smiles in smiles_list]
    data = get_data_from_smiles(smiles=wrapped_smiles)

    print("Starting molecule vector computation...")
    descriptors = compute_molecule_vectors(model=pretrained_model, data=data, batch_size=64)
    print("Computation finished, saving result...")

    output_df = pd.DataFrame({'smiles': smiles_list, 'descriptors': descriptors})

    # Append mode: only emit the column names when the file is being created by this call.
    write_header = not os.path.exists(output_path)
    output_df.to_csv(output_path, mode='a+', header=write_header, encoding="ascii", index=False)