def find_similar_mols_from_file(test_path: str, train_path: str, distance_measure: str, checkpoint_path: str = None, num_neighbors: int = -1, batch_size: int = 50) -> List[OrderedDict]:
    """Finds, for each test molecule, the most similar training molecules under a distance measure.

    Molecules are loaded from CSV files and the model (if any) from a checkpoint file.

    :param test_path: Path to a CSV file containing test SMILES.
    :param train_path: Path to a CSV file containing train SMILES.
    :param distance_measure: The distance measure used to determine nearest neighbors.
    :param checkpoint_path: Path to a .pt model checkpoint file (only needed for distance_measure == 'embedding').
    :param num_neighbors: The number of nearest training molecules to find for each test molecule.
    :param batch_size: Batch size.
    :return: A list of OrderedDicts containing the test smiles, the num_neighbors nearest
             training smiles, and other relevant distance info.
    """
    print('Loading data')
    test_smiles = get_smiles(test_path, flatten=True)
    train_smiles = get_smiles(train_path, flatten=True)

    # A model is only required for the embedding-based distance measure.
    model = None
    if checkpoint_path is not None:
        print('Loading model')
        model = load_checkpoint(checkpoint_path)

    return find_similar_mols(
        test_smiles=test_smiles,
        train_smiles=train_smiles,
        distance_measure=distance_measure,
        model=model,
        num_neighbors=num_neighbors,
        batch_size=batch_size,
    )
def run_split_data(args: Args):
    """Splits a raw CSV dataset into train/val/test CSV files, preserving the original rows."""
    # Read the raw CSV: header separate from the data rows.
    with open(args.data_path) as f:
        reader = csv.reader(f)
        header = next(reader)
        lines = list(reader)

    # Read the SMILES corresponding to each row.
    smiles = get_smiles(path=args.data_path, smiles_columns=args.smiles_column)

    # Sanity checks: rows and SMILES must correspond one-to-one.
    assert len(lines) == len(smiles)
    assert all(all(s in line for s in smile) for smile, line in zip(smiles, lines))

    # Wrap each row in a MoleculeDatapoint, keeping the raw CSV line attached
    # so it can be written back out verbatim after splitting.
    data = []
    for smile, line in tqdm(zip(smiles, lines), total=len(smiles)):
        datapoint = MoleculeDatapoint(smiles=smile)
        datapoint.line = line
        data.append(datapoint)
    data = MoleculeDataset(data)

    # Split the data according to the requested split type and sizes.
    train, val, test = split_data(data=data,
                                  split_type=args.split_type,
                                  sizes=args.split_sizes,
                                  seed=args.seed)

    # Write each split as a CSV with the original header and raw rows.
    makedirs(args.save_dir)
    splits = {'train': train, 'val': val, 'test': test}
    for name, dataset in splits.items():
        with open(os.path.join(args.save_dir, f'{name}.csv'), 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(datapoint.line for datapoint in dataset)
def test_flatten(self):
    """Flattened output should yield a single flat list of SMILES."""
    result = get_smiles(path=self.smiles_path, flatten=True)
    self.assertEqual(result, ['C', 'CC', 'CC', 'CN', 'O', 'CO'])
def overlap(args: Args):
    """Computes the overlap of SMILES between two datasets and prints summary statistics.

    Optionally saves the rows of dataset 1 in the intersection and/or the rows of
    dataset 1 NOT in the intersection to CSV files.

    :param args: Arguments with the two data paths/columns and optional save paths.
    """
    smiles_1 = get_smiles(path=args.data_path_1, smiles_columns=args.smiles_column_1, flatten=True)
    smiles_2 = get_smiles(path=args.data_path_2, smiles_columns=args.smiles_column_2, flatten=True)

    smiles_1, smiles_2 = set(smiles_1), set(smiles_2)
    size_1, size_2 = len(smiles_1), len(smiles_2)
    intersection = smiles_1.intersection(smiles_2)
    size_intersect = len(intersection)

    print(f'Size of dataset 1: {size_1}')
    print(f'Size of dataset 2: {size_2}')
    print(f'Size of intersection: {size_intersect}')
    print(f'Size of intersection as frac of dataset 1: {size_intersect / size_1}')
    print(f'Size of intersection as frac of dataset 2: {size_intersect / size_2}')

    def _save_filtered_rows(save_path: str, keep) -> None:
        # Copies rows of dataset 1 (header included) whose first-column SMILES
        # passes the `keep` predicate.
        with open(args.data_path_1, 'r') as rf, open(save_path, 'w') as wf:
            reader, writer = csv.reader(rf), csv.writer(wf)
            header = next(reader)
            writer.writerow(header)
            for line in reader:
                if keep(line[0]):
                    writer.writerow(line)

    if args.save_intersection_path is not None:
        _save_filtered_rows(args.save_intersection_path, lambda s: s in intersection)

    # Bug fix: the original iterated `reader()` here — calling the csv reader
    # object raises TypeError; the reader must be iterated directly.
    if args.save_difference_path is not None:
        _save_filtered_rows(args.save_difference_path, lambda s: s not in intersection)
def interpret(args: InterpretArgs) -> None:
    """
    Runs interpretation of a Chemprop model using the Monte Carlo Tree Search algorithm.

    :param args: A :class:`~chemprop.args.InterpretArgs` object containing arguments for interpretation.
    """
    # Multi-molecule models are not supported by the interpretation code below,
    # which indexes the single SMILES via smiles[0].
    if args.number_of_molecules != 1:
        raise ValueError("Interpreting is currently only available for single-molecule models.")

    # C_PUCT and MIN_ATOMS are module-level globals consumed by the MCTS code;
    # they are set from args below before mcts() is called.
    global C_PUCT, MIN_ATOMS

    chemprop_model = ChempropModel(args)

    def scoring_function(smiles: List[str]) -> List[float]:
        # Scores a batch of molecules, selecting the property column of interest
        # (property_id is 1-indexed; the predictions array is 0-indexed).
        return chemprop_model(smiles)[:, args.property_id - 1]

    C_PUCT = args.c_puct
    MIN_ATOMS = args.min_atoms

    all_smiles = get_smiles(path=args.data_path, smiles_columns=args.smiles_columns)
    header = get_header(path=args.data_path)

    # Falls back to a generic column name if the header has no column at property_id.
    property_name = header[args.property_id] if len(header) > args.property_id else 'score'

    # Output is CSV written to stdout, one row per input molecule.
    print(f'smiles,{property_name},rationale,rationale_score')

    for smiles in all_smiles:
        # NOTE(review): `smiles` is a list (one entry per molecule column), so
        # scoring_function receives a list-of-lists and the f-strings below print
        # the list repr — presumably intentional for single-molecule inputs; verify.
        score = scoring_function([smiles])[0]

        # Only molecules scoring above prop_delta are worth searching for rationales.
        if score > args.prop_delta:
            rationales = mcts(smiles=smiles[0],
                              scoring_function=scoring_function,
                              n_rollout=args.rollout,
                              max_atoms=args.max_atoms,
                              prop_delta=args.prop_delta)
        else:
            rationales = []

        if len(rationales) == 0:
            print(f'{smiles},{score:.3f},,')
        else:
            # Among the smallest rationales (fewest atoms), report the one with
            # the highest estimated property score P.
            min_size = min(len(x.atoms) for x in rationales)
            min_rationales = [x for x in rationales if len(x.atoms) == min_size]
            rats = sorted(min_rationales, key=lambda x: x.P, reverse=True)
            print(f'{smiles},{score:.3f},{rats[0].smiles},{rats[0].P:.3f}')
def predict():
    """Renders the predict page and makes predictions if the method is POST.

    SMILES are taken (in priority order) from the text box, the drawing widget,
    or an uploaded CSV file. Predictions are produced by the checkpoint ensemble
    selected in the form.
    """
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles: if the first header cell parses as a molecule,
        # the file has no header row and the cell is treated as data.
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    # All checkpoints registered under this id form the prediction ensemble.
    models = db.get_models(ckpt_id)
    model_paths = [os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt') for model in models]

    # Task names / training args are assumed identical across the ensemble,
    # so they are read from the first checkpoint only.
    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')
    train_args = load_args(model_paths[0])

    # Build arguments (CLI-style argument list for PredictArgs;
    # test_path is 'None' because SMILES are passed in-memory below).
    arguments = [
        '--test_path', 'None',
        '--preds_path', os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME']),
        '--checkpoint_paths', *model_paths
    ]

    if gpu is not None:
        if gpu == 'None':
            arguments.append('--no_cuda')
        else:
            arguments += ['--gpu', gpu]

    # Handle additional features: mirror whatever feature setup was used at train time.
    if train_args.features_path is not None:
        # TODO: make it possible to specify the features generator if trained using features_path
        arguments += ['--features_generator', 'rdkit_2d_normalized', '--no_features_scaling']
    elif train_args.features_generator is not None:
        arguments += ['--features_generator', *train_args.features_generator]

        if not train_args.features_scaling:
            arguments.append('--no_features_scaling')

    # Parse arguments
    args = PredictArgs().parse_args(arguments)

    # Run predictions
    preds = make_predictions(args=args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Replace invalid smiles with message
    invalid_smiles_warning = 'Invalid SMILES String'
    preds = [pred if pred is not None else [invalid_smiles_warning] * num_tasks for pred in preds]

    return render_predict(predicted=True,
                          smiles=smiles,
                          num_smiles=min(10, len(smiles)),
                          show_more=max(0, len(smiles) - 10),
                          task_names=task_names,
                          num_tasks=len(task_names),
                          preds=preds,
                          warnings=["List contains invalid SMILES strings"] if None in preds else None,
                          errors=["No SMILES strings given"] if len(preds) == 0 else None)
def compare_datasets_tsne(args: Args):
    """Plots a joint t-SNE embedding (Morgan fingerprints, Jaccard metric) of several
    SMILES datasets, either as colored points per dataset or as clustered points.
    """
    # Each dataset needs its own color and marker size for the plot.
    if len(args.smiles_paths) > len(args.colors) or len(args.smiles_paths) > len(args.sizes):
        raise ValueError('Must have at least as many colors and sizes as datasets')

    # Random seed for random subsampling
    np.random.seed(0)

    # Load the smiles datasets
    print('Loading data')
    smiles, slices, labels = [], [], []
    for smiles_path in args.smiles_paths:
        # Get label
        label = os.path.basename(smiles_path).replace('.csv', '')

        # Get SMILES
        new_smiles = get_smiles(path=smiles_path, smiles_columns=args.smiles_column, flatten=True)
        print(f'{label}: {len(new_smiles):,}')

        # Subsample if dataset is too large
        if len(new_smiles) > args.max_per_dataset:
            print(f'Subsampling to {args.max_per_dataset:,} molecules')
            new_smiles = np.random.choice(new_smiles, size=args.max_per_dataset, replace=False).tolist()

        # Record which slice of the concatenated list belongs to this dataset.
        slices.append(slice(len(smiles), len(smiles) + len(new_smiles)))
        labels.append(label)
        smiles += new_smiles

    # Compute Morgan fingerprints
    print('Computing Morgan fingerprints')
    morgan_generator = get_features_generator('morgan')
    morgans = [morgan_generator(smile) for smile in tqdm(smiles, total=len(smiles))]

    print('Running t-SNE')
    start = time.time()
    # Jaccard metric matches the binary nature of Morgan fingerprints.
    tsne = TSNE(n_components=2, init='pca', random_state=0, metric='jaccard')
    X = tsne.fit_transform(morgans)
    print(f'time = {time.time() - start:.2f} seconds')

    if args.cluster:
        import hdbscan  # pip install hdbscan
        print('Running HDBSCAN')
        start = time.time()
        clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
        # Cluster labels double as point colors in the scatter plot below.
        colors = clusterer.fit_predict(X)
        print(f'time = {time.time() - start:.2f} seconds')

    print('Plotting t-SNE')
    # Normalize embedding coordinates to [0, 1] per axis.
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)

    makedirs(args.save_path, isfile=True)

    plt.clf()
    fontsize = 50 * args.scale
    fig = plt.figure(figsize=(64 * args.scale, 48 * args.scale))
    plt.title('t-SNE using Morgan fingerprint with Jaccard similarity', fontsize=2 * fontsize)
    ax = fig.gca()
    handles = []
    legend_kwargs = dict(loc='upper right', fontsize=fontsize)

    if args.cluster:
        plt.scatter(X[:, 0], X[:, 1], s=150 * np.mean(args.sizes), c=colors, cmap='nipy_spectral')
    else:
        for slc, color, label, size in zip(slices, args.colors, labels, args.sizes):
            if args.plot_molecules:
                # Plots molecules: draw each molecule image at its embedding location.
                handles.append(mpatches.Patch(color=color, label=label))

                for smile, (x, y) in zip(smiles[slc], X[slc]):
                    img = Draw.MolsToGridImage([Chem.MolFromSmiles(smile)], molsPerRow=1, subImgSize=(200, 200))
                    imagebox = offsetbox.AnnotationBbox(offsetbox.OffsetImage(img), (x, y), bboxprops=dict(color=color))
                    ax.add_artist(imagebox)
            else:
                # Plots points
                plt.scatter(X[slc, 0], X[slc, 1], s=150 * size, color=color, label=label)

    if args.plot_molecules:
        # Molecule images don't create legend entries automatically; use the patches.
        legend_kwargs['handles'] = handles

    plt.legend(**legend_kwargs)
    plt.xticks([]), plt.yticks([])

    print('Saving t-SNE')
    plt.savefig(args.save_path)
def test_noheader_2mol(self):
    """Reading a headerless file with two molecule columns."""
    result = get_smiles(path=self.no_header_path, number_of_molecules=2, header=False)
    self.assertEqual(result, [['C', 'CC'], ['CC', 'CN'], ['O', 'CO']])
def test_noheader_1mol(self):
    """Reading a headerless file with the default single molecule column."""
    result = get_smiles(path=self.no_header_path, header=False)
    self.assertEqual(result, [['C'], ['CC'], ['O']])
def test_specified_columns_changed_order(self):
    """Testing with smiles columns specified in reversed order.

    Fix: the previous docstring ("Testing with no optional arguments.") was
    copy-pasted from the default-inputs test and did not describe this case.
    """
    smiles = get_smiles(path=self.smiles_path, smiles_columns=['column1', 'column0'])
    self.assertEqual(smiles, [['CC', 'C'], ['CN', 'CC'], ['CO', 'O']])
def test_specified_column_inputs(self):
    """Selecting a single named smiles column."""
    result = get_smiles(path=self.smiles_path, smiles_columns=['column1'])
    self.assertEqual(result, [['CC'], ['CN'], ['CO']])
def test_default_inputs(self):
    """Testing with no optional arguments."""
    result = get_smiles(path=self.smiles_path)
    self.assertEqual(result, [['C', 'CC'], ['CC', 'CN'], ['O', 'CO']])
    # Summary statistics over the pairwise Dice similarities.
    # NOTE(review): `similarities` is defined earlier in the enclosing function,
    # outside this excerpt — confirm it is a sequence of floats.
    print(f'Average dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}')
    print(f'Minimum dice similarity = {np.min(similarities):.4f}')
    print(f'Maximum dice similarity = {np.max(similarities):.4f}')
    print()
    print('Percentiles for dice similarity')
    # Deciles from 0% through 100%.
    print(' | '.join([f'{i}% = {np.percentile(similarities, i):.4f}' for i in range(0, 101, 10)]))


if __name__ == '__main__':
    args = Args().parse_args()

    # Load both datasets as flat lists of SMILES.
    smiles_1 = get_smiles(path=args.data_path_1, smiles_columns=args.smiles_column_1, flatten=True)
    smiles_2 = get_smiles(path=args.data_path_2, smiles_columns=args.smiles_column_2, flatten=True)

    # Dispatch to the requested similarity measure.
    if args.similarity_measure == 'scaffold':
        scaffold_similarity(smiles_1, smiles_2)
    elif args.similarity_measure == 'morgan':
        morgan_similarity(smiles_1, smiles_2, args.radius, args.sample_rate)
    else:
        raise ValueError(f'Similarity measure "{args.similarity_measure}" not supported.')
def generate_and_save_features(args: Args):
    """
    Computes and saves features for a dataset of molecules as a 2D array in a .npz file.

    Features are checkpointed to a temporary directory every ``args.save_frequency``
    molecules so an interrupted run can resume (unless ``args.restart`` is set).

    :param args: Arguments.
    """
    # Create directory for save_path
    makedirs(args.save_path, isfile=True)

    # Get data and features function.
    # Bug fix: get_smiles takes `smiles_columns` (plural), as at every other call
    # site in this codebase; also flatten so each element is a single SMILES string,
    # which is what the features generator is applied to below.
    smiles = get_smiles(path=args.data_path, smiles_columns=args.smiles_column, flatten=True)
    features_generator = get_features_generator(args.features_generator)
    temp_save_dir = args.save_path + '_temp'

    # Load partially complete data
    if args.restart:
        # Restarting: discard any previous output and checkpoints.
        if os.path.exists(args.save_path):
            os.remove(args.save_path)
        if os.path.exists(temp_save_dir):
            shutil.rmtree(temp_save_dir)
    else:
        if os.path.exists(args.save_path):
            raise ValueError(f'"{args.save_path}" already exists and args.restart is False.')

        # Resume from previously checkpointed features, if any.
        if os.path.exists(temp_save_dir):
            features, temp_num = load_temp(temp_save_dir)

    if not os.path.exists(temp_save_dir):
        makedirs(temp_save_dir)
        features, temp_num = [], 0

    # Build features map function
    smiles = smiles[len(features):]  # restrict to data for which features have not been computed yet

    if args.sequential:
        features_map = map(features_generator, smiles)
    else:
        features_map = Pool().imap(features_generator, smiles)

    # Get features
    temp_features = []
    for i, feats in tqdm(enumerate(features_map), total=len(smiles)):
        temp_features.append(feats)

        # Save temporary features every save_frequency (and on the final molecule).
        if (i > 0 and (i + 1) % args.save_frequency == 0) or i == len(smiles) - 1:
            save_features(os.path.join(temp_save_dir, f'{temp_num}.npz'), temp_features)
            features.extend(temp_features)
            temp_features = []
            temp_num += 1

    try:
        # Save all features
        save_features(args.save_path, features)

        # Remove temporary features
        shutil.rmtree(temp_save_dir)
    except OverflowError:
        print('Features array is too large to save as a single file. Instead keeping features as a directory of files.')