Example #1
def find_similar_mols_from_file(test_path: str,
                                train_path: str,
                                distance_measure: str,
                                checkpoint_path: str = None,
                                num_neighbors: int = -1,
                                batch_size: int = 50) -> List[OrderedDict]:
    """
    For each test molecule, finds the N most similar training molecules according to some distance measure.
    Loads molecules and model from file.

    :param test_path: Path to a CSV file containing test SMILES.
    :param train_path: Path to a CSV file containing train SMILES.
    :param distance_measure: The distance measure to use to determine nearest neighbors.
    :param checkpoint_path: Path to a .pt model checkpoint file (only needed when distance_measure == 'embedding').
    :param num_neighbors: The number of nearest training molecules to find for each test molecule.
    :param batch_size: Batch size.
    :return: A list of OrderedDicts containing the test SMILES, the num_neighbors nearest training SMILES,
             and other relevant distance info.
    """
    print('Loading data')
    test_smiles = get_smiles(test_path, flatten=True)
    train_smiles = get_smiles(train_path, flatten=True)

    if checkpoint_path is not None:
        print('Loading model')
        model = load_checkpoint(checkpoint_path)
    else:
        model = None

    return find_similar_mols(test_smiles=test_smiles,
                             train_smiles=train_smiles,
                             distance_measure=distance_measure,
                             model=model,
                             num_neighbors=num_neighbors,
                             batch_size=batch_size)
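
A minimal usage sketch of the function above (file names are hypothetical; per the docstring, the checkpoint is only needed when distance_measure == 'embedding'):

neighbors = find_similar_mols_from_file(test_path='test.csv',
                                        train_path='train.csv',
                                        distance_measure='embedding',
                                        checkpoint_path='model.pt',
                                        num_neighbors=5)
for entry in neighbors[:3]:
    print(entry)  # each entry is an OrderedDict with a test SMILES, its nearest train SMILES, and distance info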
Example #2
def run_split_data(args: Args):
    # Load raw data
    with open(args.data_path) as f:
        reader = csv.reader(f)
        header = next(reader)
        lines = list(reader)

    # Load SMILES
    smiles = get_smiles(path=args.data_path, smiles_columns=args.smiles_column)

    # Make sure lines and smiles line up
    assert len(lines) == len(smiles)
    assert all(s in line for smile, line in zip(smiles, lines) for s in smile)

    # Create data
    data = []
    for smile, line in tqdm(zip(smiles, lines), total=len(smiles)):
        datapoint = MoleculeDatapoint(smiles=smile)
        datapoint.line = line
        data.append(datapoint)
    data = MoleculeDataset(data)

    train, val, test = split_data(data=data,
                                  split_type=args.split_type,
                                  sizes=args.split_sizes,
                                  seed=args.seed)

    makedirs(args.save_dir)

    for name, dataset in [('train', train), ('val', val), ('test', test)]:
        with open(os.path.join(args.save_dir, f'{name}.csv'), 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for datapoint in dataset:
                writer.writerow(datapoint.line)
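
The Args object consumed above presumably comes from typed-argument-parser (tap), which chemprop uses for its scripts. A hedged sketch of the fields this function reads; the defaults are assumptions, only the attribute names are taken from the code:

from typing import Tuple
from tap import Tap

class Args(Tap):
    data_path: str          # CSV file to split
    save_dir: str           # directory in which to write train/val/test CSVs
    smiles_column: str = None                # name of the SMILES column (assumed default)
    split_type: str = 'random'               # how to split, e.g. 'random' (assumed default)
    split_sizes: Tuple[float, float, float] = (0.8, 0.1, 0.1)  # train/val/test fractions (assumed)
    seed: int = 0           # random seed for the split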
Example #3
 def test_flatten(self):
     """Testing with flattened output"""
     smiles = get_smiles(
         path=self.smiles_path,
         flatten=True,
     )
     self.assertEqual(smiles, ['C', 'CC', 'CC', 'CN', 'O', 'CO'])
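
The expected values here, together with Examples #10 to #12 below, pin down the test fixture: self.smiles_path presumably points at a two-column CSV like the one this sketch writes (file name hypothetical):

with open('smiles.csv', 'w') as f:
    # Flattening reads the rows left to right, top to bottom.
    f.write('column0,column1\n'
            'C,CC\n'
            'CC,CN\n'
            'O,CO\n')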
Example #4
def overlap(args: Args):
    smiles_1 = get_smiles(path=args.data_path_1,
                          smiles_columns=args.smiles_column_1,
                          flatten=True)
    smiles_2 = get_smiles(path=args.data_path_2,
                          smiles_columns=args.smiles_column_2,
                          flatten=True)

    smiles_1, smiles_2 = set(smiles_1), set(smiles_2)
    size_1, size_2 = len(smiles_1), len(smiles_2)
    intersection = smiles_1.intersection(smiles_2)
    size_intersect = len(intersection)
    print(f'Size of dataset 1: {size_1}')
    print(f'Size of dataset 2: {size_2}')
    print(f'Size of intersection: {size_intersect}')
    print(f'Size of intersection as frac of dataset 1: {size_intersect / size_1}')
    print(f'Size of intersection as frac of dataset 2: {size_intersect / size_2}')

    if args.save_intersection_path is not None:
        with open(args.data_path_1) as rf, open(args.save_intersection_path, 'w') as wf:
            reader, writer = csv.reader(rf), csv.writer(wf)
            header = next(reader)
            writer.writerow(header)
            for line in reader:
                if line[0] in intersection:
                    writer.writerow(line)

    if args.save_difference_path is not None:
        with open(args.data_path_1) as rf, open(args.save_difference_path, 'w') as wf:
            reader, writer = csv.reader(rf), csv.writer(wf)
            header = next(reader)
            writer.writerow(header)
            for line in reader:  # fixed: a csv.reader is iterated directly, not called
                if line[0] not in intersection:
                    writer.writerow(line)
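
A toy illustration of the set arithmetic above (values hypothetical):

smiles_1 = {'C', 'CC', 'CCO'}
smiles_2 = {'CC', 'CCO', 'CCN'}
intersection = smiles_1.intersection(smiles_2)  # {'CC', 'CCO'}
print(len(intersection) / len(smiles_1))        # 2/3 of dataset 1 also appears in dataset 2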
Example #5
def interpret(args: InterpretArgs) -> None:
    """
    Runs interpretation of a Chemprop model using the Monte Carlo Tree Search algorithm.

    :param args: A :class:`~chemprop.args.InterpretArgs` object containing arguments for interpretation.
    """

    if args.number_of_molecules != 1:
        raise ValueError(
            "Interpreting is currently only available for single-molecule models."
        )

    global C_PUCT, MIN_ATOMS

    chemprop_model = ChempropModel(args)

    def scoring_function(smiles: List[str]) -> List[float]:
        return chemprop_model(smiles)[:, args.property_id - 1]

    C_PUCT = args.c_puct
    MIN_ATOMS = args.min_atoms

    all_smiles = get_smiles(path=args.data_path,
                            smiles_columns=args.smiles_columns)
    header = get_header(path=args.data_path)

    property_name = header[args.property_id] if len(header) > args.property_id else 'score'
    print(f'smiles,{property_name},rationale,rationale_score')

    for smiles in all_smiles:
        score = scoring_function([smiles])[0]
        if score > args.prop_delta:
            rationales = mcts(smiles=smiles[0],
                              scoring_function=scoring_function,
                              n_rollout=args.rollout,
                              max_atoms=args.max_atoms,
                              prop_delta=args.prop_delta)
        else:
            rationales = []

        if len(rationales) == 0:
            # smiles is a one-element list (single-molecule models only), so print its sole entry
            print(f'{smiles[0]},{score:.3f},,')
        else:
            min_size = min(len(x.atoms) for x in rationales)
            min_rationales = [x for x in rationales if len(x.atoms) == min_size]
            rats = sorted(min_rationales, key=lambda x: x.P, reverse=True)
            print(f'{smiles[0]},{score:.3f},{rats[0].smiles},{rats[0].P:.3f}')
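
A toy illustration of the 1-indexed property selection in scoring_function above (shapes inferred from the slicing; values hypothetical):

import numpy as np

preds = np.array([[0.2, 0.8],
                  [0.6, 0.1]])    # (num_molecules, num_tasks)
property_id = 2                   # 1-indexed, as in InterpretArgs
print(preds[:, property_id - 1])  # -> [0.8 0.1], the scores for property 2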
Example #6
def predict():
    """Renders the predict page and makes predictions if the method is POST."""
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    models = db.get_models(ckpt_id)
    model_paths = [
        os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt')
        for model in models
    ]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')
    train_args = load_args(model_paths[0])

    # Build arguments
    arguments = [
        '--test_path', 'None',
        '--preds_path', os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME']),
        '--checkpoint_paths', *model_paths
    ]

    if gpu is not None:
        if gpu == 'None':
            arguments.append('--no_cuda')
        else:
            arguments += ['--gpu', gpu]

    # Handle additional features
    if train_args.features_path is not None:
        # TODO: make it possible to specify the features generator if trained using features_path
        arguments += [
            '--features_generator', 'rdkit_2d_normalized',
            '--no_features_scaling'
        ]
    elif train_args.features_generator is not None:
        arguments += ['--features_generator', *train_args.features_generator]

        if not train_args.features_scaling:
            arguments.append('--no_features_scaling')

    # Parse arguments
    args = PredictArgs().parse_args(arguments)

    # Run predictions
    preds = make_predictions(args=args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Record whether any prediction failed before the Nones are overwritten below
    had_invalid_smiles = any(pred is None for pred in preds)

    # Replace invalid smiles with message
    invalid_smiles_warning = 'Invalid SMILES String'
    preds = [
        pred if pred is not None else [invalid_smiles_warning] * num_tasks
        for pred in preds
    ]

    return render_predict(
        predicted=True,
        smiles=smiles,
        num_smiles=min(10, len(smiles)),
        show_more=max(0, len(smiles) - 10),
        task_names=task_names,
        num_tasks=num_tasks,
        preds=preds,
        # `None in preds` would always be False here since the Nones were
        # replaced above, hence the flag computed before the replacement
        warnings=["List contains invalid SMILES strings"] if had_invalid_smiles else None,
        errors=["No SMILES strings given"] if len(preds) == 0 else None)
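
A toy illustration of the invalid-SMILES handling above (values hypothetical):

num_tasks = 2
preds = [[0.1, 0.9], None, [0.4, 0.2]]              # None marks a SMILES the model rejected
had_invalid_smiles = any(p is None for p in preds)  # must be checked before the replacement
preds = [p if p is not None else ['Invalid SMILES String'] * num_tasks for p in preds]
print(had_invalid_smiles, preds[1])  # True ['Invalid SMILES String', 'Invalid SMILES String']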
Example #7
def compare_datasets_tsne(args: Args):
    if len(args.smiles_paths) > len(args.colors) or len(args.smiles_paths) > len(args.sizes):
        raise ValueError('Must have at least as many colors and sizes as datasets')

    # Random seed for random subsampling
    np.random.seed(0)

    # Load the smiles datasets
    print('Loading data')
    smiles, slices, labels = [], [], []
    for smiles_path in args.smiles_paths:
        # Get label
        label = os.path.basename(smiles_path).replace('.csv', '')

        # Get SMILES
        new_smiles = get_smiles(path=smiles_path,
                                smiles_columns=args.smiles_column,
                                flatten=True)
        print(f'{label}: {len(new_smiles):,}')

        # Subsample if dataset is too large
        if len(new_smiles) > args.max_per_dataset:
            print(f'Subsampling to {args.max_per_dataset:,} molecules')
            new_smiles = np.random.choice(new_smiles,
                                          size=args.max_per_dataset,
                                          replace=False).tolist()

        slices.append(slice(len(smiles), len(smiles) + len(new_smiles)))
        labels.append(label)
        smiles += new_smiles

    # Compute Morgan fingerprints
    print('Computing Morgan fingerprints')
    morgan_generator = get_features_generator('morgan')
    morgans = [morgan_generator(smile) for smile in tqdm(smiles, total=len(smiles))]

    print('Running t-SNE')
    start = time.time()
    tsne = TSNE(n_components=2, init='pca', random_state=0, metric='jaccard')
    X = tsne.fit_transform(morgans)
    print(f'time = {time.time() - start:.2f} seconds')

    if args.cluster:
        import hdbscan  # pip install hdbscan
        print('Running HDBSCAN')
        start = time.time()
        clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
        colors = clusterer.fit_predict(X)
        print(f'time = {time.time() - start:.2f} seconds')

    print('Plotting t-SNE')
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)

    makedirs(args.save_path, isfile=True)

    plt.clf()
    fontsize = 50 * args.scale
    fig = plt.figure(figsize=(64 * args.scale, 48 * args.scale))
    plt.title('t-SNE using Morgan fingerprint with Jaccard similarity',
              fontsize=2 * fontsize)
    ax = fig.gca()
    handles = []
    legend_kwargs = dict(loc='upper right', fontsize=fontsize)

    if args.cluster:
        plt.scatter(X[:, 0],
                    X[:, 1],
                    s=150 * np.mean(args.sizes),
                    c=colors,
                    cmap='nipy_spectral')
    else:
        for slc, color, label, size in zip(slices, args.colors, labels,
                                           args.sizes):
            if args.plot_molecules:
                # Plots molecules
                handles.append(mpatches.Patch(color=color, label=label))

                for smile, (x, y) in zip(smiles[slc], X[slc]):
                    img = Draw.MolsToGridImage([Chem.MolFromSmiles(smile)],
                                               molsPerRow=1,
                                               subImgSize=(200, 200))
                    imagebox = offsetbox.AnnotationBbox(
                        offsetbox.OffsetImage(img), (x, y),
                        bboxprops=dict(color=color))
                    ax.add_artist(imagebox)
            else:
                # Plots points
                plt.scatter(X[slc, 0],
                            X[slc, 1],
                            s=150 * size,
                            color=color,
                            label=label)

        if args.plot_molecules:
            legend_kwargs['handles'] = handles

    plt.legend(**legend_kwargs)
    plt.xticks([])
    plt.yticks([])

    print('Saving t-SNE')
    plt.savefig(args.save_path)
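
For reference, a hedged sketch of the Morgan fingerprint computation that get_features_generator('morgan') presumably wraps (the radius and bit count are assumed defaults). The binary bit vectors it produces are why the 'jaccard' metric suits the t-SNE above:

import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

def morgan_binary_fingerprint(smiles: str, radius: int = 2, num_bits: int = 2048) -> np.ndarray:
    """Returns a 0/1 Morgan fingerprint vector for one SMILES string."""
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    return np.array(fp)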
Example #8
 def test_noheader_2mol(self):
     """Testing with no header and 2 molecules."""
     smiles = get_smiles(path=self.no_header_path,
                         number_of_molecules=2,
                         header=False)
     self.assertEqual(smiles, [['C', 'CC'], ['CC', 'CN'], ['O', 'CO']])
Example #9
 def test_noheader_1mol(self):
     """Testing with no header"""
     smiles = get_smiles(path=self.no_header_path, header=False)
     self.assertEqual(smiles, [['C'], ['CC'], ['O']])
Example #10
 def test_specified_columns_changed_order(self):
     """Testing with no optional arguments."""
     smiles = get_smiles(path=self.smiles_path,
                         smiles_columns=['column1', 'column0'])
     self.assertEqual(smiles, [['CC', 'C'], ['CN', 'CC'], ['CO', 'O']])
Example #11
 def test_specified_column_inputs(self):
     """Testing with a specified smiles column argument."""
     smiles = get_smiles(path=self.smiles_path, smiles_columns=['column1'])
     self.assertEqual(smiles, [['CC'], ['CN'], ['CO']])
Example #12
 def test_default_inputs(self):
     """Testing with no optional arguments."""
     smiles = get_smiles(path=self.smiles_path)
     self.assertEqual(smiles, [['C', 'CC'], ['CC', 'CN'], ['O', 'CO']])
Example #13
    print(f'Average Dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}')
    print(f'Minimum Dice similarity = {np.min(similarities):.4f}')
    print(f'Maximum Dice similarity = {np.max(similarities):.4f}')
    print()
    print('Percentiles for Dice similarity')
    print(' | '.join([f'{i}% = {np.percentile(similarities, i):.4f}' for i in range(0, 101, 10)]))


if __name__ == '__main__':
    args = Args().parse_args()

    smiles_1 = get_smiles(path=args.data_path_1,
                          smiles_columns=args.smiles_column_1,
                          flatten=True)
    smiles_2 = get_smiles(path=args.data_path_2,
                          smiles_columns=args.smiles_column_2,
                          flatten=True)

    if args.similarity_measure == 'scaffold':
        scaffold_similarity(smiles_1, smiles_2)
    elif args.similarity_measure == 'morgan':
        morgan_similarity(smiles_1, smiles_2, args.radius, args.sample_rate)
    else:
        raise ValueError(
            f'Similarity measure "{args.similarity_measure}" not supported.')
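
The Dice similarity that morgan_similarity presumably averages is Dice(A, B) = 2|A ∩ B| / (|A| + |B|) over fingerprint bits. A minimal RDKit sketch (the radius and molecules are hypothetical):

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

fp1 = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles('CCO'), 2)
fp2 = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles('CCN'), 2)
print(DataStructs.DiceSimilarity(fp1, fp2))  # in [0, 1]; 1 means identical bit vectors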
Example #14
def generate_and_save_features(args: Args):
    """
    Computes and saves features for a dataset of molecules as a 2D array in a .npz file.

    :param args: Arguments.
    """
    # Create directory for save_path
    makedirs(args.save_path, isfile=True)

    # Get data and features function
    smiles = get_smiles(path=args.data_path, smiles_column=args.smiles_column)
    features_generator = get_features_generator(args.features_generator)
    temp_save_dir = args.save_path + '_temp'

    # If restarting, delete partially computed features; otherwise, resume from them
    if args.restart:
        if os.path.exists(args.save_path):
            os.remove(args.save_path)
        if os.path.exists(temp_save_dir):
            shutil.rmtree(temp_save_dir)
    else:
        if os.path.exists(args.save_path):
            raise ValueError(
                f'"{args.save_path}" already exists and args.restart is False.'
            )

        if os.path.exists(temp_save_dir):
            features, temp_num = load_temp(temp_save_dir)

    if not os.path.exists(temp_save_dir):
        makedirs(temp_save_dir)
        features, temp_num = [], 0

    # Build features map function
    smiles = smiles[len(features):]  # restrict to data for which features have not been computed yet

    if args.sequential:
        features_map = map(features_generator, smiles)
    else:
        features_map = Pool().imap(features_generator, smiles)

    # Get features
    temp_features = []
    for i, feats in tqdm(enumerate(features_map), total=len(smiles)):
        temp_features.append(feats)

        # Save temporary features every save_frequency
        if (i > 0 and (i + 1) % args.save_frequency == 0) or i == len(smiles) - 1:
            save_features(os.path.join(temp_save_dir, f'{temp_num}.npz'),
                          temp_features)
            features.extend(temp_features)
            temp_features = []
            temp_num += 1

    try:
        # Save all features
        save_features(args.save_path, features)

        # Remove temporary features
        shutil.rmtree(temp_save_dir)
    except OverflowError:
        print('Features array is too large to save as a single file. Instead keeping features as a directory of files.')
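
A minimal sketch of the load_temp helper referenced above, assuming each numbered chunk was written by save_features as a compressed .npz with a 'features' array (the key name is an assumption):

import os
from typing import List, Tuple

import numpy as np

def load_temp(temp_dir: str) -> Tuple[List, int]:
    """Loads saved feature chunks 0.npz, 1.npz, ... in order until one is missing."""
    features, temp_num = [], 0
    path = os.path.join(temp_dir, f'{temp_num}.npz')
    while os.path.exists(path):
        features.extend(np.load(path)['features'].tolist())
        temp_num += 1
        path = os.path.join(temp_dir, f'{temp_num}.npz')
    return features, temp_num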