Exemple #1
0
def GetMHFP6(mol, nBits=2048, radius=3):
    """
    Compute a folded MHFP fingerprint for ``mol`` as a boolean array.

    An ``MHFPEncoder`` with ``nBits`` permutations encodes the molecule
    (rings and kekulization enabled, minimum radius 1, maximum radius
    ``radius``); the resulting hash values are folded to length ``nBits``
    and cast to ``bool``.

    MHFP6 corresponds to the default ``radius=3``.
    """
    mhfp = MHFPEncoder(n_permutations=nBits)
    shingling_options = dict(radius=radius,
                             rings=True,
                             kekulize=True,
                             min_radius=1)
    minhash = mhfp.encode_mol(mol, **shingling_options)
    return mhfp.fold(minhash, nBits).astype(bool)
Exemple #2
0
    def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
        """
        Set up a MAP4 calculator.

        ``radius`` and ``is_counted`` are stored unchanged. ``is_folded``
        selects the hashing backend: ``MHFPEncoder`` when true,
        ``tm.Minhash`` otherwise; either is constructed with ``dimensions``.
        """
        self.radius, self.is_counted, self.is_folded = (radius, is_counted,
                                                        is_folded)

        # Only the selected backend name is looked up and instantiated.
        make_encoder = MHFPEncoder if self.is_folded else tm.Minhash
        self.encoder = make_encoder(dimensions)
Exemple #3
0
def similarity_search(chos_fp, db, mol, number_of_hits, lf1, lf2, lf3, findID1,
                      findID2, findID3):
    """Return the nearest neighbours of a query molecule in one database.

    Arguments:
        chos_fp {string} -- fingerprint choice; 'MAP4' uses calc_map, any
            other value uses calc_mhfp with a 512-permutation MHFPEncoder
        db {string} -- 'ChEMBL' selects lf1/findID1, 'SwissProt' selects
            lf2/findID2, any other value selects lf3/findID3
        mol -- query molecule, passed unchanged to calc_map / calc_mhfp
        number_of_hits {integer} -- number of required hits (coerced with
            int())
        lf1, lf2, lf3 -- per-database indexes exposing query_linear_scan
        findID1, findID2, findID3 -- per-database lookups indexed by the
            second element of each hit; every entry must be indexable with
            0, 1 and 2, and element 1 must be a ';'-separated string

    Returns:
        list -- one [entry[1].split(';'), entry[0], round(hit[0], 3),
            entry[2]] row per hit, in the order returned by the index
    """
    if db == 'ChEMBL':
        lf, findID = lf1, findID1
    elif db == 'SwissProt':
        lf, findID = lf2, findID2
    else:
        lf, findID = lf3, findID3

    if chos_fp == 'MAP4':
        fp = calc_map(mol)
    else:
        # Build the encoder only on the path that actually needs it.
        fp = calc_mhfp(MHFPEncoder(512), mol)

    results = []
    for hit in lf.query_linear_scan(fp, int(number_of_hits)):
        # hit[0] is the score reported by the index (presumably a distance;
        # confirm against tmap), hit[1] the key into findID.
        entry = findID[hit[1]]
        results.append(
            [entry[1].split(';'), entry[0],
             round(hit[0], 3), entry[2]])

    return results
Exemple #4
0
def convert(subset):
    """Write MHFP6, MHECFP4 and ECFP4 fingerprints for one ChEMBL subset.

    Reads the first space-separated column of
    ``/cluster/chembl/chembl.<subset>.smi`` as SMILES and writes three
    sibling files, one comma-separated fingerprint per line:

    * ``.mhfp6``   -- ``MHFPEncoder.encode_mol`` with its default settings
    * ``.mhecfp4`` -- ``MHFPEncoder.from_sparse_array`` over the non-zero
      element ids of the radius-2 Morgan fingerprint
    * ``.ecfp4``   -- the 2048-bit radius-2 Morgan bit vector

    SMILES that RDKit cannot parse are skipped in all three files, so line
    ``k`` of every output refers to the same molecule.

    :param subset: subset identifier inserted into the file names via
        ``str()``.
    """
    prefix = '/cluster/chembl/chembl.' + str(subset)
    actives = pd.read_csv(prefix + '.smi', sep=' ', usecols=[0], header=None)

    mh = MHFPEncoder()

    # Single pass: each SMILES is parsed once and fed to all three writers
    # instead of re-reading and re-parsing the column per output file.
    with open(prefix + '.mhfp6', 'w') as f_mhfp6, \
            open(prefix + '.mhecfp4', 'w') as f_mhecfp4, \
            open(prefix + '.ecfp4', 'w') as f_ecfp4:
        for smiles in actives[0]:
            mol = AllChem.MolFromSmiles(smiles)
            if not mol:
                continue

            mhfp6 = mh.encode_mol(mol)
            f_mhfp6.write(','.join(map(str, mhfp6)) + '\n')

            # Iterating the non-zero element dict yields its keys (the
            # feature ids), which is what gets min-hashed.
            morgan_ids = list(
                AllChem.GetMorganFingerprint(mol, 2).GetNonzeroElements())
            mhecfp4 = mh.from_sparse_array(morgan_ids)
            f_mhecfp4.write(','.join(map(str, mhecfp4)) + '\n')

            ecfp4 = AllChem.GetMorganFingerprintAsBitVect(mol,
                                                          2,
                                                          nBits=2048)
            f_ecfp4.write(','.join(map(str, ecfp4)) + '\n')
Exemple #5
0
def LSH_Convert(mols, outpath, num_workers):
    """Fingerprint ``mols``, index them in an LSH forest and save both.

    Every molecule is encoded with a 1024-permutation ``MHFPEncoder`` via
    ``process_map`` (chunks of 100, ``num_workers`` workers). The
    fingerprints are added to a ``tm.LSHForest(1024, 64)``, which is then
    indexed. The raw fingerprints are pickled to ``<outpath>/fps.pickle``
    and the forest is stored at ``<outpath>/lf.dat``.

    Returns the indexed forest.
    """
    n_permutations = 1024
    # MinHash fingerprint (MHFP) encoder for the molecules
    encoder = MHFPEncoder(n_permutations)
    # Locality Sensitive Hashing forest sized to match the encoder
    forest = tm.LSHForest(n_permutations, 64)

    print("Number of mols to be hashed:", len(mols))
    fps = process_map(encoder.encode_mol,
                      mols,
                      chunksize=100,
                      max_workers=num_workers)

    forest.batch_add(list(map(tm.VectorUint, fps)))
    forest.index()

    # Persist the fingerprints and the forest side by side
    fps_path = os.path.join(outpath, "fps.pickle")
    with open(fps_path, "wb") as handle:
        pickle.dump(fps, handle)
    forest.store(os.path.join(outpath, "lf.dat"))
    print('LSH data files saved!')
    return forest
Exemple #6
0
    def __init__(self,
                 dimensions=1024,
                 radius=2,
                 is_counted=False,
                 is_folded=False,
                 return_strings=False):
        """
        Parameters
        ----------
        dimensions : int
            (default = 1024)
            Number of entries in the output map4 fingerprint. Coerced with
            ``int()``; the coerced value is also what the encoder receives.

        radius : int
            (default = 2)
            Number of bonds away from atom centre to consider.

        is_counted : bool
            (default = False)
            Stored as ``self.is_counted``; not otherwise used by the
            constructor.

        is_folded : bool
            (default = False)
            Selects the hashing backend stored in ``self.encoder``:
            ``MHFPEncoder`` when True, ``tm.Minhash`` when False.

        return_strings : bool
            (default = False)
            If True then returns substructure strings rather than hashed fingerprint.
        """
        self.dimensions = int(dimensions)
        self.radius = int(radius)
        self.is_counted = bool(is_counted)
        self.is_folded = bool(is_folded)
        self.return_strings = bool(return_strings)

        # Build the encoder from the coerced value so that self.dimensions
        # and the encoder size can never disagree (e.g. for "1024" or
        # 1024.0 inputs).
        if self.is_folded:
            self.encoder = MHFPEncoder(self.dimensions)
        else:
            self.encoder = tm.Minhash(self.dimensions)
Exemple #7
0
def CalculateMinHashFingerprint(mol: Chem.Mol,
                                radius: int = 3,
                                rtype: str = 'bitstring',
                                bits: int = 2048) -> Any:
    """Calculate the MinHash Fingerprint (MHFP) of molecule.

    doi: 10.1186/s13321-018-0321-8.
    :param mol: molecule to fingerprint.
    :param radius: maximum radius of atom-centered substructures.
    :param rtype: Type of output, may either be:
                  bitstring (default), returns a binary string
                  numpy, return the underlying numpy array
                  dict, for a dict mapping each hash value to 1
                  Any unrecognised value is treated as bitstring.
    :param bits: Number of folded bits (ignored if rtype is 'numpy' or 'dict')
    :return: exactly one of a str, a dict or the hash-value array, depending
             on ``rtype`` (never a tuple).
    """
    mhfp = MHFPEncoder()
    # Positional flags after radius are presumably rings=True, kekulize=True,
    # min_radius=1 -- confirm against the mhfp shingling_from_mol signature.
    shingles = mhfp.shingling_from_mol(mol, radius, True, True, 1)
    hash_values = mhfp.hash(shingles)

    if rtype == 'numpy':
        return hash_values
    if rtype == 'dict':
        return dict.fromkeys(hash_values.tolist(), 1)

    folded = mhfp.fold(hash_values, bits)
    return ''.join(map(str, folded))
import pytest
import numpy as np
from scipy.spatial.distance import jaccard
from rdkit.Chem import AllChem
from mhfp.encoder import MHFPEncoder
from mhfp.lsh_forest import LSHForestHelper

# Keeping tests barebone and simple

# Module-level fixtures shared by the tests below: an encoder, an LSH forest
# helper and the MHFP fingerprints of every parsable DrugBank SMILES.
mhfp_encoder = MHFPEncoder()
lfh = LSHForestHelper()

drugbank = []

with open('test/drugbank.smi') as f:
    for line in f:
        # The SMILES is the first whitespace-separated token on the line.
        smiles = line.strip().split()[0]
        mol = AllChem.MolFromSmiles(smiles)
        if not mol:
            continue
        drugbank.append(mhfp_encoder.encode_mol(mol))

# Register each fingerprint under its position in ``drugbank``.
for i, fp in enumerate(drugbank):
    lfh.add(i, fp)

lfh.index()


def test_setup():
    """The fixture file must yield exactly 226 encodable molecules."""
    expected_count = 226
    assert len(drugbank) == expected_count


def test_add():
Exemple #9
0
        type=int,
        help="Number of workers (CPU cores) to use for multiprocessing,\
                        default to the number of available CPU cores minus one",
        default=os.cpu_count() - 1)
    # -d/--dim: integer fingerprint dimension; used below for both the MHFP
    # encoder and the LSH forest.
    parser.add_argument("-d",
                        "--dim",
                        type=int,
                        help="Fingerprint dimension, default to 1024",
                        default=1024)

    # Parse the command line, resolve the output location to an absolute
    # path and load the molecules from the input file.
    a = parser.parse_args()
    outpath = os.path.abspath(a.output)
    mols = file_to_mols(a.filename)

    # Define a named properties tuple.
    # To pickle a named tuple correctly:
    ## 1) The named tuple object has to be declared under __main__
    ## 2) The variable the named tuple is assigned to has to match
    ##    the tuple name given in the quotation marks!
    Props = namedtuple('Props', ['SMILES', 'MolWt', 'LogP', 'QED', 'SAS'])

    # MinHash fingerprint (MHFP) encoder, a specialized molecular fingerprint scheme
    enc = MHFPEncoder(a.dim)
    # Locality Sensitive Hashing Forest, sized to match the encoder
    lf = tm.LSHForest(a.dim, 64)

    # NOTE(review): Props, enc and lf are not passed to MolsToLSHForest;
    # presumably it reads them as module-level globals -- confirm.
    MolsToLSHForest(mol_list=mols,
                    save_path=outpath,
                    worker=a.worker,
                    batch_size=a.batch)
Exemple #10
0
def main():
    """Build and plot a TMAP of DrugBank coloured by molecular properties.

    Reads ``drugbank.csv`` from the working directory and drops rows without
    a ``SMILES`` value. Each remaining row is kept when RDKit can parse its
    structure, the molecule has more than 5 atoms and the SMILES contains
    fewer than two ``.`` separators. Kept molecules are encoded with MHFP
    (``min_radius=0``), indexed in a ``tm.LSHForest(2048, 128)``, laid out
    with tmap and rendered with Faerun as the ``drugbank`` plot.

    Row fields are read by position: 0 (DrugBank id, used in the link), 1
    (shown as the name), 3 (';'-separated groups, first one used) and 6
    (the SMILES string -- presumably the ``SMILES`` column; confirm against
    the CSV layout).

    Eleven colour series are produced: five categorical ones (group,
    Lipinski pass, ether / sulfonamide / tetrazole substructure match) and
    six rank-normalised numeric ones (TPSA, logP, molecular weight,
    H acceptors, H donors, ring count).
    """

    def _rank_normalise(values):
        # Rank-transform the series and scale the ranks by the series length.
        return ss.rankdata(np.array(values) / max(values)) / len(values)

    df = pd.read_csv("drugbank.csv").dropna(subset=["SMILES"]).reset_index(
        drop=True)
    enc = MHFPEncoder()
    lf = tm.LSHForest(2048, 128)

    fps = []
    labels = []
    groups = []
    tpsa = []
    logp = []
    mw = []
    h_acceptors = []
    h_donors = []
    ring_count = []
    is_lipinski = []
    has_coc = []
    has_sa = []
    has_tz = []

    substruct_coc = AllChem.MolFromSmiles("COC")
    substruct_sa = AllChem.MolFromSmiles("NS(=O)=O")
    substruct_tz = AllChem.MolFromSmiles("N1N=NN=C1")

    total = len(df)
    for i, row in df.iterrows():
        if i % 1000 == 0 and i > 0:
            print(f"{round(100 * (i / total))}% done ...")

        # Explicit positional access: integer keys on a label-indexed Series
        # rely on a deprecated pandas fallback.
        smiles = row.iloc[6]
        mol = AllChem.MolFromSmiles(smiles)

        if not (mol and mol.GetNumAtoms() > 5 and smiles.count(".") < 2):
            continue

        drug_id = row.iloc[0]
        name = row.iloc[1]

        fps.append(tm.VectorUint(enc.encode_mol(mol, min_radius=0)))
        labels.append(
            f'{smiles}__<a href="https://www.drugbank.ca/drugs/{drug_id}" target="_blank">{drug_id}</a>__{name}'
            .replace("'", ""))
        groups.append(row.iloc[3].split(";")[0])
        tpsa.append(Descriptors.TPSA(mol))
        logp.append(Descriptors.MolLogP(mol))
        mw.append(Descriptors.MolWt(mol))
        h_acceptors.append(Descriptors.NumHAcceptors(mol))
        h_donors.append(Descriptors.NumHDonors(mol))
        ring_count.append(Descriptors.RingCount(mol))
        is_lipinski.append(lipinski_pass(mol))
        has_coc.append(mol.HasSubstructMatch(substruct_coc))
        has_sa.append(mol.HasSubstructMatch(substruct_sa))
        has_tz.append(mol.HasSubstructMatch(substruct_tz))

    # Create the labels and the integer encoded array for the groups,
    # as they're categorical
    labels_groups, groups = Faerun.create_categories(groups)

    # Numeric series, in the order they appear in the plot.
    numeric_series = [tpsa, logp, mw, h_acceptors, h_donors, ring_count]
    ranked_series = [_rank_normalise(values) for values in numeric_series]

    lf.batch_add(fps)
    lf.index()
    cfg = tm.LayoutConfiguration()
    cfg.k = 100
    # cfg.sl_extra_scaling_steps = 1
    cfg.sl_repeats = 2
    cfg.mmm_repeats = 2
    cfg.node_size = 2
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

    # Define a colormap highlighting approved vs non-approved
    custom_cmap = ListedColormap(
        [
            "#2ecc71", "#9b59b6", "#ecf0f1", "#e74c3c", "#e67e22", "#f1c40f",
            "#95a5a6"
        ],
        name="custom",
    )

    bin_cmap = ListedColormap(["#e74c3c", "#2ecc71"], name="bin_cmap")

    f = Faerun(
        clear_color="#222222",
        coords=False,
        view="front",
        impress=
        'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />and <a href="https://github.com/reymond-group/faerun-python" target="_blank">faerun</a><br /><a href="https://gist.github.com/daenuprobst/5cddd0159c0cf4758fb16b4b4acbef89">source</a>',
    )

    f.add_scatter(
        "Drugbank",
        {
            "x": x,
            "y": y,
            "c": [
                groups,
                is_lipinski,
                has_coc,
                has_sa,
                has_tz,
                *ranked_series,
            ],
            "labels": labels,
        },
        shader="smoothCircle",
        colormap=[
            custom_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
        ],
        point_scale=2.5,
        # One flag per colour series: 5 categorical followed by 6 numeric.
        categorical=[
            True, True, True, True, True, False, False, False, False, False,
            False
        ],
        has_legend=True,
        legend_labels=[
            labels_groups,
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
        ],
        selected_labels=["SMILES", "Drugbank ID", "Name"],
        series_title=[
            "Group",
            "Lipinski",
            "Ethers",
            "Sulfonamides",
            "Tetrazoles",
            "TPSA",
            "logP",
            "Mol Weight",
            "H Acceptors",
            "H Donors",
            "Ring Count",
        ],
        # Categorical series carry no min/max legend label; numeric ones show
        # the rounded extremes of the raw (un-ranked) values.
        max_legend_label=[None] * 5 +
        [str(round(max(values))) for values in numeric_series],
        min_legend_label=[None] * 5 +
        [str(round(min(values))) for values in numeric_series],
        title_index=2,
        legend_title="",
    )

    f.add_tree("drugbanktree", {"from": s, "to": t}, point_helper="Drugbank")

    f.plot("drugbank", template="smiles")
Exemple #11
0
from rdkit.Chem import AllChem

# config = mstmap.LayoutConfiguration()
# # config.merger = mstmap.Merger.Solar
# # print(config)

# # TODO: Fails for disconnected components!
# u = mstmap.VectorUint([0, 1, 2, 3, 4])
# v = mstmap.VectorUint([1, 2, 0, 4, 3])
# w = mstmap.VectorFloat([1.0, 1.0, 1.0, 2.0, 6.0])
# x, y = mstmap.layout(5, u, v, config, w)

# print(x)
# print(y)

enc = MHFPEncoder(512)

fps = []

if not os.path.isfile('fps.dat'):
    with open('drugbank.smi', 'r') as f:
        i = 0
        for line in f:
            smiles = line.split()[0].strip()
            mol = AllChem.MolFromSmiles(smiles)
            if mol:
                fps.append(enc.encode_mol(mol))
            i += 1
            if i > 2000: break
    pickle.dump(fps, open('fps.dat', 'wb'))
else: