Beispiel #1
0
def fp_tt_std_mp(mol, mol_can, i, nBits, chiral):
    """Fingerprint a molecule and its companion ``mol_can`` molecule.

    Both fingerprints are hashed topological-torsion fingerprints built with
    the same ``nBits`` and ``includeChirality=chiral`` settings.

    Returns:
        A tuple ``(i, fp_mol, fp_can)``; ``i`` is passed through untouched.
    """
    def hashed_tt(molecule):
        return Torsions.GetHashedTopologicalTorsionFingerprint(
            molecule, nBits=nBits, includeChirality=chiral)

    fp_mol = hashed_tt(mol)
    # When both arguments are the very same object the fingerprint is reused
    # instead of being computed a second time.
    fp_can = fp_mol if mol is mol_can else hashed_tt(mol_can)
    return (i, fp_mol, fp_can)
 def score_model(self, model_configuration: dict, fragments_file: str,
                 descriptors_file: str, output_file: str):
     """Score test molecules by their best Tanimoto match to the actives.

     The active SMILES come from ``model_configuration["data"]["active"]``;
     the second argument handed to the torsion fingerprint call is read from
     ``model_configuration["configuration"]["fragments"][0]["size"]``.
     ``fragments_file`` is read as one JSON object per line (keys ``smiles``
     and ``name``). ``output_file`` receives one JSON object per line with
     the keys ``name`` and ``score`` and no trailing newline.
     ``descriptors_file`` is not used by this method.
     """
     inputoutput_utils.create_parent_directory(output_file)
     training_data = model_configuration["data"]
     target_size = int(
         model_configuration["configuration"]["fragments"][0]["size"])

     # Fingerprints of the known actives, computed once up front.
     reference_fingerprints = []
     for raw_smiles in training_data["active"]:
         active = Chem.MolFromSmiles(raw_smiles.strip("\""))
         reference_fingerprints.append(
             Torsions.GetTopologicalTorsionFingerprintAsIntVect(
                 active, target_size))

     # Written before every record; empty for the first one so that the
     # output never starts or ends with a newline.
     separator = ""
     with open(output_file, "w", encoding="utf-8") as output_stream, \
             open(fragments_file, "r", encoding="utf-8") as input_stream:
         for raw_record in input_stream:
             record = json.loads(raw_record)
             candidate = Chem.MolFromSmiles(record["smiles"].strip("\""))
             candidate_fingerprint = \
                 Torsions.GetTopologicalTorsionFingerprintAsIntVect(
                     candidate, target_size)
             best_similarity = max(
                 DataStructs.TanimotoSimilarity(candidate_fingerprint,
                                                reference)
                 for reference in reference_fingerprints)
             result = {"name": record["name"], "score": best_similarity}
             output_stream.write(separator)
             separator = "\n"
             json.dump(result, output_stream)
Beispiel #3
0
def fp_tt_std(mols, nBits, chiral):
    """Attach hashed topological-torsion fingerprints to every entry of ``mols``.

    For each key of ``mols`` the entry's ``"mol"`` and ``"mol_can"`` molecules
    are fingerprinted and the results stored in place under ``"fp"`` and
    ``"fp_can"``. Nothing is returned.
    """
    def hashed_tt(molecule):
        return Torsions.GetHashedTopologicalTorsionFingerprint(
            molecule, nBits=nBits, includeChirality=chiral)

    for key in mols:
        entry = mols[key]
        fingerprint = hashed_tt(entry["mol"])
        entry["fp"] = fingerprint
        # Identical objects share one fingerprint; otherwise compute a second.
        if entry["mol"] is entry["mol_can"]:
            entry["fp_can"] = fingerprint
        else:
            entry["fp_can"] = hashed_tt(entry["mol_can"])
Beispiel #4
0
 def testGetTopologicalTorsionFingerprintAsIds(self):
   """The three torsion-fingerprint entry points agree for 'C1CCCCN1'."""
   expected_counts = {4437590049: 2, 8732557345: 2, 4445978657: 2}
   expected_ids = [
     4437590049, 4437590049, 4445978657, 4445978657, 8732557345, 8732557345
   ]
   ring = Chem.MolFromSmiles('C1CCCCN1')

   sparse = Torsions.GetTopologicalTorsionFingerprint(ring)
   self.assertEqual(sparse.GetNonzeroElements(), expected_counts)

   # The "AsIds" variant yields one id per torsion, so ids repeat.
   ids = Torsions.GetTopologicalTorsionFingerprintAsIds(ring)
   self.assertEqual(sorted(ids), expected_ids)

   int_vect = Torsions.GetTopologicalTorsionFingerprintAsIntVect(ring)
   self.assertEqual(int_vect.GetNonzeroElements(), expected_counts)
    def testTorsionsRegression(self):
        """Compare freshly computed torsion fingerprints with pickled references.

        The references are unpickled from ``mols1000.tts.pkl.gz`` under
        ``self.testDataPath``. Every molecule in ``self.mols`` must reproduce
        its reference and differ from the previous reference (for the first
        molecule, index ``-1`` wraps around to the last reference). When a
        fingerprint does not match, the differing elements are printed
        before the assertion fails.
        """
        # NOTE(review): unpickling executes arbitrary code; this is only
        # acceptable because the file is expected to be trusted test data.
        # The context manager closes the gzip handle, which used to leak.
        with gzip.open(
                os.path.join(self.testDataPath, 'mols1000.tts.pkl.gz'),
                'rb') as inF:
            torsions = cPickle.load(inF, encoding='bytes')
        for i, m in enumerate(self.mols):
            tt = Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
            if tt != torsions[i]:
                print(Chem.MolToSmiles(m))
                pd = tt.GetNonzeroElements()
                rd = torsions[i].GetNonzeroElements()
                # dict.iteritems()/has_key() are Python 2 only; since
                # load(..., encoding=...) already requires Python 3, this
                # branch used to die with AttributeError instead of printing.
                for k, v in pd.items():
                    if k in rd:
                        if rd[k] != v:
                            print('>>>1', k, v, rd[k])
                    else:
                        print('>>>2', k, v)
                for k, v in rd.items():
                    if k in pd:
                        if pd[k] != v:
                            print('>>>3', k, v, pd[k])
                    else:
                        print('>>>4', k, v)

            self.assertTrue(tt == torsions[i])
            self.assertTrue(tt != torsions[i - 1])
Beispiel #6
0
    def computeFP(self, typeFP):
        """Build the requested fingerprint(s) for ``self.smiclean``.

        ``typeFP`` is one of "Mol", "MACCS", "pairs", "Torsion", "Morgan", or
        "All" for every one of them. The molecule parsed from
        ``self.smiclean`` is stored in ``self.mol`` and the fingerprints in
        ``self.FP`` (a dict keyed by fingerprint name).

        Returns 1, after appending a message to ``self.log``, when the
        instance has no ``smiclean`` attribute; returns 0 otherwise.
        """
        from rdkit.Chem.Fingerprints import FingerprintMols
        from rdkit.Chem import MACCSkeys
        from rdkit.Chem.AtomPairs import Pairs, Torsions
        from rdkit.Chem import AllChem

        if "smiclean" not in self.__dict__:
            self.log = self.log + "No smiles prepared\n"
            return 1

        self.mol = Chem.MolFromSmiles(self.smiclean)

        # (name, builder) pairs in the order the fingerprints are computed.
        builders = (
            ("Mol", lambda: FingerprintMols.FingerprintMol(self.mol)),
            ("MACCS", lambda: MACCSkeys.GenMACCSKeys(self.mol)),
            ("pairs", lambda: Pairs.GetAtomPairFingerprint(self.mol)),
            ("Torsion",
             lambda: Torsions.GetTopologicalTorsionFingerprint(self.mol)),
            ("Morgan", lambda: AllChem.GetMorganFingerprint(self.mol, 2)),
        )

        computed = {}
        for label, build in builders:
            if typeFP == label or typeFP == "All":
                computed[label] = build()

        self.FP = computed
        return 0
Beispiel #7
0
def _torsionsFingerprintsClustering(rdkit_mols):
    """
        Build a pairwise matrix from hashed topological-torsion fingerprints.

        Every molecule is fingerprinted, then ``DiceDistances`` is evaluated
        once per fingerprint against the full fingerprint list using a
        parallel executor with ``n_jobs=-1``.

        Parameters
        ----------
        rdkit_mols: list
            The list of rdkit.Chem.rdchem.Mol objects

        Returns
        -------
        dicematrix: np.array
            The numpy array built from the per-fingerprint results
        """
    from rdkit.Chem.AtomPairs import Torsions  # Topological Torsions

    fingerprints = [
        Torsions.GetHashedTopologicalTorsionFingerprint(mol)
        for mol in tqdm(rdkit_mols)
    ]

    executor = ParallelExecutor(n_jobs=-1)  # _config['ncpus'])
    run = executor(total=len(fingerprints),
                   desc='TorsionsFingerprints Distance')
    rows = run(
        delayed(DiceDistances)(reference, fingerprints)
        for reference in fingerprints)

    return np.array(rows)
Beispiel #8
0
 def testTorsionsRegression(self):
   """Check torsion fingerprints against the pickled reference set.

   Each molecule in ``self.mols`` must reproduce its reference fingerprint
   and differ from the previous reference (index -1 wraps to the last one
   for the first molecule). Mismatches are passed to ``debugFingerprint``
   before the assertion fails.
   """
   # NOTE(review): unpickling executes arbitrary code; acceptable only for
   # trusted test data. The context manager closes the gzip handle, which
   # was previously leaked.
   with gzip.open(os.path.join(self.testDataPath, 'mols1000.tts.pkl.gz'), 'rb') as inF:
     torsions = cPickle.load(inF, encoding='bytes')
   for i, m in enumerate(self.mols):
     tt = Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
     if tt != torsions[i]:  # pragma: nocover
       debugFingerprint(m, tt, torsions[i])
     self.assertEqual(tt, torsions[i])
     self.assertNotEqual(tt, torsions[i - 1])
    def torsions_fp(self):
        """Build a (feature, label) array from the CSV at ``self.csv_path``.

        For every entry of the ``Smiles`` column the topological-torsion
        count vector is computed and reduced to its total count
        (``GetTotalVal()``), giving one feature per molecule. Rows whose
        SMILES cannot be processed are dropped. The ``Label`` column is
        label-encoded and appended as the last column, and rows with a
        duplicate feature value are removed.

        Returns:
            A ``float32`` numpy array with one row per kept molecule:
            ``[total_torsion_count, encoded_label]``.
        """
        df = pd.read_csv(self.csv_path)
        smiles_list = df['Smiles'].tolist()

        fingerprints = []
        not_found = []
        for i, smiles in enumerate(tqdm(smiles_list)):
            try:
                mol = Chem.MolFromSmiles(smiles)
                fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
                # The sparse vector is far too large to expand, so only its
                # total count is kept as the feature.
                total = fp.GetTotalVal()
            except Exception:
                # Best effort: an unparsable SMILES is recorded and skipped.
                # A bare ``except:`` here would also have swallowed
                # KeyboardInterrupt and SystemExit.
                fingerprints.append(np.nan)
                not_found.append(i)
            else:
                fingerprints.append(total)

        df.drop(not_found, axis=0, inplace=True)

        print('Number of FPs not found: {}'.format(len(not_found)))

        df.reset_index(drop=True, inplace=True)
        labelencoder = LabelEncoder()
        Y = labelencoder.fit_transform(df['Label'].values)
        Y = Y.reshape(Y.shape[0], 1)

        print('Output shape: {}'.format(Y.shape))

        fp_array = np.asarray(fingerprints, dtype=object)
        # Remove the NaN placeholders so X lines up with the filtered labels.
        X = np.delete(fp_array, not_found, axis=0)
        X = np.vstack(X).astype(np.float32)
        print('Input shape: {}'.format(X.shape))

        final_array = np.concatenate((X, Y), axis=1)

        # Removing rows, from final_array, where duplicate FPs are present
        # (uniqueness is judged on every column except the trailing label).
        final_array_slice = final_array[:, 0:(final_array.shape[1] - 1)]
        _, unq_row_indices = np.unique(final_array_slice,
                                       return_index=True,
                                       axis=0)
        final_array_unique = final_array[unq_row_indices]

        print(
            'Number of Duplicate FPs: {}'.format(final_array.shape[0] -
                                                 final_array_unique.shape[0]))

        print('Final Numpy array shape: {}'.format(final_array_unique.shape))
        print('Type of final array: {}'.format(type(final_array_unique)))
        final_numpy_array = np.asarray(final_array_unique, dtype=np.float32)

        return final_numpy_array
Beispiel #10
0
def GetMolFingerprint(mol, maxPathLength):
    """Accumulate fluorine-rooted hashed torsion fingerprints over path lengths.

    If ``mol`` contains a CF3 group, the reaction ``[*:1]-C(F)(F)F>>[*:1]-F``
    is applied first; its first product is sanitized, given a copy of the
    properties of ``mol`` and used in place of ``mol``. The atoms of the
    first match of the SMARTS ``F`` are then passed as ``fromAtoms``.

    The fingerprint for ``targetSize=maxPathLength`` is the starting point;
    the counts of the fingerprints for ``targetSize`` 2 up to
    ``maxPathLength - 1`` are added to it element by element.

    Returns:
        The accumulated fingerprint object (``nBits=9192``).
    """
    FQuery = Chem.MolFromSmarts('F')
    CF3Query = Chem.MolFromSmarts('[$(C(F)(F)F)]')
    CF3Rxn = AllChem.ReactionFromSmarts('[*:1]-C(F)(F)F>>[*:1]-F')
    if mol.HasSubstructMatch(CF3Query):
        # Only the first product of the first product set is used.
        p = CF3Rxn.RunReactants((mol,))[0][0]
        Chem.SanitizeMol(p)
        for nm in mol.GetPropNames():
            p.SetProp(nm, mol.GetProp(nm))
        mol = p
    match = mol.GetSubstructMatch(FQuery)
    # NOTE(review): 9192 is kept from the original; it may have been meant
    # to be 8192 - confirm before changing, as it alters every bit index.
    fp = Torsions.GetHashedTopologicalTorsionFingerprint(
        mol, nBits=9192, targetSize=maxPathLength, fromAtoms=match)
    for i in range(2, maxPathLength):
        nfp = Torsions.GetHashedTopologicalTorsionFingerprint(
            mol, nBits=9192, targetSize=i, fromAtoms=match)
        # dict.iteritems() exists only on Python 2; items() works on both.
        for bit, v in nfp.GetNonzeroElements().items():
            fp[bit] = fp[bit] + v
    return fp
Beispiel #11
0
def GetTorsionFPs(mol, nBits = 2048, binary = True):
    '''
    Hashed topological-torsion fingerprint of ``mol`` as a numpy array.

    :param mol: molecule to fingerprint
    :param nBits: ``nBits`` passed to the hashed fingerprint
    :param binary: if True the target array has boolean dtype, otherwise
                   ``int8``
    :return: the array filled by ``DataStructs.ConvertToNumpyArray``
    '''
    fp = Torsions.GetHashedTopologicalTorsionFingerprint(mol, nBits = nBits)
    # ``np.bool`` was removed in NumPy 1.24; the builtin ``bool`` selects the
    # same dtype on every NumPy version.
    # NOTE(review): int8 cannot represent counts above 127 - confirm that is
    # acceptable for the non-binary case.
    dtype = bool if binary else np.int8
    arr = np.zeros((0,), dtype=dtype)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
def getCountInfo(m, fpType):
    """Return the non-zero elements of a count fingerprint of ``m``.

    ``fpType`` selects the fingerprint: 'AtomPair'/'atom',
    'morgan'/'circular' (radius 2) or 'Topological'/'topo'. Apart from the
    exact spellings 'AtomPair' and 'Topological', matching is
    case-insensitive. For the topological-torsion case the keys are
    converted with ``int()``. Any other ``fpType`` yields ``None``.
    """
    kind = fpType.lower()

    if fpType == 'AtomPair' or kind == 'atom':
        return Pairs.GetAtomPairFingerprint(m).GetNonzeroElements()

    if kind == 'morgan' or kind == 'circular':
        return AllChem.GetMorganFingerprint(m, 2).GetNonzeroElements()

    if fpType == 'Topological' or kind == 'topo':
        raw_counts = Torsions.GetTopologicalTorsionFingerprint(
            m).GetNonzeroElements()
        return {int(key): raw_counts[key] for key in raw_counts}

    return None
Beispiel #13
0
def Fingerprints(mols, fingerprint):
    """Compute the fingerprint named ``fingerprint`` for each molecule.

    The name is looked up in the module-level registries ``indigofps``,
    ``rdkitfps``, ``rdkitnonbitfps`` and ``rdkitestatefps``, in that order.
    A name that is registered in a group but has no builder in that group
    falls through to the following groups.

    Returns:
        A list with one fingerprint per molecule, or ``None`` when the
        fingerprint name is not recognised.
    """
    # Indigo molecules compute their own fingerprints.
    if fingerprint in indigofps:
        return [mol.fingerprint(fingerprint) for mol in mols]

    # RDKit bit-vector fingerprints.
    if fingerprint in rdkitfps:
        bit_builders = {
            "atompair": lambda mol: Pairs.GetAtomPairFingerprintAsBitVect(mol),
            "avalon": lambda mol: pyAvalonTools.GetAvalonFP(mol),
            "daylight": lambda mol: Chem.RDKFingerprint(mol, fpSize=2048),
            "maccs": lambda mol: MACCSkeys.GenMACCSKeys(mol),
            "morgan": lambda mol: AllChem.GetMorganFingerprintAsBitVect(
                mol, 2, nBits=1024),
            "pharm2d": lambda mol: Generate.Gen2DFingerprint(
                mol, Gobbi_Pharm2D.factory),
            "topological": lambda mol: FingerprintMols.FingerprintMol(mol),
        }
        build = bit_builders.get(fingerprint)
        if build is not None:
            return [build(mol) for mol in mols]

    # RDKit non-bit (integer or float) fingerprints.
    if fingerprint in rdkitnonbitfps:
        count_builders = {
            "sheridan": lambda mol: Sheridan.GetBPFingerprint(mol),
            "topotorsion": lambda mol:
                Torsions.GetTopologicalTorsionFingerprint(mol),
        }
        build = count_builders.get(fingerprint)
        if build is not None:
            return [build(mol) for mol in mols]

    # E-state fingerprints: element 0 or 1 of the fingerprinter's result.
    if fingerprint in rdkitestatefps:
        estate_slots = {"estate1": 0, "estate2": 1}
        slot = estate_slots.get(fingerprint)
        if slot is not None:
            return [Fingerprinter.FingerprintMol(mol)[slot] for mol in mols]

    # Unknown fingerprint.
    return None
Beispiel #14
0
    def computeFP(self, typeFP):
        """Compute the requested fingerprint(s) of ``self.mol``.

        ``typeFP`` is "Mol", "MACCS", "pairs", "Torsion", "Morgan", or "All"
        for every one of them. The results are stored in ``self.d_FP``,
        keyed by fingerprint name. When the instance has no ``mol``
        attribute, a message is appended to ``self.log``, ``self.err`` is
        set to 1 and ``self.d_FP`` is left untouched.
        """
        if "mol" not in self.__dict__:
            self.log = self.log + "No smiles prepared\n"
            self.err = 1
            return

        # (name, builder) pairs in the order the fingerprints are computed.
        builders = (
            ("Mol", lambda: FingerprintMols.FingerprintMol(self.mol)),
            ("MACCS", lambda: MACCSkeys.GenMACCSKeys(self.mol)),
            ("pairs", lambda: Pairs.GetAtomPairFingerprint(self.mol)),
            ("Torsion",
             lambda: Torsions.GetTopologicalTorsionFingerprint(self.mol)),
            ("Morgan", lambda: AllChem.GetMorganFingerprint(self.mol, 2)),
        )

        selected = {}
        for label, build in builders:
            if typeFP == label or typeFP == "All":
                selected[label] = build()

        self.d_FP = selected
Beispiel #15
0
def CalculateTopologicalTorsionFingerprint(
        mol: Chem.Mol,
        rtype: str = 'countstring',
        bits: int = 2048) -> Tuple[str, dict, Any]:
    """Calculate Topological Torsion fingerprints.

    :param mol: molecule to fingerprint
    :param rtype: Type of output, may either be:
                  countstring (default), returns the folded counts as a
                  ';'-separated string of ``bits`` integers
                  rdkit, return the native rdkit DataStructs
                  dict, for a dict mapping ``TopolTorsions_<id>`` to count
                  Any other value is treated like countstring.
    :param bits: Number of folded bits (ignored if rtype != 'countstring')
    :return: a str, a dict or the RDKit fingerprint object depending on
             ``rtype`` (exactly one of them, not a tuple, despite the
             annotation)
    """
    res = Torsions.GetTopologicalTorsionFingerprint(mol)
    if rtype == 'rdkit':
        return res
    counts = res.GetNonzeroElements()
    if rtype == 'dict':
        return {f'TopolTorsions_{k}': v for k, v in counts.items()}
    # Fold the sparse ids into ``bits`` integer buckets, summing collisions.
    folded = np.zeros(bits, dtype=np.int64)
    for k, v in counts.items():
        folded[k % bits] += v
    # str.join() only accepts strings; joining the raw numbers raised
    # TypeError, so each count is converted explicitly.
    return ';'.join(map(str, folded.tolist()))
Beispiel #16
0
def CalculateTopologicalTorsionFingerprint(mol):
    """
    #################################################################
    Calculate Topological Torsion Fingerprints

    Usage:

        result=CalculateTopologicalTorsionFingerprint(mol)

        Input: mol is a molecule object.

        Output: result is a 3-tuple:

        (fingerprint length as reported by GetLength(),

        dict of the non-zero elements of the fingerprint,

        the fingerprint object itself, usable for similarity calculations).
    #################################################################
    """
    fingerprint = Torsions.GetTopologicalTorsionFingerprint(mol)
    length = fingerprint.GetLength()
    nonzero = fingerprint.GetNonzeroElements()
    return length, nonzero, fingerprint
Beispiel #17
0
def TORSIONSfpDataFrame(chempandas, namecol, smicol):
    """
    Build a DataFrame of topological-torsion fingerprints.

    ``smicol`` and ``namecol`` are positional column indices of the SMILES
    and the molecule names in ``chempandas``. Rows whose SMILES cannot be
    parsed are skipped. The result is indexed by molecule name and carries
    two extra columns: 'SMILES' (re-generated from the parsed molecule) and
    'CHEMBL' (a copy of the index).
    """
    assert chempandas.shape[0] <= MAXLINES
    parsed = [
        Chem.MolFromSmiles(smiles) for smiles in chempandas.iloc[:, smicol]
    ]
    molsmi = []
    for row, mol in enumerate(parsed):
        if mol is None:
            continue
        # ``row`` still counts the skipped molecules, so it addresses the
        # matching row of the input frame.
        mol.SetProp("_Name", chempandas.iloc[row, namecol])
        molsmi.append(mol)
    # TORSIONS Fingerprints.
    fps = [
        Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
        for mol in molsmi
    ]
    fpsmat = np.matrix(fps)
    names = [mol.GetProp("_Name") for mol in molsmi]
    df = DataFrame(fpsmat, index=names)  # how to name the col?
    df['SMILES'] = [Chem.MolToSmiles(mol) for mol in molsmi]
    df['CHEMBL'] = df.index
    return df
Beispiel #18
0
def fp_torsion(mols, key, nBits, chiral):
    """Store a hashed topological-torsion fingerprint under ``"fp"``.

    For every key of ``mols`` the molecule found at ``mols[k][key]`` is
    fingerprinted with the given ``nBits`` and ``includeChirality=chiral``;
    the result is written to ``mols[k]["fp"]``. Nothing is returned.
    """
    for identifier in mols:
        molecule = mols[identifier][key]
        mols[identifier]["fp"] = \
            Torsions.GetHashedTopologicalTorsionFingerprint(
                molecule, nBits=nBits, includeChirality=chiral)
Beispiel #19
0
# Registry of fingerprint generators: each FPDICT entry maps a short name to
# a one-argument callable taking a molecule ``m``.
# NOTE(review): FPDICT, nbits, nbits_long and USE_AVALON are defined outside
# this excerpt; the lambdas look ``nbits``/``nbits_long`` up at call time.

# Feature-based Morgan fingerprints (useFeatures=True): a bit vector of
# ``nbits`` bits at radius 3, then count-based variants at radius 1, 2 and 3.
FPDICT['fcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=nbits)
FPDICT['fcfc2'] = lambda m: Chem.GetMorganFingerprint(m, 1, useFeatures=True)
FPDICT['fcfc4'] = lambda m: Chem.GetMorganFingerprint(m, 2, useFeatures=True)
FPDICT['fcfc6'] = lambda m: Chem.GetMorganFingerprint(m, 3, useFeatures=True)
# "l"-prefixed variants: Morgan bit vectors folded to ``nbits_long`` bits.
FPDICT['lecfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 2, nBits=nbits_long)
FPDICT['lecfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, nBits=nbits_long)
FPDICT['lfcfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 2, useFeatures=True, nBits=nbits_long)
FPDICT['lfcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=nbits_long)
# MACCS keys, atom pairs and topological torsions.
FPDICT['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m)
FPDICT['ap'] = lambda m: Pairs.GetAtomPairFingerprint(m)
FPDICT['tt'] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
# Hashed bit-vector versions of atom pairs / topological torsions.
FPDICT['hashap'] = lambda m: Desc.GetHashedAtomPairFingerprintAsBitVect(
    m, nBits=nbits)
FPDICT[
    'hashtt'] = lambda m: Desc.GetHashedTopologicalTorsionFingerprintAsBitVect(
        m, nBits=nbits)
# RDKit path-based fingerprints with maximum path length 5, 6 and 7.
FPDICT['rdk5'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=5, fpSize=nbits, nBitsPerHash=2)
FPDICT['rdk6'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=6, fpSize=nbits, nBitsPerHash=2)
FPDICT['rdk7'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=7, fpSize=nbits, nBitsPerHash=2)
# Avalon fingerprints are registered only when USE_AVALON is truthy.
if USE_AVALON:
    FPDICT['avalon'] = lambda m: pyAv.GetAvalonFP(m, nbits)
    FPDICT['avalon_l'] = lambda m: pyAv.GetAvalonFP(m, nbits_long)
Beispiel #20
0
def ClusterOnFingerprint(filename, mols=None, fingerprint=0, cutoff=0.8, metric='Tanimoto', outMatrix=False):
    '''Clustering Structure based on Fingerprints in RDKit

    filename: Smile format file saving molecules. If set to None, use given "mols"
    mols: Input molecules. No use if set up "filename"
    cutoff: Similarity cutoff; ``1 - cutoff`` is handed to ClusterFps
    fingerprint: Fingerprint to use:
        0 or else:  RDKit Topological Fingerprint
        1: MACCS Fingerprint
        2: Atom Pair Fingerprint (AP)
        3: Topological Torsion Fingerprint (TT)
        4: Morgan Fingerprint similar to ECFP4 Fingerprint
        5: Morgan Fingerprint similar to FCFP4 Fingerprint
    metric: Similarity metric for the matrix output (case-insensitive):
            Tanimoto, Dice, Cosine, Sokal, Russel, RogotGoldberg, AllBit,
            Kulczynski, McConnaughey, Asymmetric, BraunBlanquet.
            Unknown names fall back to Tanimoto after printing a warning.
            Only used when outMatrix is True.
    outMatrix: Change output to a similarity matrix
    Return: Default output "clusters, clusterOut":
        clusters: Clusters containing molecule number.
        clusterOut: Molecular Cluster Number in List.
      With outMatrix=True: the similarity matrix as a list of row lists
      (it is also printed, one row per line).
    '''

    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions

    if filename:
        mols = list(Chem.SmilesMolSupplier(filename))
    molnums = len(mols)

    ### Calculate Molecular Fingerprint
    ## MACCS Fingerprint
    if fingerprint == 1:
        fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
    ## Atom Pair Fingerprint (AP)
    elif fingerprint == 2:
        fps = [Pairs.GetAtomPairFingerprint(mol) for mol in mols]
    ## Topological Torsion Fingerprint (TT)
    elif fingerprint == 3:
        fps = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol) for mol in mols]
    ## Morgan Fingerprint similar to ECFP4 Fingerprint
    elif fingerprint == 4:
        fps = [AllChem.GetMorganFingerprint(mol, 2) for mol in mols]
    ## Morgan Fingerprint similar to FCFP4 Fingerprint
    elif fingerprint == 5:
        fps = [AllChem.GetMorganFingerprint(mol, 2, useFeatures=True) for mol in mols]
    ## RDKit Topological Fingerprint
    else:  # fingerprint == 0
        fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    if outMatrix:
        ### Output the Fingerprint similarity Matrix
        # Keys must be lower case because the lookup uses metric.lower();
        # the former "rogotGoldberg" key could never be matched.
        metricsAvailable = {
            "tanimoto": DataStructs.TanimotoSimilarity,
            "dice": DataStructs.DiceSimilarity,
            "cosine": DataStructs.CosineSimilarity,
            "sokal": DataStructs.SokalSimilarity,
            "russel": DataStructs.RusselSimilarity,
            "rogotgoldberg": DataStructs.RogotGoldbergSimilarity,
            "allbit": DataStructs.AllBitSimilarity,
            "kulczynski": DataStructs.KulczynskiSimilarity,
            "mcconnaughey": DataStructs.McConnaugheySimilarity,
            "asymmetric": DataStructs.AsymmetricSimilarity,
            "braunblanquet": DataStructs.BraunBlanquetSimilarity,
        }

        if metric.lower() not in metricsAvailable:
            print("The given metric is unknown!")
            metric = 'Tanimoto'

        simMetrics = metricsAvailable[metric.lower()]

        ### Calculate Fingerprint similarity Matrix
        # One fresh list per row: ``[[0.0] * n] * n`` would make every row
        # the same list object. Rows are indexed [i][j]; a list cannot be
        # indexed with the tuple ``[i, j]``.
        simdm = [[0.0] * molnums for _ in range(molnums)]
        for i in range(molnums):
            simdm[i][i] = 1.0
            for j in range(i + 1, molnums):
                # Both directions are evaluated because not every metric is
                # symmetric.
                simdm[i][j] = DataStructs.FingerprintSimilarity(fps[i], fps[j], metric=simMetrics)
                simdm[j][i] = DataStructs.FingerprintSimilarity(fps[j], fps[i], metric=simMetrics)

        for row in simdm:
            print()
            for value in row:
                print('%3.2f' % value, end=' ')
        return simdm

    # NOTE(review): ``metric`` is not forwarded here; the clustering always
    # uses 'Tanimoto', as in the original code.
    clusters = ClusterFps(fps, cutoff=1 - cutoff, metric='Tanimoto')
    clusterOut = [0] * len(mols)
    # Cluster numbers start at 1; 0 marks a molecule in no cluster.
    for clusterID, cluster in enumerate(clusters, start=1):
        for idx in cluster:
            clusterOut[idx] = clusterID
    return clusters, clusterOut
Beispiel #21
0
def fp_torsion_taut(query, nBits, chiral):
    """Fingerprint every tautomer listed under ``query[k]["tauts"]``.

    The hashed topological-torsion fingerprint of the tautomer at position
    ``j`` is stored in place as ``query[k]["fp<j>"]`` (``fp0``, ``fp1``, ...).
    Nothing is returned.
    """
    for name in query:
        for position, tautomer in enumerate(query[name]["tauts"]):
            query[name][f"fp{position}"] = \
                Torsions.GetHashedTopologicalTorsionFingerprint(
                    tautomer, nBits=nBits, includeChirality=chiral)
Beispiel #22
0
def fp_torsion_mp(mol, i, nBits, chiral):
    """Return ``(i, fingerprint)`` for one molecule.

    The fingerprint is the hashed topological-torsion fingerprint of ``mol``
    with the given ``nBits`` and ``includeChirality=chiral``; ``i`` is
    passed through unchanged.
    """
    fingerprint = Torsions.GetHashedTopologicalTorsionFingerprint(
        mol, includeChirality=chiral, nBits=nBits)
    return i, fingerprint
Beispiel #23
0
def sim_two_serial():
    """Interactively compute a consensus similarity matrix for two SMILES lists.

    Prompts for two CSV paths, reads the ``smiles`` column of each, and
    compares every molecule of list 1 with every molecule of list 2. Eight
    fingerprints are combined: RDKit topological, MACCS, atom pairs,
    topological torsions, and Morgan bit vectors at radius 2 and 3 with and
    without features. Each similarity is divided by a fixed constant
    (0.6-0.85), the weighted terms are averaged, and the result is divided
    by ``maxs``, the value that average takes when every individual
    similarity is 1.

    Returns:
        A ``pandas.DataFrame`` of shape ``(len(list 1), len(list 2))``. Rows
        and columns that belong to an unparsable SMILES are filled with the
        sentinel value 10.
    """
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    def parse_smiles(smiles, label):
        """Return (molecules, bad_indices) for one SMILES column."""
        molecules, bad = [], []
        for idx, smi in enumerate(smiles):
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                print('smile with number:', idx,
                      'in list ' + label + ' could not be converted to molecule')
                bad.append(idx)
                continue
            molecules.append(mol)
        return molecules, bad

    def build_fingerprints(molecules, bad):
        """Return the eight fingerprint lists, aligned with the CSV rows."""
        fps = {
            'topol': [FingerprintMols.FingerprintMol(x) for x in molecules],
            'maccs': [MACCSkeys.GenMACCSKeys(x) for x in molecules],
            'pairs': [Pairs.GetAtomPairFingerprint(x) for x in molecules],
            'tts': [
                Torsions.GetTopologicalTorsionFingerprintAsIntVect(x)
                for x in molecules
            ],
            'ecfp4': [
                AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024)
                for x in molecules
            ],
            'ecfp6': [
                AllChem.GetMorganFingerprintAsBitVect(x, 3, nBits=1024)
                for x in molecules
            ],
            'fcfp4': [
                AllChem.GetMorganFingerprintAsBitVect(
                    x, 2, nBits=1024, useFeatures=True) for x in molecules
            ],
            'fcfp6': [
                AllChem.GetMorganFingerprintAsBitVect(
                    x, 3, nBits=1024, useFeatures=True) for x in molecules
            ],
        }
        # Re-insert a placeholder at every bad index (ascending order) so
        # that list positions equal CSV row numbers again.
        for column in fps.values():
            for idx in bad:
                column.insert(idx, 1)
        return fps

    def consensus(fps_a, i, fps_b, j):
        """Weighted mean of the scaled similarities for one molecule pair."""
        topol = DataStructs.FingerprintSimilarity(fps_a['topol'][i], fps_b['topol'][j])
        maccs = DataStructs.FingerprintSimilarity(fps_a['maccs'][i], fps_b['maccs'][j])
        pairs = DataStructs.DiceSimilarity(fps_a['pairs'][i], fps_b['pairs'][j])
        tts = DataStructs.DiceSimilarity(fps_a['tts'][i], fps_b['tts'][j])
        ecfp4 = DataStructs.FingerprintSimilarity(fps_a['ecfp4'][i], fps_b['ecfp4'][j])
        ecfp6 = DataStructs.FingerprintSimilarity(fps_a['ecfp6'][i], fps_b['ecfp6'][j])
        fcfp4 = DataStructs.FingerprintSimilarity(fps_a['fcfp4'][i], fps_b['fcfp4'][j])
        fcfp6 = DataStructs.FingerprintSimilarity(fps_a['fcfp6'][i], fps_b['fcfp6'][j])
        return (0.5 * (ecfp4 / 0.65 + ecfp6 / 0.6) +
                0.5 * (fcfp4 / 0.65 + fcfp6 / 0.6) +
                0.5 * (tts / 0.7 + pairs / 0.7) +
                maccs / 0.85 + topol / 0.75) / 5

    #Load Data-----------------------------------------------------------------------
    path1 = input("Path for list 1: ")
    path2 = input("Path for list 2: ")

    smis1 = pd.read_csv(path1)["smiles"]
    smis2 = pd.read_csv(path2)["smiles"]
    l1 = len(smis1)
    l2 = len(smis2)
    l = l1 * l2
    # Progress is reported roughly every 5 %. round() yields 0 for small
    # inputs, which made ``k % lp`` raise ZeroDivisionError, so clamp to 1.
    lp = max(1, round(l / 20))

    #Get molecules from smiles-----------------------------------------------------------------------
    molecules1, bad1 = parse_smiles(smis1, '1')
    molecules2, bad2 = parse_smiles(smis2, '2')

    #Final output matrix-----------------------------------------------------------------------
    similarity = np.zeros(shape=(l1, l2), dtype=np.float32)

    print('Begining fingerprint calculation...wait')
    fps1 = build_fingerprints(molecules1, bad1)
    print('Begining fingerprint calculation...50%')
    fps2 = build_fingerprints(molecules2, bad2)
    print('Begining fingerprint calculation...done\n')

    print('Begining of fingerprints similarity calculation\n')

    # Sets give O(1) membership tests inside the double loop.
    skip1 = set(bad1)
    skip2 = set(bad2)

    k = 0
    # Value of the consensus score when every single similarity equals 1.
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)
    for i in range(l1):
        for j in range(l2):
            if i not in skip1 and j not in skip2:
                similarity[i][j] = consensus(fps1, i, fps2, j)
            k = k + 1
            if k % lp == 0:
                print('running:', (k / l) * 100, '%')

    similarity = similarity / maxs
    # Sentinel for pairs that involve an unparsable SMILES.
    similarity[bad1, :] = 10
    similarity[:, bad2] = 10

    print('End of fingerprints similarity calculation')

    return pd.DataFrame(similarity)
Beispiel #24
0
def sim_one_serial():
    """Interactively compute a consensus self-similarity matrix for one SMILES list.

    Prompts for a CSV path, reads its ``smiles`` column and compares every
    molecule with every other one. Eight fingerprints are combined: RDKit
    topological, MACCS, atom pairs, topological torsions, and Morgan bit
    vectors at radius 2 and 3 with and without features. Each similarity is
    divided by a fixed constant (0.6-0.85), the weighted terms are averaged,
    and the result is divided by ``maxs``, the value that average takes when
    every individual similarity is 1. Only the lower triangle is computed;
    the upper triangle is filled by mirroring.

    Returns:
        A square ``pandas.DataFrame`` with one row and column per input
        SMILES. Rows and columns that belong to an unparsable SMILES are
        filled with the sentinel value 10.
    """
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    def consensus(fps, i, j):
        """Weighted mean of the scaled similarities for one molecule pair."""
        topol = DataStructs.FingerprintSimilarity(fps['topol'][i], fps['topol'][j])
        maccs = DataStructs.FingerprintSimilarity(fps['maccs'][i], fps['maccs'][j])
        pairs = DataStructs.DiceSimilarity(fps['pairs'][i], fps['pairs'][j])
        tts = DataStructs.DiceSimilarity(fps['tts'][i], fps['tts'][j])
        ecfp4 = DataStructs.FingerprintSimilarity(fps['ecfp4'][i], fps['ecfp4'][j])
        ecfp6 = DataStructs.FingerprintSimilarity(fps['ecfp6'][i], fps['ecfp6'][j])
        fcfp4 = DataStructs.FingerprintSimilarity(fps['fcfp4'][i], fps['fcfp4'][j])
        fcfp6 = DataStructs.FingerprintSimilarity(fps['fcfp6'][i], fps['fcfp6'][j])
        return (0.5 * (ecfp4 / 0.65 + ecfp6 / 0.6) +
                0.5 * (fcfp4 / 0.65 + fcfp6 / 0.6) +
                0.5 * (tts / 0.7 + pairs / 0.7) +
                maccs / 0.85 + topol / 0.75) / 5

    #Load Data-----------------------------------------------------------------------
    path = input("Path for list : ")
    smis = pd.read_csv(path)["smiles"]
    l = len(smis)
    # Progress is reported roughly every 5 % of l*l. round() yields 0 for
    # l <= 3, which made ``k % lp`` raise ZeroDivisionError, so clamp to 1.
    lp = max(1, round(l * l / 20))

    #Get molecules from smiles-----------------------------------------------------------------------
    bad = []
    molecules = []
    for i, smi in enumerate(smis):
        m = Chem.MolFromSmiles(smi)
        if m is None:
            print('smile with number:', i,
                  'in list could not be converted to molecule')
            bad.append(i)
            continue
        molecules.append(m)

    #Final output matrix-----------------------------------------------------------------------
    similarity = np.zeros(shape=(l, l), dtype=np.float32)

    print('Begining fingerprint calculation...wait')
    fps = {
        'topol': [FingerprintMols.FingerprintMol(x) for x in molecules],
        'maccs': [MACCSkeys.GenMACCSKeys(x) for x in molecules],
        'pairs': [Pairs.GetAtomPairFingerprint(x) for x in molecules],
        'tts': [
            Torsions.GetTopologicalTorsionFingerprintAsIntVect(x)
            for x in molecules
        ],
        'ecfp4': [
            AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024)
            for x in molecules
        ],
        'ecfp6': [
            AllChem.GetMorganFingerprintAsBitVect(x, 3, nBits=1024)
            for x in molecules
        ],
        'fcfp4': [
            AllChem.GetMorganFingerprintAsBitVect(
                x, 2, nBits=1024, useFeatures=True) for x in molecules
        ],
        'fcfp6': [
            AllChem.GetMorganFingerprintAsBitVect(
                x, 3, nBits=1024, useFeatures=True) for x in molecules
        ],
    }
    print('Begining fingerprint calculation...done\n')

    # Re-insert a placeholder at every bad index (ascending order) so that
    # list positions equal CSV row numbers again.
    for column in fps.values():
        for j in bad:
            column.insert(j, 1)

    print('Begining of fingerprints similarity calculation\n')

    # A set gives O(1) membership tests inside the double loop.
    skip = set(bad)

    k = 0
    # Value of the consensus score when every single similarity equals 1.
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)
    for i in range(l):
        # Lower triangle including the diagonal (j <= i).
        for j in range(i + 1):
            if i not in skip and j not in skip:
                similarity[i][j] = consensus(fps, i, j)
                similarity[j][i] = similarity[i][j]
            k = k + 1
            if k % lp == 0:
                print('running:', (k / (l * l / 2)) * 100, '%')

    similarity = similarity / maxs
    # Sentinel for pairs that involve an unparsable SMILES.
    similarity[bad, :] = 10
    similarity[:, bad] = 10

    print('End of fingerprints similarity calculation')

    return pd.DataFrame(similarity)
Beispiel #25
0
def fp_torsion_taut_mp(taut, i, k, nBits, chiral):
    """Fingerprint a single tautomer with hashed topological torsions.

    Args:
        taut: Molecule handed to
            ``Torsions.GetHashedTopologicalTorsionFingerprint``.
        i: Caller-supplied tag, returned unchanged as the first tuple item.
        k: Caller-supplied tag, returned unchanged as the last tuple item.
        nBits: Forwarded as the ``nBits`` keyword of the fingerprint call.
        chiral: Forwarded as the ``includeChirality`` keyword.

    Returns:
        The tuple ``(i, fingerprint, k)``.

    NOTE(review): the ``_mp`` suffix suggests this is a worker for a
    multiprocessing pool and that ``i``/``k`` let the caller re-associate
    results; confirm against the call site.
    """
    hashed_tt = Torsions.GetHashedTopologicalTorsionFingerprint(
        taut,
        includeChirality=chiral,
        nBits=nBits,
    )
    return i, hashed_tt, k
Beispiel #26
0
def BuildTorsionsFP(mol):
    """Build the topological-torsion fingerprint of ``mol``.

    The fingerprint's total value (``GetTotalVal()``) is computed once and
    stored on the returned object as ``_sumCache``.

    Args:
        mol: Molecule passed to
            ``Torsions.GetTopologicalTorsionFingerprintAsIntVect``.

    Returns:
        The fingerprint object, carrying the extra ``_sumCache`` attribute.
    """
    # Imported here, as in the original, so RDKit is only loaded on first use.
    from rdkit.Chem.AtomPairs import Torsions

    torsion_fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
    total = torsion_fp.GetTotalVal()
    # NOTE(review): presumably read later by similarity code to avoid
    # re-summing the vector; the consumer is not visible here.
    torsion_fp._sumCache = total
    return torsion_fp
 def Calc_Torsions(self):
     """Return a list of topological-torsion fingerprints for ``self.sd``.

     Each item yielded by iterating ``self.sd`` is passed to
     ``Torsions.GetTopologicalTorsionFingerprintAsIntVect``; the results are
     returned in iteration order.
     """
     return list(
         map(Torsions.GetTopologicalTorsionFingerprintAsIntVect, self.sd))
    '/drug_development/studyRdkit/st_rdcit/img/mol21.jpg'
)
# Atom-pair fingerprints: one per molecule in `ms`.
pairFps = [Pairs.GetAtomPairFingerprint(x) for x in ms]
print(pairFps)
# Because the bit space covered by atom-pair fingerprints is very large, they
# are stored sparsely; the non-zero entries can be retrieved as a dictionary.
d = pairFps[-1].GetNonzeroElements()
print(d)  # {541732: 1, 558113: 2, 558115: 2, 558146: 1, 1606690: 2, 1606721: 2}
print(d[541732])  # 1
# A human-readable description of a single bit can be obtained as follows.
de = Pairs.ExplainPairScore(558115)
print(de)  # (('C', 1, 0), 3, ('C', 2, 0))
# The above means: C with 1 neighbor and 0 pi electrons which is 3 bonds from a C with 2 neighbors and 0 pi electrons
# (Each atom is described as (element, number of neighbors, number of pi
# electrons); the middle value is the distance between the two atoms.)
# # 2.4 Topological torsions

# Topological-torsion fingerprints for the same molecules, compared with the
# Dice similarity.
tts = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(x) for x in ms]
d_ds = DataStructs.DiceSimilarity(tts[0], tts[1])
print(d_ds)  # 0.16666666666666666
# # 2.5 Morgan fingerprints (circular fingerprints): AllChem.GetMorganFingerprint(mol, 2)
# This family of fingerprints is built by applying the Morgan algorithm to a
# set of user-supplied atom invariants. When generating Morgan fingerprints,
# the radius of the fingerprint must also be provided (2 below).
m1 = Chem.MolFromSmiles('Cc1ccccc1')
m2 = Chem.MolFromSmiles('Cc1ncccc1')

fp1 = AllChem.GetMorganFingerprint(m1, 2)
fp2 = AllChem.GetMorganFingerprint(m2, 2)
d_mf = DataStructs.DiceSimilarity(fp1, fp2)
print(d_mf)  # 0.55

# Like atom pairs and topological torsions, Morgan fingerprints use counts by
# default, but they can also be computed as bit vectors (here 1024 bits).
# Note that `fp1` and `fp2` are rebound to the bit-vector versions.
fp1 = AllChem.GetMorganFingerprintAsBitVect(m1, 2, nBits=1024)
fp2 = AllChem.GetMorganFingerprintAsBitVect(m2, 2, nBits=1024)
            similarities_pairs[i][j] = 1
    if i % 500 == 0:
        print('running:', i / len(fps_pairs) * 100, '%')

# In[ ]:

# Wrap the `similarities_pairs` matrix in a DataFrame and write it to
# 'similarities_pairs.csv' (relative path, so it lands in the current working
# directory). With pandas' defaults the row index is written as well.
df = pd.DataFrame(similarities_pairs)
df.to_csv('similarities_pairs.csv')

# ### Topological torsion descriptors

# In[ ]:

from rdkit.Chem.AtomPairs import Torsions

# One topological-torsion fingerprint per entry of `molecules`, in input order.
fps_tts = list(
    map(Torsions.GetTopologicalTorsionFingerprintAsIntVect, molecules))
# Zero-filled square matrix with one row and one column per fingerprint; it is
# filled in by the similarity loop that follows.
similarities_tts = np.zeros((len(fps_tts),) * 2)

# In[ ]:

# Compute the pairwise similarity matrix. Comment this section out if only
# the fingerprints are needed.
# The matrix is symmetric, so only the pairs below the diagonal (j < i) are
# evaluated and each value is mirrored; the previous version iterated over
# every (i, j) pair and threw half of them away with an `if i > j` test.
for i in range(len(fps_tts)):
    # A fingerprint compared with itself: similarity fixed to 1.
    similarities_tts[i][i] = 1
    for j in range(i):
        similarities_tts[i][j] = DataStructs.DiceSimilarity(
            fps_tts[i],
            fps_tts[j])  # default is the Dice similarity for these fps
        similarities_tts[j][i] = similarities_tts[i][j]
)
# Register fingerprint generators in `fpdict`: each value is a one-argument
# callable taking a molecule `m` and returning its fingerprint.
# NOTE: the lambdas look up `longbits`, `nbits` and `n_cas_bits` when they are
# called, not when they are defined, so later rebinding of those module-level
# names changes the fingerprint size.

# Morgan bit vector, radius 2, `longbits` bits.
fpdict["lecfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect(
    m, 2, nBits=longbits
)
# Morgan bit vector, radius 3, `longbits` bits.
fpdict["lecfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect(
    m, 3, nBits=longbits
)
# Morgan bit vector with useFeatures=True, radius 2, `longbits` bits.
fpdict["lfcfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect(
    m, 2, useFeatures=True, nBits=longbits
)
# Morgan bit vector with useFeatures=True, radius 3, `longbits` bits.
fpdict["lfcfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=longbits
)
# MACCS keys.
fpdict["maccs"] = lambda m: MACCSkeys.GenMACCSKeys(m)
# Atom-pair fingerprint (unhashed).
fpdict["ap"] = lambda m: Pairs.GetAtomPairFingerprint(m)
# Topological-torsion fingerprint (unhashed).
fpdict["tt"] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
# Hashed atom-pair bit vector, `nbits` bits.
fpdict[
    "hashap"
] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
    m, nBits=nbits
)
# Hashed atom-pair bit vector, `n_cas_bits` bits.
# NOTE(review): the meaning of "cas_length" is not visible here; confirm.
fpdict[
    "hashap_cas_length"
] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
    m, nBits=n_cas_bits
)
# Hashed topological-torsion bit vector, `nbits` bits.
fpdict[
    "hashtt"
] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
    m, nBits=nbits
)