def get_similarity_subset(fp1, fp2):
    """
    Return the Tanimoto and Dice similarity of two fingerprints.

    Both fingerprints are handed unchanged to the RDKit ``DataStructs``
    similarity functions. According to the original author these two metrics
    accept ExplicitBitVect as well as other fingerprint formats.

    :param fp1: first fingerprint
    :param fp2: second fingerprint
    :return: two-element list ``[tanimoto, dice]``
    """
    tanimoto = DataStructs.TanimotoSimilarity(fp1, fp2)
    dice = DataStructs.DiceSimilarity(fp1, fp2)
    return [tanimoto, dice]
Esempio n. 2
0
 def testTorsionValues(self):
   """Compare freshly computed topological-torsion fingerprints with stored ones.

   Each test case pairs a SMILES string with a base64-encoded pickle of the
   expected ``LongSparseIntVect``. The computed fingerprint must have a Dice
   similarity of 1.0 to the stored one and compare equal to it.
   """
   import base64
   testD = (
     ('CCCO', b'AQAAAAgAAAD/////DwAAAAEAAAAAAAAAIECAAAMAAAABAAAA\n'),
     ('CNc1ccco1',
      b'AQAAAAgAAAD/////DwAAAAkAAAAAAAAAIICkSAEAAAABAAAAKVKgSQEAAAABAAAAKVCgUAEAAAAB\nAAAAKVCgUQEAAAABAAAAKVCkCAIAAAABAAAAKdCkCAIAAAABAAAAKVCgSAMAAAABAAAAKVCkSAMA\nAAABAAAAIICkSAMAAAABAAAA\n'
      ), )
   for smi, txt in testD:
     # base64.decodestring was removed in Python 3.9. b64decode exists on
     # every Python version and, by default, skips the embedded newlines.
     pkl = base64.b64decode(txt)
     fp = rdMD.GetTopologicalTorsionFingerprint(Chem.MolFromSmiles(smi))
     fp2 = DataStructs.LongSparseIntVect(pkl)
     self.assertEqual(DataStructs.DiceSimilarity(fp, fp2), 1.0)
     self.assertEqual(fp, fp2)
    def calculate_dice_similarity_distance(self, i, j):
        """
        Return the Dice distance between two stored fingerprints.

        The distance is ``1 - DiceSimilarity`` of the fingerprints found at
        positions ``i`` and ``j`` of ``self.fingerprint_list``.

        :param i: index of the first fingerprint in ``self.fingerprint_list``
        :param j: index of the second fingerprint in ``self.fingerprint_list``
        :return: one minus the Dice similarity of the two fingerprints
        """
        first = self.fingerprint_list[i]
        second = self.fingerprint_list[j]
        similarity = DataStructs.DiceSimilarity(first, second)
        return 1 - similarity
Esempio n. 4
0
    def test5Dice(self):
        """Check ``DiceSimilarity`` on small ``IntSparseIntVect`` count vectors."""
        # A vector compared with itself must give a similarity of 1.
        same = ds.IntSparseIntVect(5)
        for pos, count in ((4, 4), (0, 2), (3, 1)):
            same[pos] = count
        self.assertTrue(feq(ds.DiceSimilarity(same, same), 1.0))

        left = ds.IntSparseIntVect(5)
        for pos, count in ((0, 2), (2, 1), (3, 4), (4, 6)):
            left[pos] = count
        right = ds.IntSparseIntVect(5)
        for pos, count in ((1, 2), (2, 3), (3, 4), (4, 4)):
            right[pos] = count

        # 18/26 equals twice the summed element-wise minima (1 + 4 + 4)
        # divided by the total of all counts (13 + 13); the result must not
        # depend on argument order.
        expected = 18.0 / 26.
        self.assertTrue(feq(ds.DiceSimilarity(left, right), expected))
        self.assertTrue(feq(ds.DiceSimilarity(right, left), expected))
Esempio n. 5
0
def sim_rdk_topo_fps(smiA, smisT):
    """Dice similarity of one SMILES against many, using atom-pair fingerprints.

    Every SMILES is converted with ``getMolFromSmiles`` and fingerprinted with
    ``Pairs.GetAtomPairFingerprint``.

    :param smiA: query SMILES string
    :param smisT: iterable of SMILES strings to compare the query against
    :return: list of Dice similarities, one per entry of ``smisT``, in order
    """
    query_fp = Pairs.GetAtomPairFingerprint(getMolFromSmiles(smiA))
    target_fps = [
        Pairs.GetAtomPairFingerprint(getMolFromSmiles(smi)) for smi in smisT
    ]
    return [DataStructs.DiceSimilarity(query_fp, fp) for fp in target_fps]
Esempio n. 6
0
def morgan_similarity(smiles_1: List[str], smiles_2: List[str], radius: int,
                      sample_rate: float):
    """
    Print Dice-similarity statistics between the Morgan fingerprints of two
    lists of SMILES strings.

    All pairs from the (optionally sub-sampled) lists are compared. The mean,
    standard deviation, minimum, maximum and deciles of the pairwise Dice
    similarities are printed; nothing is returned.

    :param smiles_1: A list of smiles strings.
    :param smiles_2: A list of smiles strings.
    :param radius: The radius of the morgan fingerprints.
    :param sample_rate: Rate at which to sample pairs of molecules for Morgan
        similarity (to reduce time). Values below 1.0 draw, with replacement,
        ``ceil(sqrt(sample_rate * len(smiles_1) * len(smiles_2)))`` entries
        from each list.
    """
    # Sample to improve speed
    if sample_rate < 1.0:
        sample_num_pairs = sample_rate * len(smiles_1) * len(smiles_2)
        sample_size = math.ceil(math.sqrt(sample_num_pairs))
        sample_smiles_1 = np.random.choice(smiles_1,
                                           size=sample_size,
                                           replace=True)
        sample_smiles_2 = np.random.choice(smiles_2,
                                           size=sample_size,
                                           replace=True)
    else:
        sample_smiles_1, sample_smiles_2 = smiles_1, smiles_2

    # Parse and fingerprint every molecule exactly once. Doing this inside
    # the pair loop would repeat the work for every pair a molecule is in.
    fps_1 = [
        AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smile), radius)
        for smile in sample_smiles_1
    ]
    fps_2 = [
        AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smile), radius)
        for smile in sample_smiles_2
    ]

    # Compute similarities for the full cross product of the two samples
    total_pairs = len(fps_1) * len(fps_2)
    similarities = np.array([
        DataStructs.DiceSimilarity(fp_1, fp_2)
        for fp_1, fp_2 in tqdm(product(fps_1, fps_2), total=total_pairs)
    ])

    # Print results
    print()
    print(
        f'Average dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}'
    )
    print(f'Minimum dice similarity = {np.min(similarities):.4f}')
    print(f'Maximum dice similarity = {np.max(similarities):.4f}')
    print()
    print('Percentiles for dice similarity')
    print(' | '.join([
        f'{i}% = {np.percentile(similarities, i):.4f}'
        for i in range(0, 101, 10)
    ]))
Esempio n. 7
0
def rd_fingerprint_evaluation(references, candidates):
    """
    Score candidates against references with RDKit path-based fingerprints.

    For every key of ``references`` that is also in ``candidates`` both
    entries are fingerprinted with ``rdmolops.RDKFingerprint`` (fpSize=2048,
    minPath=1, maxPath=7) and five similarities are computed, each rounded to
    four decimals. Keys missing from ``candidates`` contribute 0 to every
    metric. The mean of each metric is printed.

    :param references: mapping from key to reference molecule
    :param candidates: mapping from key to candidate molecule
    :return: mean Tanimoto similarity, rounded to four decimals
    """
    print("Calculating Similarity via RDFIngerprint Path Similarity")
    # One score list per metric: Tanimoto, Dice, Cosine, Sokal, McConnaughey
    per_metric = [[], [], [], [], []]
    for key in references:
        if key in candidates:
            cand_fp = rdmolops.RDKFingerprint(candidates[key],
                                              fpSize=2048,
                                              minPath=1,
                                              maxPath=7)
            ref_fp = rdmolops.RDKFingerprint(references[key],
                                             fpSize=2048,
                                             minPath=1,
                                             maxPath=7)
            row = [
                round(DataStructs.TanimotoSimilarity(ref_fp, cand_fp), 4),
                round(DataStructs.DiceSimilarity(ref_fp, cand_fp), 4),
                round(DataStructs.CosineSimilarity(ref_fp, cand_fp), 4),
                round(DataStructs.SokalSimilarity(ref_fp, cand_fp), 4),
                round(DataStructs.McConnaugheySimilarity(ref_fp, cand_fp), 4),
            ]
        else:
            # No candidate for this key: counted as zero similarity.
            row = [0, 0, 0, 0, 0]
        for scores, value in zip(per_metric, row):
            scores.append(value)
    tanimoto, dice, cosine, sokal, mcconnaughey = per_metric
    print("Done Calculating Similarity via RDFIngerprint Path Similarity")
    print("##########################################")
    print("Tanimoto Similarity:{}".format(round(np.mean(tanimoto), 4)))
    print("Dice Similarity:{}".format(round(np.mean(dice), 4)))
    print("Cosine Similarity:{}".format(round(np.mean(cosine), 4)))
    print("Sokal Similarity:{}".format(round(np.mean(sokal), 4)))
    print("McConnaughey Similarity:{}".format(
        round(np.mean(mcconnaughey), 4)))
    print("##########################################")
    return round(np.mean(tanimoto), 4)
Esempio n. 8
0
 def getSimilarity(self,
                   reference,
                   method='tanimoto',
                   alpha=None,
                   beta=None):
     """Return the similarity between ``reference.IFPvector`` and ``self.IFPvector``.

     :param reference: object exposing an ``IFPvector`` attribute
     :param method: ``'tanimoto'``, ``'dice'`` or ``'tversky'``
     :param alpha: passed to ``TverskySimilarity`` (used by ``'tversky'`` only)
     :param beta: passed to ``TverskySimilarity`` (used by ``'tversky'`` only)
     :return: the similarity value, or ``None`` for any other ``method``
     """
     # Unknown methods fall through without touching either vector.
     if method not in ('tanimoto', 'dice', 'tversky'):
         return None
     ref_vec = reference.IFPvector
     own_vec = self.IFPvector
     if method == 'tversky':
         return DataStructs.TverskySimilarity(ref_vec, own_vec, alpha, beta)
     if method == 'dice':
         return DataStructs.DiceSimilarity(ref_vec, own_vec)
     return DataStructs.TanimotoSimilarity(ref_vec, own_vec)
Esempio n. 9
0
 def testPairValues(self):
   """Compare freshly computed atom-pair fingerprints with stored ones.

   Each test case pairs a SMILES string with a base64-encoded pickle of the
   expected ``IntSparseIntVect``. The computed fingerprint must have a Dice
   similarity of 1.0 to the stored one and compare equal to it.
   """
   import base64
   testD = (
     ('CCCO',
      b'AQAAAAQAAAAAAIAABgAAACGECAABAAAAIoQIAAEAAABBhAgAAQAAACNEGAABAAAAQUQYAAEAAABC\nRBgAAQAAAA==\n'
      ),
     ('CNc1ccco1',
      b'AQAAAAQAAAAAAIAAEAAAACOECgABAAAAJIQKAAIAAABBhQoAAgAAAEKFCgABAAAAIsQKAAEAAABB\nxQoAAQAAAELFCgACAAAAIYQQAAEAAABChRAAAQAAAEOFEAACAAAAYYUQAAEAAAAjhBoAAQAAAEGF\nGgABAAAAQoUaAAIAAABhhRoAAQAAAEKIGgABAAAA\n'
      ), )
   for smi, txt in testD:
     # base64.decodestring was removed in Python 3.9. b64decode exists on
     # every Python version and, by default, skips the embedded newlines.
     pkl = base64.b64decode(txt)
     fp = rdMD.GetAtomPairFingerprint(Chem.MolFromSmiles(smi))
     fp2 = DataStructs.IntSparseIntVect(pkl)
     self.assertEqual(DataStructs.DiceSimilarity(fp, fp2), 1.0)
     self.assertEqual(fp, fp2)
Esempio n. 10
0
def ECFP6_fp(mol, rc_names):
    """Cluster molecules by Morgan-fingerprint Dice similarity into a tree dict.

    A square table of pairwise Dice similarities (Morgan fingerprints, radius
    3) is built, its rows are clustered with Ward linkage, and the resulting
    tree is converted into a nested dict via ``add_node`` and ``label_tree``.

    :param mol: iterable of RDKit molecules
    :param rc_names: labels for the molecules, in the same order as ``mol``
    :return: nested dict with ``children`` and ``name`` keys describing the
        dendrogram
    """
    fps = [AllChem.GetMorganFingerprint(x, 3) for x in mol]
    tc_df = pd.DataFrame(index=rc_names, columns=rc_names).fillna(0)

    for c1, fp_a in enumerate(fps):
        tc_df[rc_names[c1]] = [
            DataStructs.DiceSimilarity(fp_a, fp_b) for fp_b in fps
        ]
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
    # same array and is available in every pandas version.
    clusters = linkage(tc_df.values, "ward")
    clust_tree = to_tree(clusters, rd=False)
    d3Dendro = dict(children=[], name=" ")
    add_node(clust_tree, d3Dendro)
    label_tree(d3Dendro["children"][0], rc_names)

    return d3Dendro
Esempio n. 11
0
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X,
           simType):
    """Summarise how similar ``predEx`` is to its ``X`` nearest training examples.

    The query molecule is parsed from ``predEx[smilesAttrName].value`` and
    compared with every training fingerprint. The ``X`` highest similarities
    are then summarised.

    :param trainSmilesList: items passed directly to the fingerprint function;
        despite the name these are presumably RDKit molecules - TODO confirm
    :param train: training examples, in the same order as ``trainSmilesList``
    :param predEx: example to predict; must expose ``[smilesAttrName].value``
    :param smilesAttrName: name of the attribute holding the SMILES string
    :param nameAttr: name of the attribute used as key for each training example
    :param X: number of nearest neighbours to keep
    :param simType: ``"Topological"``, ``"Morgan"`` or ``"MACCS"``
    :return: tuple ``(medSim, stdSim, minSim, maxSim, entropy, entropyClosest)``,
        each rounded to three decimals
    :raises ValueError: if ``simType`` is not one of the supported names
    """
    if simType == "Topological":
        fpsTrain = [FingerprintMols.FingerprintMol(x) for x in trainSmilesList]
        fp = FingerprintMols.FingerprintMol(
            Chem.MolFromSmiles(predEx[smilesAttrName].value))
        simFunc = DataStructs.FingerprintSimilarity
    elif simType == "Morgan":
        fpsTrain = [
            AllChem.GetMorganFingerprint(x, 2) for x in trainSmilesList
        ]
        fp = AllChem.GetMorganFingerprint(
            Chem.MolFromSmiles(predEx[smilesAttrName].value), 2)
        simFunc = DataStructs.DiceSimilarity
    elif simType == "MACCS":
        fpsTrain = [MACCSkeys.GenMACCSKeys(x) for x in trainSmilesList]
        fp = MACCSkeys.GenMACCSKeys(
            Chem.MolFromSmiles(predEx[smilesAttrName].value))
        simFunc = DataStructs.FingerprintSimilarity
    else:
        # Previously this only printed the message and then failed later on
        # an unbound variable; fail immediately with a clear error instead.
        raise ValueError("This type of sim is not implemented " + str(simType))

    simDict = {}
    simList = []
    for idx, ex in enumerate(train):
        sim = simFunc(fpsTrain[idx], fp)
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    # Keep only the X most similar training examples.
    simList.sort(reverse=True)
    simList = simList[0:X]
    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)

    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    # Floor division keeps the slice bound an integer on Python 3 as well.
    entropyClosest = round(
        getRespVar(simList[0:X // 2], simDict, train, nameAttr), 3)

    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
def calulate_similarities(ids, radius):
    """Build a table of Dice similarities between selected and all molecules.

    Relies on a module-level ``smiles`` table with a ``smiles`` column. Every
    row of that table is fingerprinted (Morgan, given ``radius``); each id in
    ``ids`` is then looked up with ``smiles.loc`` and compared against all of
    them.

    :param ids: index labels of ``smiles`` to compute rows for
    :param radius: Morgan fingerprint radius
    :return: DataFrame indexed by id, with one column per entry of
        ``smiles.index``
    """
    reference_fps = [
        AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smi), radius)
        for smi in smiles.smiles
    ]
    rows = []
    for position, cid in enumerate(ids):
        sample_mol = Chem.MolFromSmiles(smiles.loc[cid].smiles)
        sample_fp = AllChem.GetMorganFingerprint(sample_mol, radius)
        row = [cid] + [
            DataStructs.DiceSimilarity(ref_fp, sample_fp)
            for ref_fp in reference_fps
        ]
        # Progress indicator, overwritten in place on the same console line.
        print(position, end='\r')
        rows.append(row)
    table = pd.DataFrame(rows).set_index(0)
    table.columns = smiles.index
    return table
Esempio n. 13
0
def sim_rdk_morgan_fps(smiA, smisT):
    """Dice similarity of one SMILES against many, using Morgan fingerprints.

    Every SMILES is converted with ``getMolFromSmiles`` and fingerprinted with
    ``rdk.AllChem.GetMorganFingerprint`` at radius 2 (circular fingerprints).

    :param smiA: query SMILES string
    :param smisT: iterable of SMILES strings to compare the query against
    :return: list of Dice similarities, one per entry of ``smisT``, in order
    """
    query_fp = rdk.AllChem.GetMorganFingerprint(getMolFromSmiles(smiA), 2)
    target_fps = [
        rdk.AllChem.GetMorganFingerprint(getMolFromSmiles(smi), 2)
        for smi in smisT
    ]
    return [DataStructs.DiceSimilarity(query_fp, fp) for fp in target_fps]
def get_similar_compound(condon):
    """Find the compounds in the KEGG SMILES file most similar to a query.

    The query SMILES is drawn to an image with ``save_img`` and then compared
    (Morgan fingerprints of radius 2 on hydrogen-added molecules, Dice
    similarity rounded to two decimals) with every parsable SMILES in
    ``data//kegg_smiles2.txt``. Each line of that file is split on whitespace;
    the first token is used as the KEGG id and the second as the SMILES.

    Only compounds with a similarity above zero are reported, so the result
    can be shorter than ``MaxLength``. Earlier versions padded the result
    with copies of the first file record at score 0 in that situation.

    :param condon: mapping with keys ``'smiles'`` (query SMILES) and
        ``'MaxLength'`` (maximum number of hits)
    :return: list of ``[kegg_id, compound_dict[kegg_id][0], smiles, score]``
        rows sorted by descending score; ``[]`` if the query cannot be parsed
    """
    import heapq

    com = condon['smiles']

    save_img(com, 'static//compound_img//smiles_img.png', 300, 300)

    output_num = condon['MaxLength']
    smiles_file_path = 'data//kegg_smiles2.txt'

    with open(smiles_file_path) as file:
        records = [line.split() for line in file]

    smiles_list = [record[1] for record in records]
    output_num = min(output_num, len(smiles_list))

    mol1 = Chem.MolFromSmiles(com)
    if mol1 is None:
        print('input smiles not exist')
        return []
    mol1 = AllChem.AddHs(mol1)
    fps1 = AllChem.GetMorganFingerprint(mol1, 2)

    scored = []
    for i, item in enumerate(smiles_list):
        mol2 = Chem.MolFromSmiles(item)
        if mol2 is None:
            # Unparsable library entries are skipped.
            continue
        mol2 = AllChem.AddHs(mol2)
        fps2 = AllChem.GetMorganFingerprint(mol2, 2)

        score = round(DataStructs.DiceSimilarity(fps1, fps2), 2)
        if score > 0:
            scored.append((score, i))

    # nlargest is equivalent to a stable descending sort truncated to
    # output_num, so ties keep file order and a non-positive limit gives [].
    best = heapq.nlargest(output_num, scored, key=lambda pair: pair[0])

    result = []
    for score, i in best:
        kegg_id = records[i][0]
        result.append(
            [kegg_id, compound_dict[kegg_id][0], records[i][1], score])
    return result
    def evaluate_distance(self) -> np.ndarray:
        """Return the relativized Dice distance of each walker to a random companion.

        Every walker is paired with the walker at the same position of a
        random permutation. The distance is ``1 - DiceSimilarity`` of the two
        walkers' entries in ``self.observations``; the vector of distances is
        passed through ``relativize_vector`` and cast to ``float32``.
        """
        # Random companion index for every walker
        companions = np.random.permutation(np.arange(self.n_walkers, dtype=int))
        similarities = []
        for walker in range(self.n_walkers):
            own = self.observations[walker]
            other = self.observations[companions[walker]]
            similarities.append(DataStructs.DiceSimilarity(own, other))

        # Turn similarities into distances
        distances = 1.0 - np.array(similarities)

        return relativize_vector(distances).astype(np.float32)
Esempio n. 16
0
def sort_similarity(mols, sort):
    """Greedily chain molecules by Dice similarity to the previously placed one.

    Repeatedly picks the entry of ``mols`` whose fingerprint (element 0) is
    most similar to the fingerprint of the last entry of ``sort``, moves it
    from ``mols`` to the end of ``sort``, and continues until ``mols`` is
    empty. If no entry has a similarity above 0, the first remaining entry is
    taken. Both lists are modified in place.

    Implemented as a loop rather than recursion so that long lists cannot hit
    the interpreter's recursion limit; an empty ``mols`` returns ``sort``
    unchanged.

    :param mols: list of unsorted entries, each with a fingerprint at index 0
    :param sort: non-empty list of already ordered entries (the seed)
    :return: ``sort``, extended with every entry that was in ``mols``
    """
    while mols:
        anchor_fp = sort[-1][0]

        # Find the most similar molecule by finding largest similarity score;
        # the strict comparison keeps the first entry on ties.
        best_idx, best_similarity = 0, 0
        for idx, mol in enumerate(mols):
            similarity = DataStructs.DiceSimilarity(anchor_fp, mol[0])
            if similarity > best_similarity:
                best_idx, best_similarity = idx, similarity

        # Move the molecule from unsorted list to sorted list
        sort.append(mols.pop(best_idx))
    return sort
Esempio n. 17
0
def atom_pairs_similarity(active_molecules1, test_molecules):
    """Best atom-pair Dice similarity of each test molecule to any active.

    :param active_molecules1: iterable of active molecules
    :param test_molecules: iterable of molecules to score
    :return: list with, for every test molecule in order, the highest Dice
        similarity to any active molecule (0 if there are no actives)
    """
    active_fps = [
        Pairs.GetAtomPairFingerprint(mol) for mol in active_molecules1
    ]
    test_fps = [Pairs.GetAtomPairFingerprint(mol) for mol in test_molecules]
    # The leading 0 is the starting value of the running maximum.
    return [
        max([0] + [
            DataStructs.DiceSimilarity(test_fp, active_fp)
            for active_fp in active_fps
        ]) for test_fp in test_fps
    ]
Esempio n. 18
0
def ecfp_similarity(active_molecules1, test_molecules):
    """Best Morgan (radius 3) Dice similarity of each test molecule to any active.

    :param active_molecules1: iterable of active molecules
    :param test_molecules: iterable of molecules to score
    :return: list with, for every test molecule in order, the highest Dice
        similarity to any active molecule (0 if there are no actives)
    """
    active_fps = [
        AllChem.GetMorganFingerprint(mol, 3) for mol in active_molecules1
    ]
    test_fps = [AllChem.GetMorganFingerprint(mol, 3) for mol in test_molecules]

    best_scores = []
    for test_fp in test_fps:
        scores = [
            DataStructs.DiceSimilarity(test_fp, active_fp)
            for active_fp in active_fps
        ]
        # The leading 0 is the starting value of the running maximum.
        best_scores.append(max([0] + scores))
    return best_scores
def get_similarity_all(fp1, fp2):
    """
    Return six RDKit similarity scores for a pair of fingerprints.

    According to the original author the fingerprints are always supplied as
    SparseBitVect. RDKit also offers Sokal and Tversky similarity; neither is
    computed here.

    :param fp1: first fingerprint
    :param fp2: second fingerprint
    :return: list ``[tanimoto, dice, cosine, russel, kulczynski, mcconnaughey]``
    """
    metrics = (
        DataStructs.TanimotoSimilarity,
        DataStructs.DiceSimilarity,
        DataStructs.CosineSimilarity,
        DataStructs.RusselSimilarity,
        DataStructs.KulczynskiSimilarity,
        DataStructs.McConnaugheySimilarity,
    )
    return [metric(fp1, fp2) for metric in metrics]
Esempio n. 20
0
def getTanDist(outMols):
    """Compute pairwise similarities between all molecules in ``outMols``.

    For every unordered pair two values are produced, each rounded to two
    decimals: the ``FingerprintSimilarity`` of the ``FingerprintMols``
    fingerprints and the Dice similarity of the Morgan (radius 2)
    fingerprints. The pair indices are printed as progress output.

    :param outMols: sequence of RDKit molecules
    :return: tuple ``(tanDists, tanDistsMorgan)`` of equally long lists, in
        pair order (0,1), (0,2), ..., (1,2), ...
    """
    tanDists = []
    tanDistsMorgan = []
    fps = [FingerprintMols.FingerprintMol(x) for x in outMols]
    # Fingerprint each molecule once instead of once per pair it appears in.
    morganFps = [AllChem.GetMorganFingerprint(x, 2) for x in outMols]
    for outIdx in range(len(outMols)):
        for inIdx in range(outIdx + 1, len(outMols)):
            # Single-argument form prints the same text on Python 2 and 3.
            print("%s %s" % (outIdx, inIdx))
            tanDist = DataStructs.FingerprintSimilarity(
                fps[outIdx], fps[inIdx])
            tanDistM = DataStructs.DiceSimilarity(morganFps[outIdx],
                                                  morganFps[inIdx])
            tanDists.append(round(tanDist, 2))
            tanDistsMorgan.append(round(tanDistM, 2))
    return tanDists, tanDistsMorgan
Esempio n. 21
0
def morgan_fingerprint_evaluation(references, candidates):
    """
    Score candidates against references with Morgan circular fingerprints.
    https://doi.org/10.1021/ci100050t

    For every key of ``references`` that is also in ``candidates`` both
    entries are fingerprinted with ``AllChem.GetMorganFingerprintAsBitVect``
    (radius 2, nBits=1024) and five similarities are computed, each rounded
    to four decimals. Keys missing from ``candidates`` contribute 0 to every
    metric. The mean of each metric is printed.

    :param references: mapping from key to reference molecule
    :param candidates: mapping from key to candidate molecule
    :return: mean Tanimoto similarity, rounded to four decimals
    """
    print("Calculating Similarity via Morgan based Circular Fingerprint")
    tanimoto, dice, cosine, sokal, mcconnaughey = [], [], [], [], []
    for key in references:
        # Defaults used when the key has no candidate.
        scores = (0, 0, 0, 0, 0)
        if key in candidates:
            cand_fp = AllChem.GetMorganFingerprintAsBitVect(
                candidates[key], 2, nBits=1024)
            ref_fp = AllChem.GetMorganFingerprintAsBitVect(
                references[key], 2, nBits=1024)
            scores = (
                round(DataStructs.TanimotoSimilarity(ref_fp, cand_fp), 4),
                round(DataStructs.DiceSimilarity(ref_fp, cand_fp), 4),
                round(DataStructs.CosineSimilarity(ref_fp, cand_fp), 4),
                round(DataStructs.SokalSimilarity(ref_fp, cand_fp), 4),
                round(DataStructs.McConnaugheySimilarity(ref_fp, cand_fp), 4),
            )
        tanimoto.append(scores[0])
        dice.append(scores[1])
        cosine.append(scores[2])
        sokal.append(scores[3])
        mcconnaughey.append(scores[4])
    print("Done Calculating Similarity via  Morgan based Circular Fingerprint")
    print("##########################################")
    print("Tanimoto Similarity:{}".format(round(np.mean(tanimoto), 4)))
    print("Dice Similarity:{}".format(round(np.mean(dice), 4)))
    print("Cosine Similarity:{}".format(round(np.mean(cosine), 4)))
    print("Sokal Similarity:{}".format(round(np.mean(sokal), 4)))
    print("McConnaughey Similarity:{}".format(
        round(np.mean(mcconnaughey), 4)))
    print("##########################################")
    return round(np.mean(tanimoto), 4)
Esempio n. 22
0
    def similar_molecules(self, mols):
        """
        Rank the molecules in `mols` by similarity to this molecule.

        Similarity is the Dice similarity between Morgan fingerprints
        of radius 4 of ``self.mol`` and of each molecule in `mols`.

        Parameters
        ----------
        mols : :class:`iterable` of :class:`rdkit.Mol`
            The molecules to compare against ``self.mol``.

        Returns
        -------
        :class:`list`
            A :class:`list` of ``(similarity, mol)`` tuples, e.g.

            .. code-block:: python

                returned_list = [(0.89, mol1), (0.73, mol2), (0.34, mol3)]

            sorted so that the most similar molecule is at index 0.

        """

        def morgan_fp(rdkit_mol):
            # GetSSSR and UpdatePropertyCache are called on the molecule
            # before it is fingerprinted.
            rdkit.GetSSSR(rdkit_mol)
            rdkit_mol.UpdatePropertyCache(strict=False)
            return rdkit.GetMorganFingerprint(rdkit_mol, 4)

        # Fingerprint of `self`, computed once.
        own_fp = morgan_fp(self.mol)

        scored = [
            (DataStructs.DiceSimilarity(own_fp, morgan_fp(mol)), mol)
            for mol in mols
        ]
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return scored
Esempio n. 23
0
    def get_neighbour(self):
        """
        Find the most similar training entry for every generated entry.

        For each position in ``self.gen`` the fingerprint ``self.gen_fps[i]``
        is compared (Dice similarity) with every ``self.train_fps[j]``.

        Returns: List of ``(similarity, neighbour)`` tuples, one per entry of
            ``self.gen``; ``(0, "")`` when no training entry scores above 0.

        """
        all_neighbours = []
        for i in range(len(self.gen)):
            query_fp = self.gen_fps[i]
            best = (0, "")
            for j in range(len(self.training)):
                candidate = DataStructs.DiceSimilarity(query_fp,
                                                       self.train_fps[j])
                # Strict comparison: the first of several equal maxima wins.
                if candidate > best[0]:
                    best = (candidate, self.training[j])
            all_neighbours.append(best)
        return all_neighbours
Esempio n. 24
0
  def test6BulkDice(self):
    """Check that BulkDiceSimilarity matches per-pair DiceSimilarity.

    Six random ``IntSparseIntVect`` vectors of length 10 are built; the first
    is compared with the rest both one by one and in bulk.
    """
    import random
    sz = 10
    nToSet = 5
    nVs = 6
    vs = []
    for _ in range(nVs):
      v = ds.IntSparseIntVect(sz)
      for _ in range(nToSet):
        # The count is drawn before the position, which is the order a
        # single ``v[randint] = randint`` assignment evaluates them in.
        count = random.randint(1, 10)
        pos = random.randint(0, sz - 1)
        v[pos] = count
      vs.append(v)

    query = vs[0]
    baseDs = [ds.DiceSimilarity(query, vs[x]) for x in range(1, nVs)]
    bulkDs = ds.BulkDiceSimilarity(query, vs[1:])
    for i, expected in enumerate(baseDs):
      self.assertTrue(feq(expected, bulkDs[i]))
Esempio n. 25
0
def maacs_fingerprint_evaluation(references, candidates):
    """
    Score candidates against references with MACCS keys.

    For every key of ``references`` that is also in ``candidates`` both
    entries are converted with ``MACCSkeys.GenMACCSKeys`` and five
    similarities are computed, each rounded to four decimals. Keys missing
    from ``candidates`` contribute 0 to every metric. The mean of each metric
    is printed.

    :param references: mapping from key to reference molecule
    :param candidates: mapping from key to candidate molecule
    :return: mean Tanimoto similarity, rounded to four decimals
    """
    print("Calculating Similarity via MACCS Keys")
    # Rows of (Tanimoto, Dice, Cosine, Sokal, McConnaughey), one per key
    rows = []
    for key in references:
        if key not in candidates:
            # No candidate for this key: counted as zero similarity.
            rows.append((0, 0, 0, 0, 0))
            continue
        cand_keys = MACCSkeys.GenMACCSKeys(candidates[key])
        ref_keys = MACCSkeys.GenMACCSKeys(references[key])
        rows.append((
            round(DataStructs.TanimotoSimilarity(ref_keys, cand_keys), 4),
            round(DataStructs.DiceSimilarity(ref_keys, cand_keys), 4),
            round(DataStructs.CosineSimilarity(ref_keys, cand_keys), 4),
            round(DataStructs.SokalSimilarity(ref_keys, cand_keys), 4),
            round(DataStructs.McConnaugheySimilarity(ref_keys, cand_keys), 4),
        ))
    # Regroup the per-key rows into one list per metric.
    tanimoto = [row[0] for row in rows]
    dice = [row[1] for row in rows]
    cosine = [row[2] for row in rows]
    sokal = [row[3] for row in rows]
    mcconnaughey = [row[4] for row in rows]
    print("Done Calculating Similarity via MACCS Keys")
    print("##########################################")
    print("Tanimoto Similarity:{}".format(round(np.mean(tanimoto), 4)))
    print("Dice Similarity:{}".format(round(np.mean(dice), 4)))
    print("Cosine Similarity:{}".format(round(np.mean(cosine), 4)))
    print("Sokal Similarity:{}".format(round(np.mean(sokal), 4)))
    print("McConnaughey Similarity:{}".format(
        round(np.mean(mcconnaughey), 4)))
    print("##########################################")
    return round(np.mean(tanimoto), 4)
Esempio n. 26
0
def compare_structure(smiles1, smiles2, fp_type="Morgan", sim_type="Dice"):
    """
    Task: Compare structural similarity of two compounds based on fingerprints.
    Parameters:
        smiles1: str, smiles of the compound 1
        smiles2: str, smiles of the compound 2
        fp_type: str, type of fingerprints; one of "Morgan",
            "MorganWithFeature", "MACCS", "Topological", "AtomPairs"
        sim_type: str, method for calculating similarity; one of "Dice",
            "Tanimoto", "Cosine", "Sokal", "Russel"
    Returns:
        The similarity value, or -1 when ``fp_type`` or ``sim_type`` is not
        recognised or when fingerprinting/comparison raises.
    """
    fingerprinters = {
        "Morgan":
        lambda mol: AllChem.GetMorganFingerprint(mol, 2, useFeatures=False),
        "MorganWithFeature":
        lambda mol: AllChem.GetMorganFingerprint(mol, 2, useFeatures=True),
        "MACCS": lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol),
        "Topological": lambda mol: FingerprintMols.FingerprintMol(mol),
        "AtomPairs": lambda mol: Pairs.GetAtomPairFingerprint(mol),
    }
    similarity_names = {
        "Dice": "DiceSimilarity",
        "Tanimoto": "TanimotoSimilarity",
        "Cosine": "CosineSimilarity",
        "Sokal": "SokalSimilarity",
        "Russel": "RusselSimilarity",
    }

    # An unknown sim_type used to leave the result unbound and raise
    # UnboundLocalError; report it through the same -1 sentinel that an
    # unknown fp_type already produced.
    if fp_type not in fingerprinters or sim_type not in similarity_names:
        return -1

    try:
        getfp = fingerprinters[fp_type]
        fp1 = getfp(Chem.MolFromSmiles(smiles1))
        fp2 = getfp(Chem.MolFromSmiles(smiles2))
        similarity = getattr(DataStructs, similarity_names[sim_type])
        return similarity(fp1, fp2)
    except Exception:
        # Deliberately broad: any failure (e.g. an unparsable SMILES) is
        # reported to the caller as -1, as before.
        return -1
Esempio n. 27
0
def build_deltaFP(reactions):
    """Build a table of perturbation ("delta") fingerprints for reactions.

    Each reaction SMILES is split at ``>>`` into its two members. Both are
    parsed without sanitization, fingerprinted with a 256-element hashed
    atom-pair fingerprint, and the element-wise difference (second member
    minus first) is stored together with their Dice similarity. The function
    only returns the rows; it does not write a file itself.

    :param reactions: iterable of lists whose element 1 is a reaction SMILES
        of the form ``member1>>member2``; the header implies four leading
        entries per list (perturbation, reaction SMILES, two members) -
        TODO confirm
    :return: list of rows; the first row is the header, and every further row
        is the input list followed by the similarity (as ``str``) and the
        256 difference values
    """
    print("Building FPs and writing to CSV..")
    header = [
        "Perturbation",
        "Reaction_SMILES",
        "fullmember1",
        "fullmember2",
        "Member_Similarity (Dice)",
    ] + ["pfp" + str(bit) for bit in range(256)]

    rows = [header]
    for reaction_members in reactions:
        # deconstruct reaction smiles back into members
        first_smiles, _, second_smiles = reaction_members[1].partition(">>")

        # sanitize=False plus a non-strict property-cache update keeps the
        # input as written and tolerates valency discrepancies
        member1 = Chem.MolFromSmiles(first_smiles, sanitize=False)
        member2 = Chem.MolFromSmiles(second_smiles, sanitize=False)
        member1.UpdatePropertyCache(strict=False)
        member2.UpdatePropertyCache(strict=False)

        # 256-element hashed atom-pair fingerprint for each member
        fp_first = rdMolDescriptors.GetHashedAtomPairFingerprint(member1, 256)
        fp_second = rdMolDescriptors.GetHashedAtomPairFingerprint(member2, 256)
        dice = DataStructs.DiceSimilarity(fp_first, fp_second)

        # reaction fingerprint (deltaFP) = second member minus first member
        delta = np.array(list(fp_second)) - np.array(list(fp_first))

        rows.append(reaction_members + [str(dice)] + delta.tolist())

    return rows
Esempio n. 28
0
def get_similarity(mols, compounds, fps_morgan):
    """
    Calculate the pairwise molecular similarity
    Args:
        mols: list of mol files for the compounds (only its length is used)
        compounds: list of compound unique ids
        fps_morgan: list of fingerprints for the compounds

    Returns: a single string with one 'source,target,similarity' line
        (newline-terminated) per unordered pair; '' if there are no pairs

    """
    count = len(mols)
    lines = []
    for i in range(count):
        ref_fp = fps_morgan[i]
        for j in range(i + 1, count):
            morgan2_sim = DataStructs.DiceSimilarity(ref_fp, fps_morgan[j])
            # Only the target id has trailing whitespace stripped.
            lines.append(
                str(compounds[i]) + ',' + str(compounds[j].rstrip()) + ',' +
                str(morgan2_sim) + '\n')
    # Joining once avoids the quadratic cost of growing a string with +=.
    return ''.join(lines)
Esempio n. 29
0
def orng_sim_rdk_atompair_fps(smile_active, train_instance):
    """Dice similarity between a SMILES and a data instance (atom-pair fingerprints).

    The instance's SMILES attribute is located with ``getSMILESAttr``; both
    SMILES are converted with ``getMolFromSmiles`` and fingerprinted with
    ``Pairs.GetAtomPairFingerprint``.

    :param smile_active: SMILES string of the active compound
    :param train_instance: data instance holding a SMILES attribute
    :return: the Dice similarity, or ``None`` when no SMILES attribute is
        found or either molecule is falsy after conversion
    """
    smiles_attr = getSMILESAttr(train_instance)
    if not smiles_attr:
        return None

    train_smiles = str(train_instance[smiles_attr].value)
    active_mol = getMolFromSmiles(smile_active)
    train_mol = getMolFromSmiles(train_smiles)
    if not active_mol or not train_mol:
        return None

    active_fp = Pairs.GetAtomPairFingerprint(active_mol)
    train_fp = Pairs.GetAtomPairFingerprint(train_mol)
    return DataStructs.DiceSimilarity(active_fp, train_fp)
Esempio n. 30
0
def orng_sim_rdk_morgan_features_fps(smile_active, train_instance):
    """Dice similarity between a SMILES and a data instance (feature Morgan fingerprints).

    The instance's SMILES attribute is located with ``getSMILESAttr``; both
    SMILES are converted with ``getMolFromSmiles`` and fingerprinted with
    ``rdk.AllChem.GetMorganFingerprint`` at radius 2 with ``useFeatures=True``
    (circular, feature-based invariants; FCFP-like).

    :param smile_active: SMILES string of the active compound
    :param train_instance: data instance holding a SMILES attribute
    :return: the Dice similarity, or ``None`` when no SMILES attribute is
        found or either molecule is falsy after conversion
    """
    smiles_attr = getSMILESAttr(train_instance)
    if not smiles_attr:
        return None

    train_smiles = str(train_instance[smiles_attr].value)
    active_mol = getMolFromSmiles(smile_active)
    train_mol = getMolFromSmiles(train_smiles)
    if not active_mol or not train_mol:
        return None

    active_fp = rdk.AllChem.GetMorganFingerprint(active_mol,
                                                 2,
                                                 useFeatures=True)
    train_fp = rdk.AllChem.GetMorganFingerprint(train_mol,
                                                2,
                                                useFeatures=True)
    return DataStructs.DiceSimilarity(active_fp, train_fp)