Exemple #1
0
def get_neighbors(query_fp, fp_list, cutoff=0.8, max_nbrs=5):
    """Return the candidate records most similar to ``query_fp``.

    Parameters
    ----------
    query_fp
        Fingerprint of the query; passed as the first argument to
        ``BulkTanimotoSimilarity``.
    fp_list : sequence of sequences
        Candidate records. Element ``[2]`` of every record is used as that
        record's fingerprint; the other elements are carried through as-is.
    cutoff : float
        Minimum similarity (inclusive) a record needs in order to be kept.
    max_nbrs : int
        Maximum number of records returned.

    Returns
    -------
    list of list
        A copy of each retained record with its similarity appended as the
        last element, ordered from most to least similar.
    """
    if len(fp_list) == 0:
        return []

    sim_list = BulkTanimotoSimilarity(query_fp, [rec[2] for rec in fp_list])
    hits = [(rec, sim) for rec, sim in zip(fp_list, sim_list) if sim >= cutoff]
    # Rank on the similarity itself rather than on a fixed column index, so
    # the ordering stays correct whatever the number of fields per record.
    hits.sort(key=itemgetter(1), reverse=True)
    return [list(rec) + [sim] for rec, sim in hits[:max_nbrs]]
Exemple #2
0
def Get_compare(smiles_dict):
    """
    Group consecutive monomers whose fingerprints are identical.

    Each monomer is compared only with the one that follows it in the
    iteration order of ``smiles_dict``. A run of consecutive monomers whose
    pairwise Tanimoto similarity equals 1 is collected into one group.

    Parameters
    ----------
    smiles_dict : dict
        Maps a monomer name to the object handed to ``Chem.RDKFingerprint``.
        NOTE(review): the values are passed to ``Chem.RDKFingerprint``
        unchanged; if they are raw SMILES strings rather than molecule
        objects the call presumably fails and the pair is silently treated
        as dissimilar -- confirm against the caller.
    Returns
    -------
    reps : list
        List of groups; each group is a list of monomer names that are
        consecutive in ``smiles_dict`` and have identical fingerprints.
    """
    reps = []
    mols = list(smiles_dict)

    for current, following in zip(mols, mols[1:]):
        try:
            tanimoto = BulkTanimotoSimilarity(
                Chem.RDKFingerprint(smiles_dict[current]),
                [Chem.RDKFingerprint(smiles_dict[following])])[0]
        except Exception:
            # Best effort: a pair that cannot be fingerprinted counts as
            # dissimilar. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt and SystemExit propagate.
            tanimoto = 0

        if tanimoto != 1:
            continue

        # Names are unique dict keys and only neighbours are compared, so
        # ``current`` can already belong to a group only as the tail of the
        # most recent one.
        if reps and reps[-1][-1] == current:
            reps[-1].append(following)
        else:
            reps.append([current, following])

    return reps
Exemple #3
0
 def _find_similarity(self, smiles):
     indices = list()
     if len(self.clusters) == 0:
         return None
     for i, clu in self.clusters.items():
         clu_fps = [s.fingerprint for s in clu if s.fingerprint is not None]
         sim = BulkTanimotoSimilarity(smiles.fingerprint, clu_fps)
         best_match = max(sim)
         if best_match > self.threshold:
             indices.append(i)
     if len(indices) != 0:
         return indices
     else:
         return None
Exemple #4
0
def find_not_aa_monomers(js):
    """
    Find the non-amino-acid monomers of the analysed molecule.

    Every monomer of the graph is compared, by RDKit fingerprint, with a
    small set of reference structures. A monomer whose Tanimoto similarity
    to a reference equals 1 is recorded and renamed in place.

    Parameters
    ----------
    js : dict
        Opened rBAN peptideGraph.json.
    Returns
    -------
    not_aa_monomer : list
        Indices (``monomer['index']``) of the matching monomers.
    js : dict
        The same object that was passed in, with the names of the matching
        monomers overwritten.
    """
    compare_dict = {
        'iva': 'CC(C)CC(=O)O',
        'hiv': 'CC(C)C(C(=O)O)O',
        # 'dhb': 'C1=CC(=C(C(=C1)O)O)C(=O)O', #dOH-Bz
        'pip': 'C1CCNC(C1)C(=O)O'  # Hpr
    }
    not_aa_monomer = []

    monomers = js['monomericGraph']['monomericGraph']['monomers']
    if not monomers:
        return not_aa_monomer, js

    # The reference fingerprints do not depend on the monomer being
    # examined, so they are built once instead of once per monomer.
    reference_names = list(compare_dict)
    reference_fps = [
        Chem.RDKFingerprint(Chem.MolFromSmiles(compare_dict[name]))
        for name in reference_names
    ]

    for entry in monomers:
        target = Chem.MolFromSmiles(entry['monomer']['monomer']['smiles'])

        if target is None:  # rBAN output may contain unparsable structures
            continue

        similarities = BulkTanimotoSimilarity(
            Chem.RDKFingerprint(target), reference_fps)

        for name, similarity in zip(reference_names, similarities):
            if similarity != 1:
                continue

            # iva == hiv: hiv is the more common case, but only an iva hmm
            # (homologous structure) is available, so hiv is stored as iva.
            label = 'iva' if name == 'hiv' else name

            not_aa_monomer.append(entry['monomer']['index'])
            entry['monomer']['monomer']['monomer'] = label

    return not_aa_monomer, js
Exemple #5
0
def most_simi(train_test):
    """Compute nearest-neighbour Tanimoto similarities between data subsets.

    Parameters
    ----------
    train_test : pair
        ``(train_names, test_names)``; each element is handed to
        ``load_smiles``, which must return a
        ``(fingerprints, properties, labels)`` triple.

    Returns
    -------
    dict
        Maps a description such as ``'test_actives vs train_decoys'`` to a
        list holding, for every fingerprint of the first subset, its highest
        similarity to any fingerprint of the second subset. Label ``1``
        selects actives and label ``0`` selects decoys.

    Notes
    -----
    ``max`` is applied to each bulk-similarity result, so a non-empty first
    subset combined with an empty second subset presumably raises
    ``ValueError`` -- confirm how ``BulkTanimotoSimilarity`` treats an empty
    list.
    """

    def _with_label(fps, labels, wanted):
        # Fingerprints whose label equals ``wanted``.
        return [fp for fp, label in zip(fps, labels) if label == wanted]

    train_names, test_names = train_test
    train_fps, _, train_labels = load_smiles(train_names)
    test_fps, _, test_labels = load_smiles(test_names)

    train_actives = _with_label(train_fps, train_labels, 1)
    train_decoys = _with_label(train_fps, train_labels, 0)
    test_actives = _with_label(test_fps, test_labels, 1)
    test_decoys = _with_label(test_fps, test_labels, 0)

    comparisons = (
        ('test_actives vs test_decoys', test_actives, test_decoys),
        ('test_actives vs train_actives', test_actives, train_actives),
        ('test_decoys vs train_actives', test_decoys, train_actives),
        ('test_actives vs train_decoys', test_actives, train_decoys),
        ('test_decoys vs train_decoys', test_decoys, train_decoys),
    )

    return {
        description: [
            max(BulkTanimotoSimilarity(fp, references)) for fp in queries
        ]
        for description, queries, references in comparisons
    }
    def _calculate(self, *, predictions: Sequence[str], labels: Sequence[str],
                   **kwargs) -> Tuple[Real, int]:
        """Average, over the predictions, the best similarity to any label.

        Labels are converted with ``invalid='raise'``; the first pipeline
        step is then switched to ``invalid='skip'`` before the predictions
        are converted. For every resulting prediction fingerprint the
        highest Tanimoto similarity to the label fingerprints is taken.

        Parameters
        ----------
        predictions : sequence of str
            Predicted molecules, fed to the fingerprint pipeline.
        labels : sequence of str
            Reference molecules, fed to the fingerprint pipeline.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        tuple
            ``(mean of the per-prediction best similarities, 1)``.
        """
        pipe = make_pipeline(
            MoleculeTransformer(invalid='raise'),
            MorganFingerprint(radius=self._radius,
                              n_bits=self._n_bits,
                              return_type='bitvect_list'),
        )
        reference_fps = pipe.fit_transform(labels)

        # From here on the first step uses invalid='skip' instead of 'raise'.
        pipe[0].invalid = 'skip'
        candidate_fps = pipe.fit_transform(predictions)

        best_scores: List[float] = []
        for candidate in candidate_fps:
            scores = BulkTanimotoSimilarity(candidate, reference_fps)
            best_scores.append(max(scores))

        return np.mean(best_scores), 1
    def fit_transform(self,
                      molecules: Iterable[Mol],
                      y_ignored=None) -> np.ndarray:
        """Return Tanimoto similarity matrix.

        The molecules are converted to Morgan fingerprints using
        ``self.radius`` and ``self.n_bits``. The fitted fingerprint
        transformer is stored as ``self.ecfp_`` and ``self.n_features_in_``
        is set to 1.

        Parameters
        ----------
        molecules : iterable of rdkit.Chem.Mol
            RDKit molecules.
        y_ignored : None
            This formal parameter will be ignored.

        Returns
        -------
        numpy.ndarray, shape = (len(molecules), len(molecules))
            Symmetric matrix of dtype ``self.dtype`` whose diagonal is 1.
        """
        ecfp = MorganFingerprint(radius=self.radius,
                                 n_bits=self.n_bits,
                                 return_type='bitvect_list')
        fingerprints = ecfp.fit_transform(molecules)

        # noinspection PyAttributeOutsideInit
        self.ecfp_ = ecfp
        # noinspection PyAttributeOutsideInit
        self.n_features_in_ = 1

        n_fingerprints = len(fingerprints)
        # Start from ones so the diagonal (self-similarity) is already set.
        sim_matrix = np.ones((n_fingerprints, n_fingerprints),
                             dtype=self.dtype)
        for i in range(1, n_fingerprints):
            # Compare fingerprint i with all earlier ones only, then mirror
            # the result into both triangles of the matrix.
            sim_index = BulkTanimotoSimilarity(fingerprints[i],
                                               fingerprints[:i])
            sim_matrix[i, :i] = sim_index
            sim_matrix[:i, i] = sim_index

        return sim_matrix
Exemple #8
0
    BulkTverskySimilarity,
)
from rdkit.ML.Cluster import Butina

DEBUG = True

# Fingerprint generators keyed by descriptor name; each takes a molecule.
DESCRIPTORS = dict(
    path=RDKFingerprint,
    # Morgan bit vector with radius 2.
    ecfp4=lambda mol: GetMorganFingerprintAsBitVect(mol, radius=2),
    # Morgan bit vector with radius 2, folded to 512 bits.
    zinc=lambda mol: GetMorganFingerprintAsBitVect(mol, radius=2, nBits=512),
    apair=lambda mol: GetAtomPairFingerprint(mol),
)

# Bulk similarity functions keyed by coefficient name. Each is called as
# f(x, ys, ...); 'tanimoto' and 'dice' ignore any extra positional
# arguments, while 'tversky' forwards the first two extras (a, b).
COEFFICIENTS = dict(
    tanimoto=lambda x, ys, *args: BulkTanimotoSimilarity(x, ys),
    dice=lambda x, ys, *args: BulkDiceSimilarity(x, ys),
    tversky=lambda x, ys, a, b, *args: BulkTverskySimilarity(x, ys, a, b),
)

# Names of the clustering approaches.
CLUSTERING_APPROACHES = ['butina', 'cassidy']


def mol_parse(it, parser=MolFromSmiles):
    for num, line in enumerate(it, start=1):
        cid = str(num)
        try:
            tokens = str(line).split()