Beispiel #1
0
def check_if_two_molecules_are_equal_from_smiles(smiles1, smiles2):
    """Tell whether two SMILES strings give identical Morgan fingerprints.

    Both strings are parsed with ``MolFromSmiles``; a radius-1 Morgan
    fingerprint (``useFeatures=True``, ``useChirality=True``) is computed for
    each and the pair is compared with the Tanimoto similarity.

    Returns
    -------
    bool
        True only when the Tanimoto similarity equals 1.
    """
    parsed = [MolFromSmiles(text) for text in (smiles1, smiles2)]
    first_fp, second_fp = [
        AllChem.GetMorganFingerprint(mol, 1, useFeatures=True, useChirality=True)
        for mol in parsed
    ]
    return DataStructs.TanimotoSimilarity(first_fp, second_fp) == 1
Beispiel #2
0
def fingerprint_features(smile_string, radius=2, size=2048):
    """Build a Morgan bit-vector fingerprint for one SMILES string.

    The parsed molecule is renumbered with the ordering returned by
    ``CanonicalRankAtoms`` before fingerprinting.

    Parameters
    ----------
    smile_string : SMILES text handed to ``MolFromSmiles``.
    radius : Morgan radius passed straight to RDKit.
    size : number of bits in the returned vector (``nBits``).

    Returns
    -------
    The result of ``GetMorganFingerprintAsBitVect`` with chirality and bond
    types enabled and feature invariants disabled.
    """
    parsed = MolFromSmiles(smile_string)
    renumbered = rdmolops.RenumberAtoms(parsed,
                                        rdmolfiles.CanonicalRankAtoms(parsed))
    options = dict(nBits=size,
                   useChirality=True,
                   useBondTypes=True,
                   useFeatures=False)
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(
        renumbered, radius, **options)
Beispiel #3
0
def check_similarity_between_generic_and_complete_representation(
        generic_smiles, complete_smiles):
    """Check whether the generic pattern occurs in the complete molecule.

    Both inputs go through ``NeutraliseCharges`` first (only the first element
    of its result is used). The complete representation is then parsed as
    SMILES, the generic one as SMARTS, and a substructure match is attempted.

    Returns
    -------
    bool
        True when ``GetSubstructMatch`` yields a non-empty match.
    """
    neutral_complete, _ = NeutraliseCharges(complete_smiles)
    neutral_generic, _ = NeutraliseCharges(generic_smiles)
    target = MolFromSmiles(neutral_complete)
    pattern = MolFromSmarts(neutral_generic)
    return bool(target.GetSubstructMatch(pattern))
Beispiel #4
0
 def test_nonexistent_mordred_descriptors(self):
     """Unknown Mordred descriptor names must raise MordredCalculatorError."""
     methane = MolFromSmiles("C")
     invalid_names = ("", "ReallyInvalidDescriptorName")
     for name in invalid_names:
         # A fresh Descriptor is built for every attempted name.
         descriptor = Descriptor()
         requested_type = f"mordred:{name}"
         with self.assertRaises(MordredCalculatorError):
             descriptor.make_fingerprint(
                 molecule_graph=methane,
                 fingerprint_type=requested_type,
             )
Beispiel #5
0
def IsCorrectSMILES(smiles):
    """Return 1 if ``MolFromSmiles`` (with sanitization) yields a molecule, else 0.

    Any exception raised while parsing is treated the same as a failed parse.
    """
    try:
        parsed = MolFromSmiles(smiles, sanitize=True)
    except Exception:
        return 0
    return 0 if parsed is None else 1
    def _filter_by_mass_and_rt(
        self,
        possible_ranges: List[Tuple[float, float, str, str]],
        cpd_info: List[Tuple[str]],
    ) -> Tuple[Optional[str], Dict]:
        """Match one compound against candidate mass ranges (and optionally RT).

        The compound's exact mass is computed from its SMILES and compared
        with every range using strict inequalities. When ``self.filter_by_rt``
        is set, a range only counts as a hit if the predicted retention time
        is within ``self.rt_threshold`` of the experimental one for that peak.

        Parameters
        ----------
        possible_ranges : List[Tuple[float, float, str, str]]
            Entries indexed as (lower mass, upper mass, peak ID, adduct name).
        cpd_info : List[Tuple[str]]
            Indexed as ``[0]`` compound ID and ``[1]`` SMILES.

        Returns
        -------
        c_id_if_matched : str, optional
            The compound ID when at least one range matched, otherwise None.
        cpd_dict : Dict
            Keys ``"Predicted_RT"``, ``"Matched_Peak_IDs"`` and
            ``"Matched_Adducts"``.

        Raises
        ------
        ValueError
            If RT filtering is on and the dataset returns a falsy retention
            time for a candidate peak.
        """
        matched_id = None
        summary = {"Predicted_RT": None, "Matched_Peak_IDs": [], "Matched_Adducts": []}

        smiles = cpd_info[1]
        exact_mass = ExactMolWt(MolFromSmiles(smiles))
        rt_estimate = None  # predicted lazily, at most until a truthy value is obtained

        for mass_range in possible_ranges:
            if not (mass_range[0] < exact_mass < mass_range[1]):
                continue

            compound_id = cpd_info[0]
            peak_id = mass_range[2]
            adduct = mass_range[3]

            if self.filter_by_rt:
                # NOTE(review): truthiness checks treat 0 / 0.0 like "missing".
                if not rt_estimate:
                    rt_estimate = self._predict_rt(smiles)
                if not rt_estimate:
                    # sometimes can't predict RT due to missing vals in fingerprint
                    continue

                measured_rt = self.metabolomics_dataset.get_rt(peak_id)
                if not measured_rt:
                    raise ValueError(f"No retention time found for peak, {peak_id}")

                summary["Predicted_RT"] = rt_estimate
                if abs(measured_rt - rt_estimate) > self.rt_threshold:
                    # outside the tolerance: this peak is not recorded as a match
                    continue

            matched_id = compound_id
            summary["Matched_Peak_IDs"].append(peak_id)
            summary["Matched_Adducts"].append(adduct)

        return matched_id, summary
Beispiel #7
0
def save(vertices, edges, out='out.png'):
    """Render the molecule encoded by ``vertices``/``edges`` to an image file.

    The graph is converted to a SMILES string with ``deprocess``, parsed with
    RDKit and drawn to ``out`` at 800x800.

    Parameters
    ----------
    vertices, edges : passed unchanged to ``deprocess``.
    out : path of the image file to write.

    Returns
    -------
    The SMILES string produced by ``deprocess``.

    Raises
    ------
    ValueError
        If the SMILES string is empty or RDKit does not return a usable
        molecule for it.
    """
    from rdkit.Chem import Draw, MolFromSmiles

    s = deprocess(vertices, edges)
    # Reject the empty string before parsing: there is nothing to draw.
    if s == '':
        raise ValueError('deprocess returned an empty SMILES string')

    m = MolFromSmiles(s)
    if not m:
        raise ValueError('could not build a molecule from SMILES {!r}'.format(s))

    Draw.MolToFile(m, out, size=(800, 800))
    return s
Beispiel #8
0
def calculate_pIC50(mols):
    """Score every SMILES in ``mols`` with an ``MPNNReward``.

    Each SMILES is parsed, converted with ``convert_rdkit_to_nx`` and passed
    to ``MPNNReward._call``. The reward uses the module-level ``model``,
    ``atom_types`` and ``bond_types`` with ``maximize=False``.

    Returns
    -------
    list
        One score per input, in input order.
    """
    scores = []
    for smiles in mols:
        graph = convert_rdkit_to_nx(MolFromSmiles(smiles))
        # NOTE(review): the reward object is rebuilt for every molecule, as in
        # the original; hoist it only if MPNNReward is confirmed stateless.
        scorer_for_graph = MPNNReward(model,
                                      atom_types=atom_types,
                                      bond_types=bond_types,
                                      maximize=False)
        scores.append(scorer_for_graph._call(graph))
    return scores
Beispiel #9
0
 def __compound_to_dir__(compound):
     """Write a compound's SMILES and molfile and register it under ``root``.

     Creates the files ``smiles`` and ``molfile`` in the current working
     directory, changes into the directory returned by ``__mkd__`` and appends
     a ``Compound`` element (with ``Id`` and ``Cargos`` children) to the
     module-level XML ``root``.

     NOTE(review): both files are written *before* ``os.chdir``; whether that
     is intended depends on what ``__mkd__`` does - confirm.
     """
     compounds_dir = __mkd__(f'{compound["Compound Id"]}')

     with open('smiles', 'w') as smiles_file:
         smiles_file.write(compound["smiles"])

     with open('molfile', 'w') as mol_file:
         mol_block = MolToMolBlock(MolFromSmiles(compound["smiles"]))
         mol_file.write(mol_block)

     os.chdir(compounds_dir)

     compound_element = ET.SubElement(root, "Compound")
     id_element = ET.SubElement(compound_element, "Id")
     id_element.text = compound["Compound Id"]
     cargos_element = ET.SubElement(compound_element, "Cargos")
     cargos_element.text = "smiles molfile"
Beispiel #10
0
def tensorize(junc_tree_batch, vocab, use_graph_conv, assm=True):
    """Turn a batch of junction trees into encoder inputs.

    Parameters
    ----------
    junc_tree_batch : iterable of junction trees exposing ``smiles`` and
        ``nodes``.
    vocab : forwarded to ``set_batch_nodeID``.
    use_graph_conv : selects ``MolGraphEncoder`` (truthy) or the
        ``MessPassNet`` / ``JTMessPassNet`` pair (falsy).
    assm : when exactly ``False`` the candidate-assembly tensors are skipped.

    Returns
    -------
    tuple
        ``(batch, jtenc_holder, mol_holder)`` when ``assm is False``;
        otherwise ``(batch, jtenc_holder, mol_holder,
        (candidate_holder, cand_batch_idx), prop_batch)`` where ``prop_batch``
        holds ``MolLogP`` of every tree's SMILES.
    """
    set_batch_nodeID(junc_tree_batch, vocab)
    smiles_batch = [tree.smiles for tree in junc_tree_batch]
    jtenc_holder, mess_dict = JTNNEncoder.tensorize(junc_tree_batch)

    prop_batch = [
        Descriptors.MolLogP(MolFromSmiles(smi)) for smi in smiles_batch
    ]

    if use_graph_conv:
        molenc_holder = MolGraphEncoder.tensorize(smiles_batch)

        if assm is False:
            return junc_tree_batch, jtenc_holder, molenc_holder

        candidate_smiles, cand_batch_idx = [], []
        for tree_idx, tree in enumerate(junc_tree_batch):
            for node in tree.nodes:
                # leaf node's attachment is determined by neighboring node's attachment
                if not node.is_leaf and len(node.candidates) != 1:
                    candidate_smiles.extend(node.candidates)
                    cand_batch_idx.extend([tree_idx] * len(node.candidates))

        cand_molenc_holder = MolGraphEncoder.tensorize(candidate_smiles)
        cand_batch_idx = torch.LongTensor(cand_batch_idx)

        return junc_tree_batch, jtenc_holder, molenc_holder, (
            cand_molenc_holder, cand_batch_idx), prop_batch

    mpn_holder = MessPassNet.tensorize(smiles_batch)

    if assm is False:
        return junc_tree_batch, jtenc_holder, mpn_holder

    candidates, cand_batch_idx = [], []
    for tree_idx, tree in enumerate(junc_tree_batch):
        for node in tree.nodes:
            # leaf node's attachment is determined by neighboring node's attachment
            if not node.is_leaf and len(node.candidates) != 1:
                candidates.extend(
                    (candidate, tree.nodes, node)
                    for candidate in node.candidates)
                cand_batch_idx.extend([tree_idx] * len(node.candidates))

    jtmpn_holder = JTMessPassNet.tensorize(candidates, mess_dict)
    cand_batch_idx = torch.LongTensor(cand_batch_idx)

    return junc_tree_batch, jtenc_holder, mpn_holder, (
        jtmpn_holder, cand_batch_idx), prop_batch
Beispiel #11
0
 def test_bad_descriptors_padelpy_descriptors(self):
     """Invalid padelpy descriptor names must raise RuntimeError."""
     methane = MolFromSmiles("C")
     invalid_names = ("", "ReallyInvalidDescriptorName")
     for name in invalid_names:
         # A fresh Descriptor is built for every attempted name.
         descriptor = Descriptor()
         requested_type = f"padelpy:{name}"
         with self.assertRaises(RuntimeError):
             descriptor.make_fingerprint(
                 molecule_graph=methane,
                 fingerprint_type=requested_type,
                 fingerprint_params={'timeout': 2},
             )
Beispiel #12
0
        def fingerprints():
            """Return Morgan bit-vector fingerprints of ``self.features`` as an array.

            ``self``, ``bond_radius`` and ``nBits`` come from the enclosing
            scope; every entry of ``self.features`` is parsed as a SMILES.
            """
            molecules = [MolFromSmiles(entry) for entry in self.features]

            bit_vectors = []
            for molecule in molecules:
                bit_vectors.append(
                    AllChem.GetMorganFingerprintAsBitVect(molecule,
                                                          bond_radius,
                                                          nBits=nBits))

            return np.asarray(bit_vectors)
Beispiel #13
0
def smiles_validator(smiles):
    """Validate that ``smiles`` is a parseable SMILES string.

    Returns
    -------
    bool
        Always True when validation succeeds.

    Raises
    ------
    ValueError
        If ``smiles`` is a number, is None, is blank after stripping, or
        ``MolFromSmiles`` does not return an RDKit ``Mol`` for it.
    """
    if isinstance(smiles, numbers.Number):
        raise ValueError("Molecules must be valid SMILES notation not integer.")

    is_blank = smiles is None or smiles.strip() == ""
    if is_blank:
        raise ValueError("smiles field must not be empty")

    parsed = MolFromSmiles(smiles)
    if not isinstance(parsed, rdkit.Chem.rdchem.Mol):
        raise ValueError("Molecules must be valid SMILE notation of chemical.")
    return True
Beispiel #14
0
def is_correct_smiles(smiles):
    """
    Report (as 1 or 0) whether RDKit can parse and sanitize the SMILES string.

    The empty string is rejected up front, and any exception raised during
    parsing counts as invalid.
    """
    if smiles == "":
        return 0

    try:
        parsed = MolFromSmiles(smiles, sanitize=True)
    except Exception:
        return 0
    return 1 if parsed is not None else 0
Beispiel #15
0
    def depict(self, filename=None, ipython=False):
        """Draw this object's atoms and bonds as an SVG.

        An editable molecule is assembled from ``self.atoms``,
        ``self.enviroments``, ``self.Bonds`` and ``self.bondsenvironments``,
        embedded with ``EmbedMolecule`` and rendered on a 400x200 SVG canvas.

        Parameters
        ----------
        filename : optional path; when given, the SVG text is written there.
        ipython : when truthy, return an ``IPython.display.SVG`` object.

        Returns
        -------
        ``SVG`` (with the ``svg:`` prefixes stripped) if ``ipython`` is
        truthy, otherwise None.
        """
        # NOTE(review): IPythonConsole is not referenced below; it is kept
        # because importing it may matter for notebook rendering - confirm.
        from rdkit.Chem.Draw import IPythonConsole  # noqa: F401
        from rdkit.Chem.Draw import rdMolDraw2D
        from rdkit.Chem.AllChem import EmbedMolecule
        from IPython.display import SVG
        from rdkit.Chem import RWMol, MolFromSmiles, ChiralType

        # Start from a one-carbon placeholder (index 0); copied atoms therefore
        # receive indices starting at 1, and the placeholder is removed later.
        rmol = RWMol(MolFromSmiles('C'))

        dict_old_new_idx = {}
        n = 1
        for a in self.atoms:
            old_idx = a.GetIdx()
            rmol.AddAtom(a)
            dict_old_new_idx[old_idx] = n
            n += 1

        for a in self.enviroments:
            old_idx = a.GetIdx()
            # Environment atoms are stripped of chirality and aromaticity
            # before being added.
            a.SetChiralTag(ChiralType.CHI_UNSPECIFIED)
            a.SetIsAromatic(0)
            rmol.AddAtom(a)
            dict_old_new_idx[old_idx] = n
            n += 1

        for bond_group in (self.Bonds, self.bondsenvironments):
            for b in bond_group:
                rmol.AddBond(dict_old_new_idx[b.GetBeginAtomIdx()],
                             dict_old_new_idx[b.GetEndAtomIdx()],
                             b.GetBondType())

        rmol.RemoveAtom(0)

        EmbedMolecule(rmol)
        drawer = rdMolDraw2D.MolDraw2DSVG(400, 200)
        drawer.DrawMolecule(rmol)
        drawer.FinishDrawing()
        svg = drawer.GetDrawingText()

        if filename is not None:
            # Context manager guarantees the handle is closed even if the
            # write fails.
            with open(filename, 'w') as f:
                f.write(svg)

        if ipython:
            return SVG(svg.replace('svg:', ''))
        return None
Beispiel #16
0
def batch_diversity(smiles, train_smiles):
    """
    Score each entry of ``smiles`` against a random sample of training SMILES.

    100 training SMILES are drawn with ``random.sample`` and converted to
    radius-4, 2048-bit Morgan fingerprints; ``diversity`` is then applied to
    every input through ``apply_to_valid`` with those fingerprints.
    """
    sampled = random.sample(train_smiles, 100)
    reference_mols = [MolFromSmiles(entry) for entry in sampled]
    fps = []
    for mol in reference_mols:
        fps.append(Chem.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048))
    return [apply_to_valid(entry, diversity, fps=fps) for entry in smiles]
Beispiel #17
0
def generate_drug_list():
    """Filter ``drug_list_copy.csv`` down to rows with RDKit-parseable SMILES.

    Reads the CSV from ``DRUG_LIST_PATH``, drops the ``'Unnamed: 0'`` entry of
    every row, keeps rows whose ``smiles`` value parses, writes the result to
    ``drug_list.csv`` in the same directory and re-reads it to assert that no
    unparseable SMILES remain.
    """
    source_path = os.path.join(DRUG_LIST_PATH, 'drug_list_copy.csv')
    source_df = pd.read_csv(source_path)

    kept_rows = []
    for _, row in source_df.iterrows():
        record = dict(row)
        record.pop('Unnamed: 0')
        if MolFromSmiles(record['smiles']) is None:
            continue
        kept_rows.append(record)

    target_path = os.path.join(DRUG_LIST_PATH, 'drug_list.csv')
    pd.DataFrame(data=kept_rows).to_csv(target_path)

    # Round-trip check on the file that was just written.
    reloaded = pd.read_csv(target_path)
    unparseable = sum(
        MolFromSmiles(smiles) is None for smiles in reloaded['smiles'])
    assert unparseable == 0
Beispiel #18
0
    def test_hypergraph_rpe_parser_bad_smiles(self):
        """Every entry of ``bad_smiles`` must parse and normalize without error.

        On ``AssertionError`` or ``IndexError`` the offending SMILES is printed
        and the exception is re-raised.
        """
        grammar = HypergraphGrammar()

        parsed_trees = []
        for smiles in bad_smiles:
            try:
                graph = hypergraph_parser(MolFromSmiles(smiles))
                parsed_trees.append(grammar.normalize_tree(graph))
            except (AssertionError, IndexError):
                print('Failed for {}'.format(smiles))
                raise
def check_node_type(new_compound):
    """Select candidate SMILES that parse, have no large ring and dock well.

    A compound is kept when ``MolFromSmiles`` returns a molecule, its largest
    ring in the cycle basis has at most 6 atoms, and ``rdock_score`` returns a
    value below ``10**10``.

    Parameters
    ----------
    new_compound : sequence of SMILES strings.

    Returns
    -------
    node_index : list of int
        Positions (within ``new_compound``) of the kept compounds.
    score : list
        ``rdock_score`` value of every kept compound.
    valid_compound : list
        The kept SMILES strings.
    """
    node_index = []
    valid_compound = []
    score = []

    for i, smiles in enumerate(new_compound):
        # Parse once and reuse the molecule; unparseable input is skipped.
        try:
            mol = MolFromSmiles(smiles)
        except Exception:
            mol = None
        if mol is None:
            continue

        rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        largest_ring = max((len(ring) for ring in rings), default=0)
        if largest_ring > 6:
            continue

        docking_score = rdock_score(smiles)
        if docking_score < 10**10:
            node_index.append(i)
            valid_compound.append(smiles)
            score.append(docking_score)

    return node_index, score, valid_compound
def scorer(smiles):
    """Compute SA, logP and ring-size scores plus their standardized sum.

    Every SMILES is canonicalized (``MolToSmiles`` with
    ``isomericSmiles=True``) and re-parsed once; the resulting molecule is
    reused for all three descriptors.

    Parameters
    ----------
    smiles : sequence of SMILES strings.

    Returns
    -------
    SA_scores : list
        Negated ``sascorer.calculateScore`` per molecule.
    logP_values : list
        ``Descriptors.MolLogP`` per molecule.
    cycle_scores : list
        ``-(largest ring size - 6)`` for rings larger than 6 atoms, else 0.
    targets : numpy.ndarray
        Sum of the three scores after each is standardized (mean subtracted,
        divided by the standard deviation). A zero standard deviation is not
        special-cased, so the result follows NumPy's division rules.
    """
    mols = [
        MolFromSmiles(MolToSmiles(MolFromSmiles(smi), isomericSmiles=True))
        for smi in smiles
    ]

    logP_values = [Descriptors.MolLogP(mol) for mol in mols]
    SA_scores = [-sascorer.calculateScore(mol) for mol in mols]

    cycle_scores = []
    for mol in mols:
        rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        largest_ring = max((len(ring) for ring in rings), default=0)
        # Only the excess over a six-membered ring is penalized.
        cycle_scores.append(-max(largest_ring - 6, 0))

    def _standardize(values):
        # z-score of a plain Python list
        return (np.array(values) - np.mean(values)) / np.std(values)

    targets = (_standardize(SA_scores) + _standardize(logP_values) +
               _standardize(cycle_scores))

    return (SA_scores, logP_values, cycle_scores, targets)
Beispiel #21
0
def canonicalize_and_filter(smi_list, showprogress=False):
    """
    Return the unique canonical SMILES from ``smi_list`` that pass the
    heavy-atom / logP filter.

    Every input SMILES that RDKit can parse is converted to canonical SMILES;
    duplicates are dropped (first occurrence kept). A canonical SMILES is kept
    when the molecule has more than 17 heavy atoms and ``Crippen.MolLogP`` is
    at most 5.

    NOTE(review): the heavy-atom test (``> 17``) is unchanged from the
    original code, whose docstring spoke of *dropping* molecules with more
    than 17 heavy atoms - confirm which direction is intended.

    Parameters
    ----------
    smi_list : iterable of SMILES strings.
    showprogress : when truthy, print status lines and wrap the loops in
        ``tqdm``.

    Returns
    -------
    list of str
        Canonical SMILES that passed the filter.
    """
    if showprogress:
        print('Canonicalising mols')
    canonical_smiles = []
    for smi in (tqdm(smi_list) if showprogress else smi_list):
        mol = MolFromSmiles(smi)
        if mol is not None:
            # Always store the canonical SMILES string (never the Mol object)
            # so deduplication and the re-parse below work in both modes.
            canonical_smiles.append(MolToSmiles(mol))

    unique_smiles = list(dict.fromkeys(canonical_smiles))

    if showprogress:
        print('Size of unfiltered final library: {}'.format(
            len(unique_smiles)))
        print('Filtering by n_heavy and logP:')
    final_list = []
    for smi in (tqdm(unique_smiles) if showprogress else unique_smiles):
        mol = MolFromSmiles(smi)
        if mol.GetNumHeavyAtoms() > 17 and Crippen.MolLogP(mol) <= 5:
            final_list.append(smi)
    return final_list
Beispiel #22
0
def logp(smiles):
    """Return the standardized logP of a SMILES string.

    The raw ``Descriptors.MolLogP`` value is shifted by ``LOGP_MEAN`` and
    scaled by ``LOGP_STD``.

    Returns
    -------
    float
        The standardized logP, or ``-np.inf`` when RDKit fails to compute
        logP for the parsed molecule (the molecule and SMILES are printed).
    """
    mol = MolFromSmiles(smiles)
    try:
        log_p = Descriptors.MolLogP(mol)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit are not
        # swallowed.
        print(mol)
        print(smiles)
        return -np.inf

    return (log_p - LOGP_MEAN) / LOGP_STD
Beispiel #23
0
    def contains(self, smiles):
        """
        Check whether ``self.rdmol`` contains the given substructure.

        Parameters
        ----------
        smiles : object converted with ``str`` and parsed as SMILES to form
            the query.

        Returns
        -------
        contains : boolean
            Result of ``self.rdmol.HasSubstructMatch`` for the parsed query.
        """
        query = MolFromSmiles(str(smiles))
        return self.rdmol.HasSubstructMatch(query)
Beispiel #24
0
    def analyze(self, smiles: List[str], only_drugs=True) -> pd.DataFrame:
        """Tabulate RDKit properties and model scores for a list of SMILES.

        Parameters
        ----------
        smiles : SMILES strings; each must be parseable by RDKit.
        only_drugs : when truthy, keep only rows with weight < 500,
            hdonors <= 5, hacceptors <= 10, logp <= 5, safety > 0.75 and
            feasibility > 0.75, then reset the index.

        Returns
        -------
        pd.DataFrame
            Columns ``key``, ``smiles``, ``weight``, ``logp``, ``hdonors``,
            ``hacceptors``, ``safety``, ``feasibility`` and ``bbbp``.

        Raises
        ------
        ValueError
            If any SMILES cannot be parsed.
        """
        features = self.preprocessor.transform(smiles)

        # RDKit molecular properties, one list per output column
        inchikey, weight, logp, hdonors, hacceptors = [], [], [], [], []
        for example in smiles:
            mol = MolFromSmiles(example)
            if not mol:
                raise ValueError("Malformed molecule passed in to analyze")

            inchikey.append(MolToInchiKey(mol))
            weight.append(ExactMolWt(mol))
            logp.append(MolLogP(mol))
            hdonors.append(NumHDonors(mol))
            hacceptors.append(NumHAcceptors(mol))

        # Model scores
        safety = self.safety.predict(features)
        feasibility = self.feasibility.predict(features)
        bbbp = self.bbbp.predict_proba(features)

        columns = {
            "key": inchikey,
            "smiles": smiles,
            "weight": weight,
            "logp": logp,
            "hdonors": hdonors,
            "hacceptors": hacceptors,
            "safety": safety,
            "feasibility": feasibility,
            # second entry of every predict_proba row
            "bbbp": (row[1] for row in bbbp),
        }
        dataframe = pd.DataFrame(columns)

        if not only_drugs:
            return dataframe

        # Lipinsky's rules combined with the toxicity / feasibility cut-offs
        keep = (
            (dataframe.weight < 500)
            & (dataframe.hdonors <= 5)
            & (dataframe.hacceptors <= 10)
            & (dataframe.logp <= 5)
            & (dataframe.safety > 0.75)
            & (dataframe.feasibility > 0.75)
        )
        return dataframe[keep].reset_index(drop=True)
Beispiel #25
0
def make_mass_spectra(smiles_list):
    """Show a histogram of exact masses for the given SMILES strings.

    Each SMILES is parsed with RDKit and its ``ExactMolWt`` is binned into
    unit-wide bins covering 0-499, then displayed with ``plt.show()``.

    Parameters
    ----------
    smiles_list : iterable of SMILES strings.
    """
    weights = [ExactMolWt(MolFromSmiles(smiles)) for smiles in smiles_list]
    # make a bar graph of the masses simulated by MOD.
    plt.hist(weights, bins=range(500))
    plt.xlabel("Exact Mass")
    plt.ylabel("Frequency")
    plt.title(
        "Mass spectra of the molecules simulated in the reaction network.")
    plt.show()
Beispiel #26
0
def get_target_data(data, target_id, act_type='IC50'):
    """Return the ligand rows of one target, filtered for validity and mass.

    Parameters
    ----------
    data : DataFrame with ``act_type``, ``target_id`` and ``smiles`` columns.
    target_id : value matched against ``data.target_id``.
    act_type : when not None, only rows with this ``act_type`` are kept.

    Returns
    -------
    DataFrame
        Rows whose SMILES RDKit can parse and whose exact mass lies in
        [100, 600]. All columns are preserved, also when no row matches.
    """
    if act_type is not None:
        data = data[data.act_type == act_type]
    target_data = data[data.target_id == target_id]

    # Filter by molecules that can be converted by rdkit
    parsed = []
    for smiles in target_data.smiles:
        try:
            parsed.append(MolFromSmiles(smiles))
        except Exception:
            parsed.append(None)
    # A boolean ndarray (not a list) keeps this a row mask even when empty;
    # an empty list would be read as "select no columns".
    is_valid = np.array([mol is not None for mol in parsed], dtype=bool)
    target_data = target_data[is_valid]

    # Filter by weight, reusing the molecules parsed above
    weights = pd.Series(
        [ExactMolWt(mol) for mol in parsed if mol is not None],
        index=target_data.index,
        dtype=float,
    )
    target_data = target_data[(weights >= 100) & (weights <= 600)]
    return target_data
Beispiel #27
0
def canonicalize(mol_list, showprogress=False):
    """
    Deduplicate RDKit molecules through their canonical SMILES.

    ``None`` entries are skipped. The remaining molecules are converted with
    ``MolToSmiles``, the strings are made unique with a set, and every unique
    string is parsed back with ``MolFromSmiles``. With ``showprogress`` a
    status line is printed and both passes are wrapped in ``tqdm``.
    """
    if showprogress:
        print('Canonicalising mols')
        source = tqdm(mol_list)
    else:
        source = mol_list

    unique_smiles = list(
        {MolToSmiles(mol) for mol in source if mol is not None})

    to_parse = tqdm(unique_smiles) if showprogress else unique_smiles
    return [MolFromSmiles(smi) for smi in to_parse]
Beispiel #28
0
def smiles_to_bits(smiles, nBits):
    """Convert SMILES strings to a DataFrame of Morgan fingerprint bits.

    Each SMILES is parsed, fingerprinted with radius 2 and ``nBits`` bits, and
    copied into an ``int8`` NumPy array via
    ``DataStructs.ConvertToNumpyArray``; the arrays become the DataFrame rows.
    """
    def _to_array(fingerprint):
        # Seed array handed to ConvertToNumpyArray, which fills it in place.
        buffer = np.zeros((1, ), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fingerprint, buffer)
        return buffer

    molecules = [MolFromSmiles(entry) for entry in smiles]
    fingerprints = [
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nBits)
        for mol in molecules
    ]
    return pd.DataFrame([_to_array(fp) for fp in fingerprints])
    def find_ptn_from(self, node, ptn_smiles):
        """Find a match of the pattern that is adjacent to ``node``.

        ``ptn_smiles`` is parsed as SMILES and matched against ``self.mol``.
        The first match that does not contain ``node`` but has at least one
        atom joined to ``node`` by an edge of ``self.G`` is returned; None is
        returned when there is no such match.
        """
        pattern = MolFromSmiles(ptn_smiles)

        for candidate in self.mol.GetSubstructMatches(pattern):
            if node not in candidate and any(
                    self.G.has_edge(node, atom_idx) for atom_idx in candidate):
                return candidate

        return None
Beispiel #30
0
def test_remove_stereo():
    """Exercise ``Filters.remove_stereo`` and ``Filters.commute_inchi``."""
    # Stereo centre given in SMILES
    lactate = MolFromSmiles('C[C@@H](C(=O)[O-])O')
    stripped = Filters.remove_stereo(lactate)
    assert MolToSmiles(stripped) == 'CC(O)C(=O)[O-]'

    # Double-bond stereo given in InChI
    from_inchi = MolFromInchi(
        'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    )
    stripped = Filters.remove_stereo(from_inchi)
    expected_no_stereo = 'OC1=NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(O)=Nc2ccccc21'
    assert MolToSmiles(stripped) == expected_no_stereo

    commuted = Filters.commute_inchi(stripped)  # Expected to change tautomerism
    expected_commuted = 'O=C1NC(C2=CNC3=C2C=C(O)C=C3)=CC1=C1C(=O)NC2=CC=CC=C21'
    assert MolToSmiles(commuted) == expected_commuted