Python Chem.CanonSmiles Beispiele, rdkit.Chem.CanonSmiles Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: test_on_datasets.py Projekt: ncfrey/selfies

def is_same_mol(smiles1, smiles2):
    """Tell whether two SMILES strings canonicalize to the same string.

    Both inputs are passed through ``Chem.CanonSmiles`` and the resulting
    strings are compared for equality.

    Returns:
        bool: ``True`` when both canonical strings are equal; ``False`` when
        they differ or when canonicalizing either input raised an exception.
    """
    try:
        return Chem.CanonSmiles(smiles1) == Chem.CanonSmiles(smiles2)
    except Exception:
        # Any failure while canonicalizing is reported as "not the same".
        return False

Beispiel #2

0

Datei anzeigen

 def test_queryStep(self):
     """Exercise ``_queryStep("remove", ...)`` after adding a ring.

     The environment is seeded with 'C(=O)O', a 'C1=CC=CC=C1' fragment is
     added at the back, and a "remove" query for 'Arom6' is issued.  The
     query result must be truthy and the environment's SMILES must then
     canonicalize to the same string as the seed.
     """
     seed_smiles = 'C(=O)O'
     self.env.seed(seed_smiles)
     self.action.setAction("add", pos="back", mol='C1=CC=CC=C1')
     self.env.step(self.action)

     removed = self.env._queryStep("remove", query=np.array(['Arom6']))
     self.assertTrue(removed)

     observed = Chem.CanonSmiles(self.env._listToSmiles())
     expected = Chem.CanonSmiles(seed_smiles)
     self.assertEqual(expected, observed)

Beispiel #3

0

Datei anzeigen

Datei: rough_test.py Projekt: lmmentel/rdkit

 def testDeprotect(self):
   """Deprotect a doubly protected amine and check the result and its props.

   The product SMILES must equal the canonical form of "NCc1ccccc1N", and
   the product's properties must list two "Boc" deprotections with a
   count of 2.
   """
   protected = Chem.MolFromSmiles("N(C(=O)OC(C)(C)C)Cc1ccccc1NC(=O)OC(C)(C)C")
   deprotected = rd.Deprotect(protected)

   expected_smiles = Chem.CanonSmiles("NCc1ccccc1N")
   self.assertEqual(Chem.MolToSmiles(deprotected), expected_smiles)

   applied = list(deprotected.GetPropsAsDict()["DEPROTECTIONS"])
   self.assertEqual(applied, ["Boc", "Boc"])

   applied_count = deprotected.GetPropsAsDict()["DEPROTECTION_COUNT"]
   self.assertEqual(applied_count, 2)

Beispiel #4

0

Datei anzeigen

Datei: build_db.py Projekt: rburri/fragalysis

def main():
    """Command-line entry point: build and write a fragment network.

    Reads molecules from ``--input`` (format given by ``--input_format``),
    wraps each one in an ``Attr`` keyed by its canonical isomeric SMILES,
    builds the network with ``build_network`` and writes the result into
    ``--base_dir`` (created when it does not exist yet).

    Command-line arguments:
        --input: path of the SMILES or SD file to read (required).
        --input_format: input format handed to ``parse_mols`` (default "smi").
        --base_dir: output directory (required).
        --iso_flag: boolean handed to ``NodeHolder`` (default True).
            Accepts true/false, yes/no, 1/0 in any letter case.
    """

    def _as_bool(text):
        """Turn a command-line token into a real bool.

        Without a converter argparse keeps the raw string, so
        ``--iso_flag False`` would be truthy.
        """
        lowered = str(text).strip().lower()
        if lowered in ("1", "true", "t", "yes", "y"):
            return True
        if lowered in ("0", "false", "f", "no", "n"):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got %r" % (text,))

    parser = argparse.ArgumentParser(
        description="Convert a SMILES or SDFile to input for Astex Fragment network.")
    # Both paths are used unconditionally below, so fail early with a clear
    # argparse error instead of a TypeError on None.
    parser.add_argument("--input", required=True)
    parser.add_argument("--input_format", default="smi")
    parser.add_argument("--base_dir", required=True)
    parser.add_argument("--iso_flag", type=_as_bool, default=True)

    args = parser.parse_args()

    attrs = []
    for x in tqdm(parse_mols(args.input, args.input_format)):
        if x is None:
            # Skip entries yielded as None by parse_mols.
            continue
        attrs.append(
            Attr(
                Chem.CanonSmiles(Chem.MolToSmiles(x, isomericSmiles=True)),
                ["EM", x.GetProp("_Name")],
            ))

    # makedirs also creates missing parent directories.
    os.makedirs(args.base_dir, exist_ok=True)

    # Build the network
    node_holder = NodeHolder(iso_flag=args.iso_flag)
    node_holder = build_network(attrs, node_holder)
    # Write the data out
    write_data(args.base_dir, node_holder, attrs)

Beispiel #5

0

Datei anzeigen

def A2AR(input, out):
    """Build the A2AR set used for fine-tuned model and predictor training.

    Arguments:
        input (str): path of a tab-delimited data file containing at least
            the columns CMPD_CHEMBLID, CANONICAL_SMILES, PCHEMBL_VALUE and
            ACTIVITY_COMMENT.
        out (str): path of the tab-delimited file that receives the refined
            table.

    Rows with a missing value in any of the four columns are dropped.  Each
    remaining SMILES has '[NH+]', '[NH2+]' and '[NH3+]' replaced by 'N', has
    the digits directly after '[' removed, and is reduced to its longest
    '.'-separated fragment.  Rows whose cleaned SMILES contains '[Au]',
    '[As]', '[Hg]' or '[Se]', or has fewer than two 'C'/'c' characters, are
    removed; all other rows get the output of ``Chem.CanonSmiles(smile, 0)``.
    """
    df = pd.read_table(input)
    df = df[[
        'CMPD_CHEMBLID', 'CANONICAL_SMILES', 'PCHEMBL_VALUE',
        'ACTIVITY_COMMENT'
    ]]
    df = df.dropna()

    # Raw string: '\[' and '\d' are invalid escapes in a normal literal.
    isotope_prefix = re.compile(r'\[\d+')
    excluded_atoms = ('[Au]', '[As]', '[Hg]', '[Se]')
    rejected = []

    for i, row in df.iterrows():
        # replacing the nitrogen electrical group to nitrogen atom "N"
        smile = row['CANONICAL_SMILES'].replace('[NH+]', 'N').replace(
            '[NH2+]', 'N').replace('[NH3+]', 'N')
        # removing the isotope label of each atom
        smile = isotope_prefix.sub('[', smile)
        # keeping the longest fragment (first one wins on ties)
        if '.' in smile:
            smile = max(smile.split('.'), key=len)
        # Filter before canonicalizing so discarded rows are never handed to
        # RDKit, and collect the labels so the frame is dropped from once.
        too_few_carbons = smile.count('C') + smile.count('c') < 2
        if too_few_carbons or any(tag in smile for tag in excluded_atoms):
            rejected.append(i)
            continue
        # Transforming into canonical SMILES based on the RDKit built-in
        # algorithm.
        df.loc[i, 'CANONICAL_SMILES'] = Chem.CanonSmiles(smile, 0)

    df = df.drop(index=rejected)
    # df = df.drop_duplicates(subset='CANONICAL_SMILES')
    df.to_csv(out, index=False, sep='\t')

Beispiel #6

0

Datei anzeigen

def smiles(dir="../data/xyz/DB3/", verbose=1):
    """Collect one SMILES string per file found in *dir*.

    For every directory entry the file is first read as an "xyz" structure
    with pybel, written out as SMILES, canonicalized with
    ``Chem.CanonSmiles`` and reduced to its first whitespace-separated
    token.  When any step of that fails, the file's first line is used
    instead, provided it is longer than 5 characters (the line is stored
    as read, including its line ending).

    Arguments:
        dir (str): directory to scan; it is concatenated directly with each
            file name, so it must end with a path separator.
        verbose (int): when equal to 1, a progress counter is written to
            stdout.

    Returns:
        tuple: ``(names, ret_list)`` where ``names`` holds the file names
        that produced an entry and ``ret_list`` the matching SMILES strings.
    """
    # NOTE(review): the listing goes through the shell (`ls | sort -d`) and is
    # split on whitespace, so file names containing blanks are not supported.
    # Kept as-is so the ordering of the returned lists does not change.
    dir_str = "ls " + str(dir) + " | sort -d "
    entries = str(os.popen(dir_str).read()).split()
    total = len(entries)
    ret_list = []
    names = []

    def _progress(position):
        """Write the "<position> / <total>" counter when verbose == 1."""
        if verbose == 1:
            sys.stdout.write("\r %s / " % position + str(total))
            sys.stdout.flush()

    for j, i in enumerate(entries):
        try:
            mol = next(pybel.readfile("xyz", dir + i))
            smi = Chem.CanonSmiles(mol.write(format="smi"))
            ret_list.append(smi.split()[0].strip())
            names.append(i)
            _progress(j)
        except Exception:
            # Fallback: treat the file as plain text with the SMILES on the
            # first line.
            try:
                with open(dir + i, "r") as f:
                    smi = f.readline()
                # The original `len(smi > 5)` raised TypeError, which was
                # swallowed, so this branch never appended anything.
                if len(smi) > 5:
                    ret_list.append(smi)
                    names.append(i)
                _progress(j)
            except (OSError, UnicodeDecodeError):
                # Best effort: unreadable entries are skipped.
                pass
    return names, ret_list

Beispiel #7

0

Datei anzeigen

def main():
    """Command-line entry point: build a fragment network from a molecule file.

    Command-line arguments:
        --input: path of the SMILES or SD file to read (must exist).
        --input_format: format handed to ``parse_mols`` (default "smi").
        --base_dir: existing output directory.
        --isomeric / --non_isomeric: set ``iso_flag`` for ``NodeHolder``
            (default True).
        -v / -vv: verbosity 1 or 2 (default 0); mutually exclusive.

    The process exits with status 1 after printing an error when the input
    or base directory is missing or does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Convert a SMILES or SDFile to input for Astex Fragment network.")
    parser.add_argument("--input")
    parser.add_argument("--input_format", default="smi")
    parser.add_argument("--base_dir")
    parser.add_argument("--isomeric", dest="iso_flag", action="store_true")
    parser.add_argument("--non_isomeric",
                        dest="iso_flag",
                        action="store_false")

    group = parser.add_mutually_exclusive_group()
    group.add_argument("-v", dest="verbosity", action="store_const", const=1)
    group.add_argument("-vv", dest="verbosity", action="store_const", const=2)

    parser.set_defaults(verbosity=0)
    parser.set_defaults(iso_flag=True)

    args = parser.parse_args()

    # Validate the input file and the base directory before doing any work.
    if not args.input:
        print('ERROR: Must specify an input')
        sys.exit(1)
    if not os.path.isfile(args.input):
        print('ERROR: input (%s) does not exist' % args.input)
        sys.exit(1)
    if not args.base_dir:
        print('ERROR: Must specify a base directory')
        sys.exit(1)
    if not os.path.isdir(args.base_dir):
        print('ERROR:input base directory (%s) does not exist' % args.base_dir)
        sys.exit(1)

    # The progress bar is switched off whenever a verbosity level is set.
    tqdm_disable = bool(args.verbosity)
    attrs = []
    mols = parse_mols(args.input, args.input_format)
    for x in tqdm(mols, disable=tqdm_disable):
        # Guard first: the original printed Chem.MolToSmiles(x) before this
        # check, which fails when x is None.
        if x is None:
            continue
        iso_smiles = Chem.MolToSmiles(x, isomericSmiles=True)
        print("Processing " + iso_smiles)
        attrs.append(
            Attr(
                Chem.CanonSmiles(iso_smiles),
                ["EM", x.GetProp("_Name")],
            ))

    # Kept from the original: re-create the directory if it has disappeared
    # since the check above.
    if not os.path.isdir(args.base_dir):
        os.mkdir(args.base_dir)
    # Build the network
    node_holder = NodeHolder(iso_flag=args.iso_flag)
    max_frags = 0
    node_holder = build_network(attrs, node_holder, max_frags, args.base_dir,
                                args.verbosity)
    # Write the data out
    write_data(args.base_dir, node_holder, attrs)

Beispiel #8

0

Datei anzeigen

Datei: dataset.py Projekt: XuhanLiu/DrugEx

def graph_corpus(input, output, suffix='sdf'):
    """Encode molecules with the graph vocabulary and save the codes as a table.

    Arguments:
        input (str): path of the molecule file.  With ``suffix == 'sdf'`` it
            is opened with ``gzip.open`` and read through
            ``Chem.ForwardSDMolSupplier``; otherwise it is read with
            ``pd.read_table`` and must provide the columns 'Smiles' and
            'Molecule ChEMBL ID'.
        output (str): path of the tab-delimited file that receives the codes,
            indexed by molecule id.
        suffix (str): 'sdf' selects the SD-file branch; any other value
            selects the table branch.

    Each molecule is run through metal disconnection, normalization and
    largest-fragment selection.  Molecules with fewer than 4 or at least 63
    bonds, without a 'C' atom, or containing one of the listed metals are
    skipped.  A molecule is kept only when decoding its code reproduces the
    canonical SMILES.
    """
    metals = {'Na', 'Zn', 'Li', 'K', 'Ca', 'Mg', 'Ag', 'Cs', 'Ra', 'Rb', 'Al', 'Sr', 'Ba', 'Bi'}
    voc = utils.VocGraph('data/voc_atom.txt')
    inf = None
    if suffix == 'sdf':
        # Only the SD-file branch reads through the gzip handle.
        inf = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(inf)
        total = 2e6
    else:
        table = pd.read_table(input).drop_duplicates(subset=['Smiles']).dropna(subset=['Smiles'])
        total = len(table)
        mols = table.iterrows()
    vals = {}
    exps = {}
    codes, ids = [], []
    try:
        chooser = rdMolStandardize.LargestFragmentChooser()
        disconnector = rdMolStandardize.MetalDisconnector()
        normalizer = rdMolStandardize.Normalizer()
        for mol in tqdm(mols, total=total):
            if mol is None:
                continue
            if suffix != 'sdf':
                idx = mol[1]['Molecule ChEMBL ID']
                mol = Chem.MolFromSmiles(mol[1].Smiles)
                if mol is None:
                    # Unparsable SMILES: report and skip rather than failing
                    # later on mol.GetAtoms().
                    print('Parse Error:', idx)
                    continue
            else:
                idx = mol.GetPropsAsDict()['chembl_id']
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
            except Exception:
                # Best effort, as before: report the id and continue with the
                # molecule in whatever state standardization left it.
                print(idx)
            symb = [a.GetSymbol() for a in mol.GetAtoms()]
            # Number of bonds
            bonds = mol.GetBonds()
            if len(bonds) < 4 or len(bonds) >= 63:
                continue
            if {'C'}.isdisjoint(symb):
                continue
            if not metals.isdisjoint(symb):
                continue

            smile = Chem.MolToSmiles(mol)
            try:
                s0 = smile.replace('[O]', 'O').replace('[C]', 'C') \
                     .replace('[N]', 'N').replace('[B]', 'B') \
                     .replace('[2H]', '[H]').replace('[3H]', '[H]')
                s0 = Chem.CanonSmiles(s0, 0)
                code = voc.encode([smile])
                s1 = voc.decode(code)[0]
                # Explicit check instead of `assert`, which is stripped under
                # `python -O` and would let mismatching codes through.
                if s0 != s1:
                    raise ValueError('round-trip mismatch: %s != %s' % (s0, s1))
                codes.append(code[0].reshape(-1).tolist())
                ids.append(idx)
            except Exception as ex:
                print(ex)
                print('Parse Error:', idx)
    finally:
        if inf is not None:
            inf.close()
    df = pd.DataFrame(codes, index=ids, columns=['C%d' % i for i in range(64*4)])
    df.to_csv(output, sep='\t', index=True)
    # Both dicts are never filled in this function; the prints are kept so
    # the console output stays unchanged.
    print(vals)
    print(exps)

Beispiel #9

0

Datei anzeigen

def make_canon_smiles(smiles):
    """Canonicalize every SMILES string in *smiles*.

    Arguments:
        smiles: iterable of SMILES strings.

    Returns:
        list | None: the canonical SMILES in input order, or ``None`` as soon
        as one entry cannot be canonicalized (a message naming that entry is
        printed first).
    """
    canon_smiles = []
    for mol in smiles:
        try:
            canon_smiles.append(Chem.CanonSmiles(mol))
        except Exception:
            # `except Exception` instead of a bare except so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            print('Invalid SMILE:', mol)
            return None
    return canon_smiles

Beispiel #10

0

Datei anzeigen

 def __call__(self, molecule):
     """Compute and store the fitness of *molecule*.

     The molecule's SMILES is canonicalized and parsed, fingerprinted with
     ``self.get_fingerprint`` using ``self.fingerprint_type``, and compared
     to ``self.target_fingerprint`` with ``TanimotoSimilarity``.

     Returns:
         The same *molecule* object, with its ``fitness`` attribute set.
         (The previous ``-> None`` annotation contradicted this return.)
     """
     molecular_graph = Chem.MolFromSmiles(Chem.CanonSmiles(molecule.smiles))
     molecule_fingerprint = self.get_fingerprint(molecular_graph, self.fingerprint_type)
     molecule.fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint)
     return molecule

Beispiel #11

0

Datei anzeigen

Datei: illuminate.py Projekt: Jonas-Verhellen/Argenomic

 def load_from_database(self) -> List[Molecule]:
     """Draw the initial molecules from the CSV file named by ``self.data_file``.

     ``self.initial_size`` entries are sampled from the 'smiles' column;
     each one is canonicalized and wrapped in a ``Molecule`` carrying the
     ("database", "no reaction", "no parent") pedigree.
     """
     pedigree = ("database", "no reaction", "no parent")
     csv_path = hydra.utils.to_absolute_path(self.data_file)
     sampled = pd.read_csv(csv_path)['smiles'].sample(n=self.initial_size)
     return [
         Molecule(Chem.CanonSmiles(entry), pedigree)
         for entry in sampled.tolist()
     ]

Beispiel #12

0

Datei anzeigen

 def __call__(self, molecule_pair):
     """Produce crossover offspring for a pair of molecules.

     The SMILES returned by ``self.merge`` are filtered to those RDKit can
     parse; each survivor is canonicalized and wrapped in a ``Molecule``
     whose pedigree records "crossover" plus both parent SMILES.
     """
     pedigree = ("crossover", molecule_pair[0].smiles,
                 molecule_pair[1].smiles)
     offspring = []
     for candidate in self.merge(molecule_pair):
         # Skip anything MolFromSmiles rejects (falsy result).
         if not Chem.MolFromSmiles(candidate):
             continue
         offspring.append(Molecule(Chem.CanonSmiles(candidate), pedigree))
     return offspring

Beispiel #13

0

Datei anzeigen

Datei: ligand_anchoring.py Projekt: pk-organics/CAT

def substructure_split(ligand: Molecule,
                       idx: Tuple[int, int],
                       split: bool = True) -> Molecule:
    """Return a copy of **ligand** prepared for anchoring at a functional group.

    When ``split=True`` the atom at the last index of **idx** is deleted, the
    fragment that still contains the anchor atom is kept, and the anchor
    atom gets a charge of -1 if it has no (truthy) charge yet.

    Parameters
    ----------
    ligand: |plams.Molecule|_
        The ligand molecule.

    idx : |tuple|_ [|int|_]
        A tuple with 2 atomic indices associated with a functional group.
        They are 0-based here and shifted by one for the 1-based lookup.

    split : bool
        If a functional group should be split from **ligand** (``True``) or not (``False``).

    Returns
    -------
    |plams.Molecule|_
        A copy of **ligand** with updated ``dummies``, ``anchor``, ``charge``,
        ``smiles``, ``name`` and ``path`` properties.

    """
    lig = ligand.copy()
    anchor_atom = lig[idx[0] + 1]
    leaving_atom = lig[idx[-1] + 1]

    if split:
        lig.delete_atom(leaving_atom)
        # Keep the first fragment holding the anchor; fall back to the
        # unsplit copy when no fragment contains it.
        lig = next(
            (fragment for fragment in lig.separate_mod()
             if anchor_atom in fragment),
            lig,
        )

        # Assign a charge to the anchor heteroatom only if it has none yet.
        if not anchor_atom.properties.charge:
            anchor_atom.properties.charge = -1

    # Update ligand properties
    props = lig.properties
    props.dummies = anchor_atom
    props.anchor = anchor_atom.symbol + str(lig.atoms.index(anchor_atom) + 1)
    props.charge = sum(
        atom.properties.get('charge', 0) for atom in lig)

    # Update the ligand smiles string
    raw_smiles = Chem.MolToSmiles(molkit.to_rdmol(lig))
    props.smiles = Chem.CanonSmiles(raw_smiles)
    props.name = santize_smiles(props.smiles) + '@' + props.anchor
    props.path = ligand.properties.path

    return lig

Beispiel #14

0

Datei anzeigen

def get_canonical_smiles(mols):
    """Return the canonical SMILES of the distinct molecules in *mols*.

    Arguments:
        mols: iterable of RDKit molecules.

    Returns:
        list: one canonical SMILES per distinct ``Chem.MolToSmiles`` string,
        in order of first appearance.  Strings that fail to canonicalize are
        printed and left out.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; the former
    # list(set(...)) made the result order vary between runs.
    unique_smiles = list(dict.fromkeys(Chem.MolToSmiles(m) for m in mols))
    canon_smiles = []
    for s in unique_smiles:
        try:
            canon_smiles.append(Chem.CanonSmiles(s))
        except Exception:
            # Best effort: report the offending string and carry on.
            print(s)
    return canon_smiles

Beispiel #15

0

Datei anzeigen

    def test_step(self):
        """Walk the environment through add/remove steps and check each state.

        After every step the environment's SMILES is canonicalized and
        compared with the canonical form of the expected structure.  The
        ``mols``/``legends`` lists are filled along the way but not used
        for any assertion in this method.
        """

        def check_state(expected):
            # Canonicalize the expectation first, then the environment state.
            wanted = Chem.CanonSmiles(expected)
            current = Chem.CanonSmiles(self.env._listToSmiles())
            self.assertEqual(current, wanted)

        def record(drawn_smiles):
            # Store a drawable copy plus a legend taken from the current state.
            mols.append(RWMol(Chem.MolFromSmiles(drawn_smiles)))
            legends.append("3. " + self.env._listToSmiles())

        # initial state (labelled "add-back" in the original)
        check_state("CC")
        mols = [RWMol(Chem.MolFromSmiles("C"))]
        legends = ["1. C"]
        mols.append(RWMol(Chem.MolFromSmiles("CC")))
        legends.append("2. CC")

        # add-front
        self.action.setAction("add", pos="front", mol="C1=CC=CC=C1")
        self.env.step(self.action)
        check_state("CCC1=CC=CC=C1")
        record("CCC1=CC=CC=C1")

        # remove-back
        self.action.setAction("remove", pos="back", mol="C")
        self.env.step(self.action)
        check_state("C1=CC=CC=C1C")
        record("CC1=CC=CC=C1")

        # remove-front
        self.action.setAction("remove", pos="front", mol="C1=CC=CC=C1")
        self.env.step(self.action)
        check_state("C")
        record("C")

        # current molecule must change after another step
        before = self.env.current_molecule
        self.action.setAction("add", pos="front", mol="CC")
        self.env.step(self.action)
        self.assertNotEqual(self.env.current_molecule, before)
        record("CCC")

Beispiel #16

0

Datei anzeigen

def main():
    """Command-line entry point: fragment a single SMILES and print the network.

    Command-line arguments:
        --smiles: the SMILES string to process (required in practice; the
            process exits with status 1 when it is missing or unparsable).
        --id: identifier stored with the molecule (default "smiles1").
        --standardize: run ``standardize`` on the molecule first.
        --isomeric / --non_isomeric: set ``iso_flag`` for ``NodeHolder``
            (default True).
        -v / -vv: verbosity 1 or 2 (default 0); mutually exclusive.

    Prints the nodes, edges and attributes of the resulting network followed
    by the node and edge counts.
    """
    parser = argparse.ArgumentParser(
        description="Convert a SMILES to nodes, edges and attributes"
    )
    parser.add_argument("--smiles")
    parser.add_argument("--id")
    parser.add_argument("--standardize", action="store_true")
    parser.add_argument("--isomeric", dest="iso_flag", action="store_true")
    parser.add_argument("--non_isomeric", dest="iso_flag", action="store_false")

    group = parser.add_mutually_exclusive_group()
    group.add_argument("-v", dest="verbosity", action="store_const", const=1)
    group.add_argument("-vv", dest="verbosity", action="store_const", const=2)

    parser.set_defaults(verbosity=0)
    parser.set_defaults(iso_flag=True)

    args = parser.parse_args()

    # A SMILES string is mandatory.
    if not args.smiles:
        print('ERROR: Must specify a SMILES')
        sys.exit(1)

    print("Original SMILES: " + args.smiles)
    mol = Chem.MolFromSmiles(args.smiles)
    if mol is None:
        # MolFromSmiles returns None for input it cannot parse; stop here
        # with a clear message instead of failing inside MolToSmiles.
        print('ERROR: Could not parse SMILES (%s)' % args.smiles)
        sys.exit(1)
    if args.standardize:
        mol = standardize(mol)
        print("Standardized SMILES: " + Chem.MolToSmiles(mol))
    smiles = Chem.CanonSmiles(Chem.MolToSmiles(mol, isomericSmiles=True))
    print("Canonical SMILES: " + smiles)

    # Local name avoids shadowing the builtin `id`.
    mol_id = args.id if args.id is not None else "smiles1"
    attrs = [Attr(smiles, ["EM", mol_id])]

    # Build the network
    node_holder = NodeHolder(iso_flag=args.iso_flag)
    max_frags = 0
    node_holder = build_network(attrs, node_holder,
                                max_frags, smiles, args.verbosity)

    # Print the data out
    edges = node_holder.get_edges()
    for node in node_holder.node_list:
        print(str(node))

    for edge in edges:
        print(str(edge))

    for attr in attrs:
        print(str(attr))

    print("Number of nodes: " + str(len(node_holder.node_list)))
    print("Number of edges: " + str(len(edges)))

Beispiel #17

0

Datei anzeigen

Datei: test_operations.py Projekt: Jonas-Verhellen/Argenomic

def default_molecules():
    """Build the two fixture molecules used by the tests.

    Each hard-coded SMILES is canonicalized and wrapped in a ``Molecule``
    with the ("database", "no reaction", "no parent") pedigree.
    """
    pedigree = ("database", "no reaction", "no parent")
    fixtures = (
        "Clc1ccc(cc1)C(c2ccccc2)N3CCN(CC3)CCOCC(=O)O",
        "CC1=CC(Cl)=CC(C(=O)N[C@@H]2C[C@@H]3CCCC[C@@H]32)=C1C",
    )
    molecules = []
    for raw in fixtures:
        molecules.append(Molecule(Chem.CanonSmiles(raw), pedigree))
    return molecules

Beispiel #18

0

Datei anzeigen

 def test_examples(self):
     """Run every deprotection that ships an example through ``rd.Deprotect``.

     An example has the form "start>>end"; deprotecting ``start`` with just
     that rule must give the canonical form of ``end``.  At least one
     example must have been tested.
     """
     tested = 0
     for data in rd.GetDeprotections():
         if not data.example:
             continue
         start, end = data.example.split(">>")
         product = rd.Deprotect(Chem.MolFromSmiles(start), [data])
         print("Testing", data.full_name)
         self.assertEqual(Chem.MolToSmiles(product), Chem.CanonSmiles(end))
         tested += 1
     assert tested

Beispiel #19

0

Datei anzeigen

Datei: lm_utils.py Projekt: lantunes/chemgrams

 def sanitize(smi):
     """Return a canonical isomeric SMILES for *smi* after sanitization.

     The SMILES is parsed without sanitization, then sanitized with every
     operation except SANITIZE_FINDRADICALS, written back as isomeric SMILES
     and canonicalized.

     Raises:
         Exception: when RDKit cannot turn *smi* into a molecule.
     """
     mol = Chem.MolFromSmiles(smi, sanitize=False)
     if mol is None:
         raise Exception("could not convert SMILES to RDKit Mol: %s" % smi)

     # Leaving SANITIZE_FINDRADICALS enabled produced [C] in about 10% of
     # generated molecules (per the original author's note).
     all_but_radicals = (Chem.SanitizeFlags.SANITIZE_ALL
                         ^ Chem.SanitizeFlags.SANITIZE_FINDRADICALS)
     Chem.SanitizeMol(mol, sanitizeOps=all_but_radicals)

     # Per the original author's note, MolToSmiles output alone sometimes
     # could not be converted back into a Mol, hence the extra CanonSmiles.
     isomeric = Chem.MolToSmiles(mol, isomericSmiles=True)
     return Chem.CanonSmiles(isomeric)

Beispiel #20

0

Datei anzeigen

def main(args):
    """Canonicalize a file of SMILES, one per line.

    Arguments:
        args: object with ``input_file`` and ``output_file`` attributes.
            Each stripped line of ``input_file`` is canonicalized with
            ``Chem.CanonSmiles`` and written to ``output_file``; lines that
            fail to canonicalize are skipped.
    """
    with open(args.input_file, 'r') as f, open(args.output_file, 'w') as w:
        for line in tqdm(f):
            line = line.strip()
            try:
                canonical = Chem.CanonSmiles(line)
            except Exception:
                # Skip unparsable lines.  `except Exception` (not bare) so a
                # Ctrl-C during this long loop is no longer swallowed.
                continue
            # Written outside the try so I/O errors are not silently dropped.
            w.write("{}\n".format(canonical))

Beispiel #21

0

Datei anzeigen

Datei: misc_functions.py Projekt: nelse003/pipeline

def create_sd_file(name, smiles, save_directory):
    """
    Create a 2D sdf file in the proasis project directory for successfully detected ligands

    Arguments:
        name: label used only in the progress message.
        smiles: SMILES string of the ligand.
        save_directory: passed to ``Chem.SDWriter`` as the output target.
    """
    # create sdf file for ligand and save to hit directory
    canon_smiles = Chem.CanonSmiles(smiles)
    mol = Chem.MolFromSmiles(canon_smiles)
    AllChem.Compute2DCoords(mol)
    print('Generating sdf file and saving to ' + name + ' directory...\n')
    sd_file = Chem.SDWriter(save_directory)
    try:
        sd_file.write(mol)
    finally:
        # Close explicitly so the record is flushed and the handle released
        # instead of waiting for garbage collection.
        sd_file.close()

Beispiel #22

0

Datei anzeigen

def tanimoto_dist(x_test, autoencoded_selfies, dim, selfies_alphabet):
    '''
    Compute, for every test sample, the fingerprint similarity between the
    molecule decoded from the autoencoder output and the molecule decoded
    from the original encoding.

    Arguments:
        x_test: sequence of original encodings; each is reshaped to
            ``(dim[0], dim[1])``.
        autoencoded_selfies: autoencoder outputs, indexed like ``x_test``.
        dim: pair giving the target shape for one encoding.
        selfies_alphabet: alphabet handed to ``sf.encoding_to_selfies``.

    Returns:
        numpy array with one ``DataStructs.FingerprintSimilarity`` value per
        sample (RDKit fingerprints of both molecules).
        NOTE(review): despite the function name these are similarity values
        as returned by that call, not distances -- confirm against callers.
    '''
    rows, cols = dim[0], dim[1]

    def _canonical_smiles(encoding):
        # Harden each row to a one-hot vector at its argmax, decode the
        # SELFIES, and canonicalize the resulting SMILES.
        matrix = encoding.reshape(rows, cols)
        one_hot = np.zeros((rows, cols))
        one_hot[np.arange(rows), np.argmax(matrix, axis=1)] = 1
        selfies = sf.encoding_to_selfies(one_hot.tolist(), selfies_alphabet,
                                         "one_hot")
        return Chem.CanonSmiles(sf.decoder(selfies))

    dist = []
    for i, mol in enumerate(x_test):
        autoencoded_smiles = _canonical_smiles(autoencoded_selfies[i])
        true_smiles = _canonical_smiles(mol)

        fps1 = Chem.RDKFingerprint(Chem.MolFromSmiles(autoencoded_smiles))
        fps2 = Chem.RDKFingerprint(Chem.MolFromSmiles(true_smiles))
        dist.append(float(DataStructs.FingerprintSimilarity(fps1, fps2)))

    return np.array(dist)

Beispiel #23

0

Datei anzeigen

def corpus(input: str, out: str, *, vocab_path: str):
    """Constructing the molecular corpus by splitting each SMILES into
    a range of tokens contained in vocabulary.

    Arguments:
        input : the path of tab-delimited data file that contains CANONICAL_SMILES.
        out : path prefix for the two output files: ``out + '_voc.txt'``
            (all tokens, sorted, one per line) and ``out + '_corpus.txt'``
            (table with CANONICAL_SMILES and the whitespace-delimited token
            sentence SENT).
        vocab_path : path handed to ``Voc`` to build the tokenizer.
    """
    df = pd.read_table(input).CANONICAL_SMILES
    voc = Voc(vocab_path)
    canons = []
    tokens = []
    smiles = set()
    it = tqdm(df, desc='Reading SMILES')
    for smile in it:
        # Apply the module-level SUB_RE substitution (isotope stripping per
        # the original comment).
        smile = SUB_RE.sub('[', smile)
        # keep only the longest fragment when the molecule has several,
        # which are separated by '.' (first one wins on ties).
        if '.' in smile:
            smile = max(smile.split('.'), key=len)
        # fewer than two carbon characters: not drug-like, skip
        if smile.count('C') + smile.count('c') < 2:
            continue
        if smile in smiles:
            it.write('duplicate: {}'.format(smile))
        smiles.add(smile)
    # collecting all of the tokens in the sentences for vocabulary construction.
    words = set()
    it = tqdm(smiles, desc='Collecting tokens')
    for smile in it:
        try:
            token = voc.tokenize(smile)
            if len(token) <= 100:
                words.update(token)
                canons.append(Chem.CanonSmiles(smile, 0))
                tokens.append(' '.join(token))
        except Exception as e:
            it.write('{} {}'.format(e, smile))
    # persisting the vocabulary on the hard drive.
    with open(out + '_voc.txt', 'w') as file:
        file.write('\n'.join(sorted(words)))

    # saving the canonical smiles and token sentences as a table into hard drive.
    log = pd.DataFrame({'CANONICAL_SMILES': canons, 'SENT': tokens})
    # drop_duplicates returns a new frame; the original discarded it, so
    # distinct inputs sharing a canonical form were written more than once.
    log = log.drop_duplicates(subset='CANONICAL_SMILES')
    log.to_csv(out + '_corpus.txt', sep='\t', index=None)

Beispiel #24

0

Datei anzeigen

def fragment_molecule_on_explicit_hydrogens(smiles):
    """Create one fragmentation per hydrogen cut matched in *smiles*.

    The SMILES is parsed with its explicit hydrogens preserved, every match
    of ``_hydrogen_cut_pat`` is cut with ``Chem.FragmentOnBonds``, and the
    part that is not the "[*][H]" piece is stored in a ``Fragmentation``.

    Returns:
        list: the ``Fragmentation`` records, one per matched cut.

    Raises:
        AssertionError: when a cut does not yield exactly the "[*][H]" piece
        plus one other '.'-separated part.
    """
    hydrogen_piece = "[*][H]"  # Hard-coded
    num_heavies = get_num_heavies_from_smiles(smiles)
    # Result is unused in this block; the call is kept so that any exception
    # it raises for this input still propagates.
    Chem.CanonSmiles(smiles)

    # sanitize=False preserves the explicit hydrogens during parsing.
    input_mol = Chem.MolFromSmiles(smiles, sanitize=False)
    Chem.SanitizeMol(input_mol, Chem.SANITIZE_ALL)

    fragmentations = []
    for atom_pair in input_mol.GetSubstructMatches(_hydrogen_cut_pat):
        bond_idx = input_mol.GetBondBetweenAtoms(*atom_pair).GetIdx()
        pieces = Chem.FragmentOnBonds(input_mol, [bond_idx],
                                      dummyLabels=[(0, 0)])
        new_smiles = Chem.MolToSmiles(pieces, isomericSmiles=True)

        left, mid, right = new_smiles.partition(".")
        assert mid == ".", new_smiles

        if hydrogen_piece not in (left, right):
            raise AssertionError("did not split hydrogen correctly: %r %r" %
                                 (smiles, new_smiles))
        cut_smiles = right if left == hydrogen_piece else left

        if "[H]" in cut_smiles:
            # Other explicit [H] atoms remain after cutting on one of them;
            # recanonicalize to remove them.
            cut_smiles = Chem.CanonSmiles(cut_smiles)

        fragmentations.append(
            Fragmentation(1, EnumerationLabel.NO_ENUMERATION,
                          0, "1", "[*][H]", "0", num_heavies,
                          "1", cut_smiles, None))

    return fragmentations

Beispiel #25

0

Datei anzeigen

Datei: dataset.py Projekt: XuhanLiu/DrugEx

def corpus(input, output, suffix='sdf'):
    """Standardize molecules and write a SMILES token vocabulary and corpus.

    Arguments:
        input (str): path of the molecule file.  With ``suffix == 'sdf'`` it
            is opened with ``gzip.open`` and read through
            ``Chem.ForwardSDMolSupplier``; otherwise it is read with
            ``pd.read_table`` and must provide a 'Smiles' column.
        output (str): path prefix; ``output + '_voc.txt'`` receives the
            sorted tokens and ``output + '_corpus.txt'`` the table with the
            columns 'Smiles' and 'Token'.
        suffix (str): 'sdf' selects the SD-file branch; any other value
            selects the table branch.

    Molecules that fail standardization are reported with 'Parsing Error:'
    and skipped.  A SMILES is kept only when its token list (including
    'EOS') contains 'C' or 'c', contains neither '[Na]' nor '[Zn]', and has
    more than 10 and at most 100 tokens.
    """
    inf = None
    if suffix == 'sdf':
        inf = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(inf)
    else:
        df = pd.read_table(input).Smiles.dropna()
        mols = [Chem.MolFromSmiles(s) for s in df]

    smiles = set()
    try:
        voc = Voc('data/voc_smiles.txt')
        charger = rdMolStandardize.Uncharger()
        chooser = rdMolStandardize.LargestFragmentChooser()
        disconnector = rdMolStandardize.MetalDisconnector()
        normalizer = rdMolStandardize.Normalizer()
        for mol in tqdm(mols):
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = charger.uncharge(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                smileR = Chem.MolToSmiles(mol, 0)
                smiles.add(Chem.CanonSmiles(smileR))
            except Exception:
                # Best effort: report and skip; `except Exception` keeps
                # Ctrl-C working during the long loop.
                print('Parsing Error:')
    finally:
        # The SD supplier reads lazily from the gzip handle, so it can only
        # be closed once iteration is over.
        if inf is not None:
            inf.close()

    words = set()
    canons = []
    tokens = []
    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    log = pd.DataFrame({'Smiles': canons, 'Token': tokens})
    # drop_duplicates returns a new frame; the original discarded the result.
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)

Beispiel #26

0

Datei anzeigen

def compare_equality(x_test, autoencoded_selfies, dim, selfies_alphabet):
    '''
    Print the fraction of test samples whose autoencoder output decodes to
    the same canonical SMILES as the original encoding.

    Each encoding is reshaped to (dim[0], dim[1]), hardened to a one-hot
    matrix at the per-row argmax, converted to SELFIES, decoded and
    canonicalized.  For the sample at index 1 both SMILES are printed.
    Nothing is returned.
    '''
    n_rows, n_cols = dim[0], dim[1]
    test_size = len(x_test)

    def _to_canonical(encoding):
        # One-hot at the per-row argmax, then SELFIES -> canonical SMILES.
        grid = encoding.reshape(n_rows, n_cols)
        hardened = np.zeros((n_rows, n_cols))
        hardened[np.arange(n_rows), np.argmax(grid, axis=1)] = 1
        as_selfies = sf.encoding_to_selfies(hardened.tolist(),
                                            selfies_alphabet, "one_hot")
        return Chem.CanonSmiles(sf.decoder(as_selfies))

    matches = 0
    for position, original in enumerate(x_test):
        true_smiles = _to_canonical(original)
        decoded_smiles = _to_canonical(autoencoded_selfies[position])

        if position == 1:
            print("Autoencoded Smiles: " + decoded_smiles)
            print("True Smiles: " + true_smiles)

        if decoded_smiles == true_smiles:
            matches += 1

    print("Percent Reconstructed Molescules: " + str(matches / test_size))

Beispiel #27

0

Datei anzeigen

Datei: test_network.py Projekt: xchem/fragalysis

def parse_node(input_str):
    """
    Build a Node from a whitespace-separated line such as:
    NODE O=CCCc1ccc(cc1)c2ccccc2 16 12 OCCCC1CCC(CC1)C2CCCCC2 0

    Field 1 is canonicalized into ``SMILES``; fields 2, 3 and 4 are stored
    unchanged (as strings) in ``HAC``, ``RAC`` and ``RING_SMILES``.
    :param input_str: the line to parse
    :return: the populated Node
    """
    fields = input_str.split()
    raw_smiles = fields[1]
    node = Node()
    node.SMILES = Chem.CanonSmiles(raw_smiles)
    node.HAC = fields[2]
    node.RAC = fields[3]
    node.RING_SMILES = fields[4]
    return node

Beispiel #28

0

Datei anzeigen

def clean_mol(smile, is_deep=True):
    """Normalize a SMILES string and return its canonical form.

    The bracketed atoms '[O]', '[C]', '[N]', '[B]' are rewritten without
    brackets and '[2H]'/'[3H]' become '[H]'.  The result is parsed, passed
    through ``rdMolStandardize.ChargeParent`` when *is_deep* is true, written
    with ``Chem.MolToSmiles(mol, 0)`` and canonicalized.

    Returns:
        The canonical SMILES, or ``None`` (after printing 'Parsing Error:')
        when any RDKit step raised.
    """
    substitutions = (
        ('[O]', 'O'), ('[C]', 'C'), ('[N]', 'N'), ('[B]', 'B'),
        ('[2H]', '[H]'), ('[3H]', '[H]'),
    )
    for old, new in substitutions:
        smile = smile.replace(old, new)

    try:
        mol = Chem.MolFromSmiles(smile)
        if is_deep:
            mol = rdMolStandardize.ChargeParent(mol)
        smile = Chem.CanonSmiles(Chem.MolToSmiles(mol, 0))
    except:
        print('Parsing Error:', smile)
        smile = None
    return smile

Beispiel #29

0

Datei anzeigen

def replace_wildcard_with_H(smiles):
    """Replace the single "[*]" in *smiles* with "[H]" and canonicalize.

    Results are memoized in the module-level ``_H_cache``; the cache is
    emptied once it holds more than 10000 entries.  Per the original
    author's measurements the cache gives about a 2% overall speed-up at
    roughly a 50% hit rate.

    Raises:
        AssertionError: when *smiles* does not contain exactly one "[*]".
    """
    if smiles in _H_cache:
        return _H_cache[smiles]

    assert smiles.count("[*]") == 1, smiles
    new_smiles = Chem.CanonSmiles(smiles.replace("[*]", "[H]"))

    # Crude size cap: drop everything rather than evicting single entries.
    if len(_H_cache) > 10000:
        _H_cache.clear()
    _H_cache[smiles] = new_smiles
    return new_smiles

Beispiel #30

0

Datei anzeigen

Datei: views.py Projekt: jidushanbojue/yatcm

    def post(self, request):
        """Handle an AJAX structure search and return the rendered first page.

        Reads ``is_sub``, ``tanimoto`` (default 0.8) and ``smiles`` from the
        POST data.  The SMILES is canonicalized; a truthy ``is_sub`` runs
        ``self._substructure_search``, otherwise
        ``self._similarity_search`` is run with the ``tanimoto`` value.  The
        results are paginated 10 per page and page 1 is rendered with
        "result/compounds_result.html".

        Nothing is returned explicitly in the visible code when the request
        is not AJAX or when the canonical SMILES is empty.
        """
        if request.is_ajax():
            data = request.POST
            # NOTE(review): POST values are presumably strings, in which case
            # any non-empty "is_sub" (e.g. "false") is truthy -- confirm
            # against the client.
            is_sub = data.get("is_sub")
            tanimoto = float(data.get("tanimoto", 0.8))
            smiles = data.get("smiles")
            try:
                smiles = Chem.CanonSmiles(smiles)
            except:
                # NOTE(review): this returns a plain str while the other
                # branches return a StreamingHttpResponse -- confirm the
                # framework accepts that.
                return "Sorry, the structure is not appropriate!"
            if is_sub and smiles:
                compounds = self._substructure_search(smiles)
                paginator = Paginator(compounds, 10)

                ## My paginator code ###
                # Always request page 1; both handlers fall back to a valid page.
                try:
                    paginator_compounds = paginator.page(1)
                except PageNotAnInteger:
                    paginator_compounds = paginator.page(1)
                except EmptyPage:
                    paginator_compounds = paginator.page(paginator.num_pages)

                respone = render(
                    request, template_name="result/compounds_result.html",
                    context={
                        "compounds": paginator_compounds,
                             }
                )
                # Only the rendered content is forwarded to the client.
                return StreamingHttpResponse(respone.content)

            elif not is_sub and smiles:
                compounds = self._similarity_search(smiles, tanimoto=tanimoto)
                paginator = Paginator(compounds, 10)

                # Same pagination handling as in the substructure branch.
                try:
                    paginator_compounds = paginator.page(1)
                except PageNotAnInteger:
                    paginator_compounds = paginator.page(1)
                except EmptyPage:
                    paginator_compounds = paginator.page(paginator.num_pages)

                response = render(
                    request,
                    template_name="result/compounds_result.html",
                    context={
                        "compounds": paginator_compounds,
                    }
                )
                return StreamingHttpResponse(response.content)