Beispiel #1
0
def is_same_mol(smiles1, smiles2):
    """Report whether two SMILES strings canonicalise to the same text.

    Both inputs are passed through ``Chem.CanonSmiles`` and the results
    are compared.  Any exception raised while canonicalising either
    input is swallowed and reported as ``False``.
    """
    try:
        canonical_pair = (Chem.CanonSmiles(smiles1),
                          Chem.CanonSmiles(smiles2))
    except Exception:
        return False
    return canonical_pair[0] == canonical_pair[1]
Beispiel #2
0
 def test_queryStep(self):
     """Seed the environment, append a ring, remove it with a query step.

     After a successful 'remove' query for 'Arom6' the environment's
     SMILES must canonicalise to the same text as the original seed.
     """
     seed_smiles = 'C(=O)O'
     self.env.seed(seed_smiles)

     # Append the six-membered ring to the back of the seed.
     self.action.setAction("add", pos="back", mol='C1=CC=CC=C1')
     self.env.step(self.action)

     # The query step has to report success.
     self.assertTrue(
         self.env._queryStep("remove", query=np.array(['Arom6'])))

     # Compare canonical forms of the environment state and the seed.
     remaining = Chem.CanonSmiles(self.env._listToSmiles())
     expected = Chem.CanonSmiles(seed_smiles)
     self.assertEqual(expected, remaining)
Beispiel #3
0
 def testDeprotect(self):
   """Deprotect a molecule carrying two Boc groups and check the result.

   Verifies the product SMILES as well as the "DEPROTECTIONS" and
   "DEPROTECTION_COUNT" properties recorded on the returned molecule.
   """
   protected = Chem.MolFromSmiles(
       "N(C(=O)OC(C)(C)C)Cc1ccccc1NC(=O)OC(C)(C)C")
   product = rd.Deprotect(protected)

   self.assertEqual(Chem.MolToSmiles(product),
                    Chem.CanonSmiles("NCc1ccccc1N"))

   # Both removals should be recorded on the product.
   recorded = product.GetPropsAsDict()
   self.assertEqual(list(recorded["DEPROTECTIONS"]), ["Boc", "Boc"])
   self.assertEqual(recorded["DEPROTECTION_COUNT"], 2)
Beispiel #4
0
def main():
    """Build a fragment network from a SMILES/SD file and write it out.

    Command-line arguments:
        --input         path handed to ``parse_mols`` (required).
        --input_format  format handed to ``parse_mols`` (default "smi").
        --base_dir      output directory; created if missing (required).
        --iso_flag      boolean forwarded to ``NodeHolder`` (default True).
    """

    def _parse_bool(text):
        # Without an explicit converter argparse keeps the raw string, so
        # "--iso_flag False" would be truthy.
        lowered = text.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y"):
            return True
        if lowered in ("0", "false", "f", "no", "n"):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got %r" % text)

    # Read in a SD or SMILES file - then write out into a specified directory
    parser = argparse.ArgumentParser(
        description=
        "Convert a SMILES or SDFile to input for Astex Fragment network.")
    # Both paths are needed further down; fail with a usage message instead
    # of a TypeError from os.path.isdir(None).
    parser.add_argument("--input", required=True)
    parser.add_argument("--input_format", default="smi")
    parser.add_argument("--base_dir", required=True)
    parser.add_argument("--iso_flag", default=True, type=_parse_bool)

    args = parser.parse_args()

    attrs = []
    for x in tqdm(parse_mols(args.input, args.input_format)):
        if x is None:
            # Entries the parser could not turn into a molecule are skipped.
            continue
        attrs.append(
            Attr(
                Chem.CanonSmiles(Chem.MolToSmiles(x, isomericSmiles=True)),
                ["EM", x.GetProp("_Name")],
            ))

    # makedirs also copes with nested output paths.
    os.makedirs(args.base_dir, exist_ok=True)

    # Build the network
    node_holder = NodeHolder(iso_flag=args.iso_flag)
    node_holder = build_network(attrs, node_holder)
    # Write the data out
    write_data(args.base_dir, node_holder, attrs)
Beispiel #5
0
def A2AR(input, out):
    """Construct the A2AR set used for fine-tuned model and predictor training.

    Arguments:
        input (str): path of a tab-delimited data file that contains the
            columns CMPD_CHEMBLID, CANONICAL_SMILES, PCHEMBL_VALUE and
            ACTIVITY_COMMENT.
        out (str): path where the refined, tab-delimited table is saved.

    Rows with missing values are dropped.  For every remaining row the
    SMILES has ``[NH+]``/``[NH2+]``/``[NH3+]`` replaced by ``N``, isotope
    numbers removed, only the longest ``.``-separated fragment kept, and is
    then canonicalised with ``Chem.CanonSmiles(smile, 0)``.  Rows whose
    processed SMILES contains ``[Au]``, ``[As]``, ``[Hg]`` or ``[Se]``, or
    fewer than two ``C``/``c`` characters, are removed.
    """
    df = pd.read_table(input)
    df = df[[
        'CMPD_CHEMBLID', 'CANONICAL_SMILES', 'PCHEMBL_VALUE',
        'ACTIVITY_COMMENT'
    ]]
    df = df.dropna()

    # Raw string: '\[' and '\d' are invalid escapes in a normal literal.
    isotope_prefix = re.compile(r'\[\d+')
    excluded_atoms = ('[Au]', '[As]', '[Hg]', '[Se]')

    canonical = []
    rejected = []
    for label, raw in df['CANONICAL_SMILES'].items():
        # replacing the charged nitrogen groups with a plain nitrogen atom "N"
        smile = raw.replace('[NH+]', 'N').replace('[NH2+]', 'N').replace(
            '[NH3+]', 'N')
        # removing the isotope label of each atom
        smile = isotope_prefix.sub('[', smile)
        # keeping the largest fragment (first one on ties)
        if '.' in smile:
            smile = max(smile.split('.'), key=len)
        # canonical SMILES based on the RDKit built-in algorithm
        canonical.append(Chem.CanonSmiles(smile, 0))
        # The filter looks at the pre-canonical text, as before.
        # NOTE(review): counting 'C' also counts 'Cl' - confirm intended.
        too_few_carbons = smile.count('C') + smile.count('c') < 2
        if too_few_carbons or any(tok in smile for tok in excluded_atoms):
            rejected.append(label)

    # Write the column once and drop once, instead of re-binding the frame
    # for every rejected row inside the loop.
    df = df.assign(CANONICAL_SMILES=canonical)
    df = df.drop(index=rejected)
    # df = df.drop_duplicates(subset='CANONICAL_SMILES')
    df.to_csv(out, index=False, sep='\t')
Beispiel #6
0
def smiles(dir="../data/xyz/DB3/", verbose=1):
    """Collect one SMILES string per entry of a directory.

    Arguments:
        dir (str): directory to scan.  File paths are built as
            ``dir + name``, so it must end with a path separator.
        verbose (int): when 1, a progress counter is written to stdout.

    Returns:
        tuple: ``(names, ret_list)`` - the names that yielded a SMILES and
        the SMILES themselves, in matching order.

    Each entry is first read as an xyz file through ``pybel`` and
    canonicalised with ``Chem.CanonSmiles``.  If that fails for any
    reason, the first line of the file is used as the SMILES, provided it
    is longer than five characters after stripping whitespace.
    """
    # NOTE(review): the listing goes through a shell string, so paths with
    # spaces or shell metacharacters will not work. Kept because the
    # ``sort -d`` order may matter to callers.
    listing = os.popen("ls " + str(dir) + " | sort -d ").read()
    entries = str(listing).split()
    total = len(entries)

    ret_list = []
    names = []

    def report(position):
        if verbose == 1:
            sys.stdout.write("\r %s / " % position + str(total))
            sys.stdout.flush()

    for j, entry in enumerate(entries):
        path = dir + entry
        try:
            mol = next(pybel.readfile("xyz", path))
            smi = Chem.CanonSmiles(mol.write(format="smi"))
            ret_list.append(smi.split()[0].strip())
            names.append(entry)
            report(j)
        except Exception:
            # Fallback: treat the file as plain text with a SMILES on line 1.
            try:
                with open(path, "r") as handle:
                    first_line = handle.readline().strip()
            except (OSError, UnicodeError):
                # Unreadable entry (directory, permissions, binary): skip it.
                continue
            # The original tested ``len(smi > 5)``, which always raised a
            # TypeError, so this branch could never append anything.
            if len(first_line) > 5:
                ret_list.append(first_line)
                names.append(entry)
            report(j)
    return names, ret_list
Beispiel #7
0
def main():
    """Build a fragment network from a SMILES/SD file and write it out.

    Command-line arguments:
        --input                    existing input file (required).
        --input_format             format handed to ``parse_mols``
                                   (default "smi").
        --base_dir                 existing output directory (required).
        --isomeric/--non_isomeric  set ``iso_flag`` (default True).
        -v / -vv                   verbosity 1 or 2 (default 0); any
                                   non-zero verbosity disables the tqdm bar.

    Exits with status 1 after printing an error if the input file or the
    base directory is missing.
    """
    # Read in a SD or SMILES file - then write out into a specified directory
    parser = argparse.ArgumentParser(
        description=
        "Convert a SMILES or SDFile to input for Astex Fragment network.")
    parser.add_argument("--input")
    parser.add_argument("--input_format", default="smi")
    parser.add_argument("--base_dir")
    parser.add_argument("--isomeric", dest="iso_flag", action="store_true")
    parser.add_argument("--non_isomeric",
                        dest="iso_flag",
                        action="store_false")

    group = parser.add_mutually_exclusive_group()
    group.add_argument("-v", dest="verbosity", action="store_const", const=1)
    group.add_argument("-vv", dest="verbosity", action="store_const", const=2)

    parser.set_defaults(verbosity=0)
    parser.set_defaults(iso_flag=True)

    args = parser.parse_args()

    # Do we have an input and base directory?
    if not args.input:
        print('ERROR: Must specify an input')
        sys.exit(1)
    if not os.path.isfile(args.input):
        print('ERROR: input (%s) does not exist' % args.input)
        sys.exit(1)
    if not args.base_dir:
        print('ERROR: Must specify a base directory')
        sys.exit(1)
    if not os.path.isdir(args.base_dir):
        print('ERROR:input base directory (%s) does not exist' % args.base_dir)
        sys.exit(1)

    tqdm_disable = bool(args.verbosity)
    attrs = []
    mols = parse_mols(args.input, args.input_format)
    for x in tqdm(mols, disable=tqdm_disable):
        # The None guard has to come before anything touches the molecule;
        # previously MolToSmiles(None) was called for the progress message.
        if x is None:
            continue
        iso_smiles = Chem.MolToSmiles(x, isomericSmiles=True)
        print("Processing " + iso_smiles)
        attrs.append(
            Attr(
                Chem.CanonSmiles(iso_smiles),
                ["EM", x.GetProp("_Name")],
            ))

    # The base directory is known to exist at this point (validated above),
    # so no directory creation is needed.

    # Build the network
    node_holder = NodeHolder(iso_flag=args.iso_flag)
    max_frags = 0
    node_holder = build_network(attrs, node_holder, max_frags, args.base_dir,
                                args.verbosity)
    # Write the data out
    write_data(args.base_dir, node_holder, attrs)
Beispiel #8
0
def graph_corpus(input, output, suffix='sdf'):
    """Encode molecules with the graph vocabulary and save the codes.

    Arguments:
        input (str): a gzip-compressed SD file when ``suffix == 'sdf'``,
            otherwise a tab-delimited table with the columns ``Smiles``
            and ``Molecule ChEMBL ID``.
        output (str): path of the tab-delimited table of codes, indexed by
            molecule identifier.
        suffix (str): selects the input reader (default ``'sdf'``).

    Each molecule is passed through metal disconnection, normalisation and
    largest-fragment selection.  Molecules with fewer than 4 or at least 63
    bonds, without a carbon atom, or containing one of the listed metals
    are skipped.  A molecule is kept only when encoding and decoding it
    with the vocabulary reproduces its canonical SMILES.
    """
    metals = {'Na', 'Zn', 'Li', 'K', 'Ca', 'Mg', 'Ag', 'Cs', 'Ra', 'Rb', 'Al', 'Sr', 'Ba', 'Bi'}
    voc = utils.VocGraph('data/voc_atom.txt')

    # Only the SD branch reads through gzip; the handle has to stay open
    # while the forward supplier is iterated and is closed afterwards.
    inf = None
    if suffix == 'sdf':
        inf = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(inf)
        total = 2e6
    else:
        mols = pd.read_table(input).drop_duplicates(subset=['Smiles']).dropna(subset=['Smiles'])
        total = len(mols)
        mols = mols.iterrows()
    vals = {}
    exps = {}
    codes, ids = [], []
    chooser = rdMolStandardize.LargestFragmentChooser()
    disconnector = rdMolStandardize.MetalDisconnector()
    normalizer = rdMolStandardize.Normalizer()
    try:
        for mol in tqdm(mols, total=total):
            if mol is None:
                continue
            if suffix != 'sdf':
                idx = mol[1]['Molecule ChEMBL ID']
                mol = Chem.MolFromSmiles(mol[1].Smiles)
                if mol is None:
                    # Unparsable SMILES: report it instead of crashing on
                    # mol.GetAtoms() further down.
                    print('Parse Error:', idx)
                    continue
            else:
                idx = mol.GetPropsAsDict()
                idx = idx['chembl_id']
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
            except Exception:
                # Best effort: carry on with whatever stage was reached.
                print(idx)
            symb = [a.GetSymbol() for a in mol.GetAtoms()]
            # Size filter on the number of bonds
            bonds = mol.GetBonds()
            if len(bonds) < 4 or len(bonds) >= 63:
                continue
            if 'C' not in symb:
                continue
            if not metals.isdisjoint(symb):
                continue

            smile = Chem.MolToSmiles(mol)
            try:
                s0 = smile.replace('[O]', 'O').replace('[C]', 'C') \
                     .replace('[N]', 'N').replace('[B]', 'B') \
                     .replace('[2H]', '[H]').replace('[3H]', '[H]')
                s0 = Chem.CanonSmiles(s0, 0)
                code = voc.encode([smile])
                s1 = voc.decode(code)[0]
                # Round-trip check; a mismatch is reported as a parse error.
                assert s0 == s1
                codes.append(code[0].reshape(-1).tolist())
                ids.append(idx)
            except Exception as ex:
                print(ex)
                print('Parse Error:', idx)
    finally:
        if inf is not None:
            inf.close()
    df = pd.DataFrame(codes, index=ids, columns=['C%d' % i for i in range(64*4)])
    df.to_csv(output, sep='\t', index=True)
    # NOTE(review): vals and exps are never filled; the prints are kept so
    # that stdout stays the same.
    print(vals)
    print(exps)
Beispiel #9
0
def make_canon_smiles(smiles):
    """Canonicalise every SMILES in an iterable.

    Arguments:
        smiles: iterable of SMILES strings.

    Returns:
        list of canonical SMILES in input order, or ``None`` as soon as one
        entry cannot be canonicalised (the offending entry is printed).
    """
    canon_smiles = []
    for mol in smiles:
        try:
            canon_smiles.append(Chem.CanonSmiles(mol))
        except Exception:
            # Narrowed from a bare except so Ctrl-C / SystemExit still work;
            # the None result is now explicit rather than print()'s value.
            print('Invalid SMILE:', mol)
            return None
    return canon_smiles
Beispiel #10
0
 def __call__(self, molecule):
     """Score a molecule against the target fingerprint.

     The molecule's SMILES is canonicalised and parsed, fingerprinted with
     ``self.get_fingerprint`` using ``self.fingerprint_type``, and compared
     to ``self.target_fingerprint`` with ``TanimotoSimilarity``.  The
     result is stored on ``molecule.fitness``.

     Returns:
         The same ``molecule`` object, with ``fitness`` updated.  The
         previous ``-> None`` annotation contradicted this and was removed.
     """
     molecular_graph = Chem.MolFromSmiles(Chem.CanonSmiles(molecule.smiles))
     molecule_fingerprint = self.get_fingerprint(molecular_graph,
                                                 self.fingerprint_type)
     molecule.fitness = TanimotoSimilarity(self.target_fingerprint,
                                           molecule_fingerprint)
     return molecule
Beispiel #11
0
 def load_from_database(self) -> List[Molecule]:
     """Sample the initial population from the CSV at ``self.data_file``.

     ``self.initial_size`` rows are drawn from the ``smiles`` column; each
     SMILES is canonicalised and wrapped in a ``Molecule`` carrying the
     fixed database pedigree.
     """
     csv_path = hydra.utils.to_absolute_path(self.data_file)
     sampled = pd.read_csv(csv_path)['smiles'].sample(n=self.initial_size)
     pedigree = ("database", "no reaction", "no parent")

     molecules = []
     for smiles in sampled.tolist():
         molecules.append(Molecule(Chem.CanonSmiles(smiles), pedigree))
     return molecules
Beispiel #12
0
 def __call__(self, molecule_pair):
     """Merge a pair of molecules and return the parsable offspring.

     Every SMILES produced by ``self.merge`` that ``Chem.MolFromSmiles``
     can parse is canonicalised and wrapped in a ``Molecule`` whose
     pedigree records the two parent SMILES.
     """
     pedigree = (
         "crossover",
         molecule_pair[0].smiles,
         molecule_pair[1].smiles,
     )

     offspring = []
     for smiles in self.merge(molecule_pair):
         if not Chem.MolFromSmiles(smiles):
             # Unparsable merge products are dropped.
             continue
         offspring.append(Molecule(Chem.CanonSmiles(smiles), pedigree))
     return offspring
Beispiel #13
0
def substructure_split(ligand: Molecule,
                       idx: Tuple[int, int],
                       split: bool = True) -> Molecule:
    """Return a copy of **ligand** prepared around one functional group.

    With ``split=True`` the atom at the last index of **idx** is deleted,
    the fragment still containing the atom at the first index is kept, and
    that atom gets a charge of -1 unless it already has a truthy charge.

    Parameters
    ----------
    ligand: |plams.Molecule|_
        The ligand molecule; it is copied, not modified.

    idx : |tuple|_ [|int|_]
        Two atomic indices of a functional group.  Each is offset by one
        when used to index the molecule.

    split : bool
        Whether part of the functional group is removed (``True``) or the
        copy is only annotated (``False``).

    Returns
    -------
    |plams.Molecule|_
        The processed copy, with its ``dummies``, ``anchor``, ``charge``,
        ``smiles``, ``name`` and ``path`` properties updated.

    """
    lig = ligand.copy()
    at1 = lig[idx[0] + 1]
    at2 = lig[idx[-1] + 1]

    if split:
        lig.delete_atom(at2)

        # Continue with the fragment that still holds the anchor atom.
        for fragment in lig.separate_mod():
            if at1 in fragment:
                lig = fragment
                break

        # Assign a charge to the anchor atom only when it has none yet.
        if not at1.properties.charge:
            at1.properties.charge = -1

    # Refresh the bookkeeping properties of the (possibly new) ligand.
    props = lig.properties
    props.dummies = at1
    anchor_position = lig.atoms.index(at1) + 1
    props.anchor = at1.symbol + str(anchor_position)
    props.charge = sum(
        atom.properties.get('charge', 0) for atom in lig)

    # Regenerate the SMILES and derive the name from it.
    raw_smiles = Chem.MolToSmiles(molkit.to_rdmol(lig))
    props.smiles = Chem.CanonSmiles(raw_smiles)
    props.name = santize_smiles(props.smiles) + '@' + props.anchor
    props.path = ligand.properties.path

    return lig
Beispiel #14
0
def get_canonical_smiles(mols):
    """Return canonical SMILES for the distinct molecules in ``mols``.

    Arguments:
        mols: iterable of RDKit molecules.

    Returns:
        list of canonical SMILES, one per distinct ``Chem.MolToSmiles``
        string.  Strings that cannot be canonicalised are printed and left
        out.
    """
    unique_smiles = set(Chem.MolToSmiles(m) for m in mols)
    canon_smiles = []
    for s in unique_smiles:
        try:
            canon_smiles.append(Chem.CanonSmiles(s))
        except Exception:
            # Narrowed from a bare except so Ctrl-C / SystemExit still work.
            print(s)
    return canon_smiles
Beispiel #15
0
    def test_step(self):
        """Drive the environment through add/remove steps and check each state.

        After every step the environment's SMILES is compared with the
        expected structure in canonical form.  The final step only checks
        that ``current_molecule`` changed.
        """
        mols = []
        legends = []

        def check_env(expected):
            # Canonicalise the expectation first, then the environment state.
            wanted = Chem.CanonSmiles(expected)
            found = Chem.CanonSmiles(self.env._listToSmiles())
            self.assertEqual(found, wanted)

        def take_step(kind, where, fragment):
            self.action.setAction(kind, pos=where, mol=fragment)
            self.env.step(self.action)

        def snapshot(drawn_smiles):
            # Record a molecule and a legend built from the current state.
            mols.append(RWMol(Chem.MolFromSmiles(drawn_smiles)))
            legends.append("3. " + self.env._listToSmiles())

        #test add-back
        check_env("CC")
        mols.append(RWMol(Chem.MolFromSmiles("C")))
        legends.append("1. C")
        mols.append(RWMol(Chem.MolFromSmiles("CC")))
        legends.append("2. CC")

        #test add-front
        take_step("add", "front", "C1=CC=CC=C1")
        check_env("CCC1=CC=CC=C1")
        snapshot("CCC1=CC=CC=C1")

        #test remove-back
        take_step("remove", "back", "C")
        check_env("C1=CC=CC=C1C")
        snapshot("CC1=CC=CC=C1")

        #test remove-front
        take_step("remove", "front", "C1=CC=CC=C1")
        check_env("C")
        snapshot("C")

        #test current molecule
        before = self.env.current_molecule
        take_step("add", "front", "CC")
        self.assertNotEqual(self.env.current_molecule, before)
        snapshot("CCC")
Beispiel #16
0
def main():
    """Fragment a single SMILES and print its nodes, edges and attributes.

    Command-line arguments:
        --smiles                   the SMILES to process (required).
        --id                       identifier stored with the molecule
                                   (default "smiles1").
        --standardize              run ``standardize`` on the molecule first.
        --isomeric/--non_isomeric  set ``iso_flag`` (default True).
        -v / -vv                   verbosity 1 or 2 (default 0).

    Exits with status 1 after printing an error when no SMILES is given or
    the SMILES cannot be parsed.
    """
    # Read in a SD or SMILES file - then write out into a specified directory
    parser = argparse.ArgumentParser(
        description="Convert a SMILES to nodes, edges and attributes"
    )
    parser.add_argument("--smiles")
    parser.add_argument("--id")
    parser.add_argument("--standardize", action="store_true")
    parser.add_argument("--isomeric", dest="iso_flag", action="store_true")
    parser.add_argument("--non_isomeric", dest="iso_flag", action="store_false")

    group = parser.add_mutually_exclusive_group()
    group.add_argument("-v", dest="verbosity", action="store_const", const=1)
    group.add_argument("-vv", dest="verbosity", action="store_const", const=2)

    parser.set_defaults(verbosity=0)
    parser.set_defaults(iso_flag=True)

    args = parser.parse_args()

    # Do we have a SMILES to work on?
    if not args.smiles:
        print('ERROR: Must specify a SMILES')
        sys.exit(1)

    attrs = []
    print("Original SMILES: " + args.smiles)
    mol = Chem.MolFromSmiles(args.smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure with None; stop here with a
        # clear message instead of failing later inside MolToSmiles.
        print('ERROR: Could not parse SMILES (%s)' % args.smiles)
        sys.exit(1)
    if args.standardize:
        mol = standardize(mol)
        print("Standardized SMILES: " + Chem.MolToSmiles(mol))
    smiles = Chem.CanonSmiles(Chem.MolToSmiles(mol, isomericSmiles=True))
    print("Canonical SMILES: " + smiles)

    # Local name chosen so the builtin ``id`` is not shadowed.
    mol_id = args.id
    if mol_id is None:
        mol_id = "smiles1"
    attr = Attr(smiles, ["EM", mol_id])
    attrs.append(attr)
    # Build the network
    node_holder = NodeHolder(iso_flag=args.iso_flag)
    max_frags = 0
    # NOTE(review): the canonical SMILES is passed in the fourth position;
    # confirm that this matches build_network's signature.
    node_holder = build_network(attrs, node_holder,
                                max_frags, smiles, args.verbosity)
    # Write the data out
    for node in node_holder.node_list:
        print(str(node))

    for edge in node_holder.get_edges():
        print(str(edge))

    for attr in attrs:
        print(str(attr))

    print("Number of nodes: " + str(len(node_holder.node_list)))
    print("Number of edges: " + str(len(node_holder.get_edges())))
def default_molecules():
    """Return the two built-in starting molecules.

    Each hard-coded SMILES is canonicalised and wrapped in a ``Molecule``
    with the fixed database pedigree.
    """
    pedigree = ("database", "no reaction", "no parent")
    seed_smiles = (
        "Clc1ccc(cc1)C(c2ccccc2)N3CCN(CC3)CCOCC(=O)O",
        "CC1=CC(Cl)=CC(C(=O)N[C@@H]2C[C@@H]3CCCC[C@@H]32)=C1C",
    )

    molecules = []
    for smiles in seed_smiles:
        canonical = Chem.CanonSmiles(smiles)
        molecules.append(Molecule(canonical, pedigree))
    return molecules
Beispiel #18
0
 def test_examples(self):
     """Check every deprotection that ships an example reaction.

     The example has the form ``start>>end``.  Deprotecting ``start`` with
     only that deprotection must give the canonical SMILES of ``end``.
     At least one example has to be exercised.
     """
     count = 0
     for data in rd.GetDeprotections():
         if not data.example:
             continue
         start, end = data.example.split(">>")
         m = Chem.MolFromSmiles(start)
         m2 = rd.Deprotect(m, [data])
         print("Testing", data.full_name)
         self.assertEqual(Chem.MolToSmiles(m2), Chem.CanonSmiles(end))
         count += 1
     # A bare ``assert`` is stripped under ``python -O``; the unittest
     # assertion always runs.
     self.assertGreater(count, 0)
Beispiel #19
0
 def sanitize(smi):
     """Parse, sanitize and re-canonicalise a SMILES string.

     Raises ``Exception`` when the SMILES cannot be converted to a
     molecule.  Returns the canonical form of the isomeric SMILES.
     """
     mol = Chem.MolFromSmiles(smi, sanitize=False)
     if mol is None:
         raise Exception("could not convert SMILES to RDKit Mol: %s" % smi)

     # Leaving SANITIZE_FINDRADICALS enabled gives [C] in about 10% of
     # generated mols, so that single operation is masked out.
     all_ops = Chem.SanitizeFlags.SANITIZE_ALL
     without_radicals = all_ops ^ Chem.SanitizeFlags.SANITIZE_FINDRADICALS
     Chem.SanitizeMol(mol, sanitizeOps=without_radicals)

     # Chem.MolToSmiles alone can still give SMILES that cannot later be
     # converted back to a Mol, hence the extra pass through CanonSmiles.
     isomeric = Chem.MolToSmiles(mol, isomericSmiles=True)
     return Chem.CanonSmiles(isomeric)
Beispiel #20
0
def main(args):
    """Write the canonical form of every SMILES line of a file.

    Arguments:
        args: object with ``input_file`` (one SMILES per line) and
            ``output_file`` (receives one canonical SMILES per line).

    Lines that cannot be canonicalised are skipped.
    """
    with open(args.input_file, 'r') as src, \
            open(args.output_file, 'w') as dst:
        for line in tqdm(src):
            try:
                canonical = Chem.CanonSmiles(line.strip())
            except Exception:
                # Only canonicalisation failures are skipped. The write is
                # outside the try so output errors are no longer swallowed.
                continue
            dst.write("{}\n".format(canonical))
Beispiel #21
0
def create_sd_file(name, smiles, save_directory):
    """
    Create a 2D sdf file in the proasis project directory for successfully detected ligands

    :param name: label used in the progress message
    :param smiles: SMILES of the ligand
    :param save_directory: path handed to ``Chem.SDWriter``
    """
    # create sdf file for ligand and save to hit directory
    canon_smiles = Chem.CanonSmiles(smiles)
    mol = Chem.MolFromSmiles(canon_smiles)
    AllChem.Compute2DCoords(mol)
    print('Generating sdf file and saving to ' + name + ' directory...\n')
    sd_file = Chem.SDWriter(save_directory)
    try:
        sd_file.write(mol)
    finally:
        # Close explicitly so the record is flushed and the handle released,
        # rather than relying on garbage collection.
        sd_file.close()
Beispiel #22
0
def tanimoto_dist(x_test, autoencoded_selfies, dim, selfies_alphabet):
    '''
    method that computes, for every test molecule, the fingerprint
    similarity between the molecule and its encode-decoded counterpart

    x_test and autoencoded_selfies are indexed in parallel; each entry is
    reshaped to (dim[0], dim[1]) and reduced to a one-hot encoding by
    taking the arg-max of every row. Both encodings are converted to
    SELFIES, decoded to SMILES, canonicalised and compared through
    RDKFingerprint / DataStructs.FingerprintSimilarity.

    returns a numpy array with one similarity value per entry of x_test
    '''

    def _argmax_one_hot(encoded):
        # One row per position, with a 1 at that row's arg-max.
        grid = np.zeros((dim[0], dim[1]))
        for ind, row in enumerate(encoded.reshape(dim[0], dim[1])):
            grid[ind][np.argmax(row)] = 1
        return grid

    dist = []

    for i, mol in enumerate(x_test):

        # single point - through vae
        one_hot = _argmax_one_hot(autoencoded_selfies[i])
        # single point - non vae
        one_hot_true = _argmax_one_hot(mol)

        self_test = sf.encoding_to_selfies(one_hot.tolist(), selfies_alphabet,
                                           "one_hot")
        self_true = sf.encoding_to_selfies(one_hot_true.tolist(),
                                           selfies_alphabet, "one_hot")

        # Names now match their contents (they were swapped before; the
        # result is the same because the two fingerprints are only compared
        # with each other).
        canonical_autoencoder_smiles = Chem.CanonSmiles(sf.decoder(self_test))
        canonical_smiles = Chem.CanonSmiles(sf.decoder(self_true))

        fps_autoencoder = Chem.RDKFingerprint(
            Chem.MolFromSmiles(canonical_autoencoder_smiles))
        fps_true = Chem.RDKFingerprint(Chem.MolFromSmiles(canonical_smiles))
        similarity = DataStructs.FingerprintSimilarity(fps_autoencoder,
                                                       fps_true)
        dist.append(float(similarity))

    return np.array(dist)
Beispiel #23
0
def corpus(input: str, out: str, *, vocab_path: str):
    """Constructing the molecular corpus by splitting each SMILES into
    a range of tokens contained in vocabulary.

    Arguments:
        input : the path of tab-delimited data file that contains CANONICAL_SMILES.
        out : the path prefix for the vocabulary (``out + '_voc.txt'``, all tokens
            used for SMILES construction) and the output table
            (``out + '_corpus.txt'``, CANONICAL_SMILES plus the whitespace
            delimited token sentence).
        vocab_path : the path handed to ``Voc`` to build the tokenizer.
    """
    df = pd.read_table(input).CANONICAL_SMILES
    voc = Voc(vocab_path)
    canons = []
    tokens = []
    smiles = set()
    it = tqdm(df, desc='Reading SMILES')
    for smile in it:
        # replacing the isotope-labelled atom with the unlabelled atom
        smile = SUB_RE.sub('[', smile)
        # reserving the largest one if the molecule contains more than one
        # fragment, which are separated by '.' (first one on ties).
        if '.' in smile:
            smile = max(smile.split('.'), key=len)
        # if it doesn't contain carbon atoms, it cannot be a drug-like molecule, just remove
        if smile.count('C') + smile.count('c') < 2:
            continue
        if smile in smiles:
            it.write('duplicate: {}'.format(smile))
        smiles.add(smile)
    # collecting all of the tokens in the sentences for vocabulary construction.
    words = set()
    it = tqdm(smiles, desc='Collecting tokens')
    for smile in it:
        try:
            token = voc.tokenize(smile)
            if len(token) <= 100:
                # Canonicalise first: if it fails, the molecule's tokens must
                # not enter the vocabulary while the molecule is left out of
                # the corpus.
                canonical = Chem.CanonSmiles(smile, 0)
                words.update(token)
                canons.append(canonical)
                tokens.append(' '.join(token))
        except Exception as e:
            it.write('{} {}'.format(e, smile))
    # persisting the vocabulary on the hard drive.
    with open(out + '_voc.txt', 'w') as file:
        file.write('\n'.join(sorted(words)))

    # saving the canonical smiles and token sentences as a table into hard drive.
    log = pd.DataFrame()
    log['CANONICAL_SMILES'] = canons
    log['SENT'] = tokens
    # drop_duplicates returns a new frame; the result was previously
    # discarded, so duplicates were written out.
    log = log.drop_duplicates(subset='CANONICAL_SMILES')
    log.to_csv(out + '_corpus.txt', sep='\t', index=False)
Beispiel #24
0
def fragment_molecule_on_explicit_hydrogens(smiles):
    """Cut each matched hydrogen off the molecule and describe every cut.

    For every pair matched by ``_hydrogen_cut_pat`` the bond between the
    two atoms is fragmented.  The piece that is not ``[*][H]`` becomes the
    variable part of a new ``Fragmentation``.  Returns the list of those
    ``Fragmentation`` objects.
    """
    hydrogen_piece = "[*][H]"  # Hard-coded

    num_heavies = get_num_heavies_from_smiles(smiles)
    # The result is not used below; the call is kept so behaviour on bad
    # input stays as it was (presumably it acts as an early validity check).
    smiles_with_H = Chem.CanonSmiles(smiles)

    # sanitize=False preserves the explicit hydrogens while parsing
    input_mol = Chem.MolFromSmiles(smiles, sanitize=False)
    Chem.SanitizeMol(input_mol, Chem.SANITIZE_ALL)

    fragmentations = []
    for cut_pair in input_mol.GetSubstructMatches(_hydrogen_cut_pat):
        bond_idx = input_mol.GetBondBetweenAtoms(*cut_pair).GetIdx()
        fragmented_mol = Chem.FragmentOnBonds(input_mol, [bond_idx],
                                              dummyLabels=[(0, 0)])
        new_smiles = Chem.MolToSmiles(fragmented_mol, isomericSmiles=True)

        left, mid, right = new_smiles.partition(".")
        assert mid == ".", new_smiles

        # Exactly one side has to be the cut-off hydrogen.
        if hydrogen_piece not in (left, right):
            raise AssertionError("did not split hydrogen correctly: %r %r" %
                                 (smiles, new_smiles))
        cut_smiles = right if left == hydrogen_piece else left

        if "[H]" in cut_smiles:
            # If there were multiple [H] atoms, then we cut on one but others remain.
            # Recanonicalize to remove them.
            cut_smiles = Chem.CanonSmiles(cut_smiles)

        fragmentations.append(
            Fragmentation(1, EnumerationLabel.NO_ENUMERATION,
                          0, "1", "[*][H]", "0", num_heavies,
                          "1", cut_smiles, None))

    return fragmentations
Beispiel #25
0
def corpus(input, output, suffix='sdf'):
    """Standardise molecules, tokenize them and save vocabulary and corpus.

    Arguments:
        input (str): a gzip-compressed SD file when ``suffix == 'sdf'``,
            otherwise a tab-delimited table with a ``Smiles`` column.
        output (str): path prefix; the vocabulary goes to
            ``output + '_voc.txt'`` and the table of SMILES and token
            sentences to ``output + '_corpus.txt'``.
        suffix (str): selects the input reader (default ``'sdf'``).

    Molecules that fail standardisation are reported and skipped.  A
    molecule is kept only when its token list contains ``C`` or ``c``,
    contains neither ``[Na]`` nor ``[Zn]``, and has more than 10 and at
    most 100 tokens (including the trailing ``EOS``).
    """
    # The gzip handle has to stay open while the forward supplier is
    # iterated; it is closed once the molecules have been consumed.
    inf = None
    if suffix == 'sdf':
        inf = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(inf)
    else:
        df = pd.read_table(input).Smiles.dropna()
        mols = [Chem.MolFromSmiles(s) for s in df]
    voc = Voc('data/voc_smiles.txt')
    charger = rdMolStandardize.Uncharger()
    chooser = rdMolStandardize.LargestFragmentChooser()
    disconnector = rdMolStandardize.MetalDisconnector()
    normalizer = rdMolStandardize.Normalizer()
    words = set()
    canons = []
    tokens = []
    smiles = set()
    try:
        for mol in tqdm(mols):
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = charger.uncharge(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                smileR = Chem.MolToSmiles(mol, 0)
                smiles.add(Chem.CanonSmiles(smileR))
            except Exception:
                # Narrowed from a bare except so Ctrl-C still interrupts.
                print('Parsing Error:')
    finally:
        if inf is not None:
            inf.close()

    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    log = pd.DataFrame()
    log['Smiles'] = canons
    log['Token'] = tokens
    # drop_duplicates returns a new frame; the result was previously
    # discarded.
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)
Beispiel #26
0
def compare_equality(x_test, autoencoded_selfies, dim, selfies_alphabet):
    '''
    method that prints the fraction of test molecules whose encode-decoded
    counterpart has the same canonical SMILES as the molecule itself

    x_test and autoencoded_selfies are indexed in parallel; each entry is
    reshaped to (dim[0], dim[1]) and reduced to a one-hot encoding by
    taking the arg-max of every row. For the entry at index 1 both SMILES
    are printed as a sample.
    '''

    def one_hot_of(encoded):
        # One row per position, with a 1 at that row's arg-max.
        grid = np.zeros((dim[0], dim[1]))
        for position, scores in enumerate(encoded.reshape(dim[0], dim[1])):
            grid[position][np.argmax(scores)] = 1
        return grid

    test_size = len(x_test)
    count_good = 0
    for i, mol in enumerate(x_test):
        # through the vae first, then the untouched test molecule
        decoded_grid = one_hot_of(autoencoded_selfies[i])
        true_grid = one_hot_of(mol)

        self_test = sf.encoding_to_selfies(decoded_grid.tolist(),
                                           selfies_alphabet, "one_hot")
        self_true = sf.encoding_to_selfies(true_grid.tolist(),
                                           selfies_alphabet, "one_hot")

        canonical_smiles = Chem.CanonSmiles(sf.decoder(self_true))
        canonical_autoencoder_smiles = Chem.CanonSmiles(sf.decoder(self_test))

        if i == 1:
            print("Autoencoded Smiles: " + canonical_autoencoder_smiles)
            print("True Smiles: " + canonical_smiles)

        if canonical_autoencoder_smiles == canonical_smiles:
            count_good += 1

    fraction = count_good / test_size
    print("Percent Reconstructed Molescules: " + str(fraction))
Beispiel #27
0
def parse_node(input_str):
    """
    Build a Node from a whitespace-separated record such as:
    NODE O=CCCc1ccc(cc1)c2ccccc2 16 12 OCCCC1CCC(CC1)C2CCCCC2 0

    Field 1 is canonicalised into SMILES; fields 2, 3 and 4 are stored
    unchanged (as strings) in HAC, RAC and RING_SMILES.
    :param input_str: the record text
    :return: the populated Node
    """
    fields = input_str.split()
    raw_smiles = fields[1]

    node = Node()
    node.SMILES = Chem.CanonSmiles(raw_smiles)
    node.HAC = fields[2]
    node.RAC = fields[3]
    node.RING_SMILES = fields[4]
    return node
Beispiel #28
0
def clean_mol(smile, is_deep=True):
    """Normalise a SMILES string and return its canonical form.

    Arguments:
        smile (str): input SMILES.
        is_deep (bool): when true, the molecule is additionally passed
            through ``rdMolStandardize.ChargeParent``.

    Returns:
        The canonical SMILES, or ``None`` (after printing a message) when
        any processing step fails.

    Before parsing, ``[O]``, ``[C]``, ``[N]`` and ``[B]`` are replaced by
    the unbracketed atom, and ``[2H]``/``[3H]`` by ``[H]``.
    """
    smile = smile.replace('[O]', 'O').replace('[C]', 'C') \
        .replace('[N]', 'N').replace('[B]', 'B') \
        .replace('[2H]', '[H]').replace('[3H]', '[H]')
    try:
        mol = Chem.MolFromSmiles(smile)
        if is_deep:
            mol = rdMolStandardize.ChargeParent(mol)
        smileR = Chem.MolToSmiles(mol, 0)
        smile = Chem.CanonSmiles(smileR)
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit are not
        # swallowed when this runs inside a long loop.
        print('Parsing Error:', smile)
        smile = None
    return smile
Beispiel #29
0
def replace_wildcard_with_H(smiles):
    """Return the canonical SMILES with the single ``[*]`` turned into ``[H]``.

    Results are memoised in ``_H_cache``, which is emptied once it holds
    more than 10000 entries.  The input must contain exactly one ``[*]``.
    """
    # The cache gives about 2% overall performance improvement.
    # My tests suggest there's about a 50% cache hit.
    try:
        return _H_cache[smiles]
    except KeyError:
        pass

    wildcard_count = smiles.count("[*]")
    assert wildcard_count == 1, smiles

    canonical = Chem.CanonSmiles(smiles.replace("[*]", "[H]"))

    # Crude size bound: start over instead of evicting single entries.
    if len(_H_cache) > 10000:
        _H_cache.clear()
    _H_cache[smiles] = canonical
    return canonical
Beispiel #30
0
    def post(self, request):
        """Handle an AJAX POST carrying a structure query.

        Reads ``is_sub``, ``tanimoto`` (default 0.8) and ``smiles`` from the
        POST data and canonicalises the SMILES. It then runs either a
        substructure search (``is_sub`` truthy) or a similarity search and
        renders the first page (10 per page) of the hits with the
        ``result/compounds_result.html`` template.

        NOTE(review): on a canonicalisation failure a plain string is
        returned rather than a response object - confirm the caller copes
        with that.
        NOTE(review): in the code visible here nothing is returned when the
        request is not AJAX or when ``smiles`` is empty.
        """
        if request.is_ajax():
            data = request.POST
            # NOTE(review): presumably a string from the form, in which case
            # any non-empty value (even "false") is truthy - verify.
            is_sub = data.get("is_sub")
            tanimoto = float(data.get("tanimoto", 0.8))
            smiles = data.get("smiles")
            try:
                smiles = Chem.CanonSmiles(smiles)
            except:
                return "Sorry, the structure is not appropriate!"
            if is_sub and smiles:
                # Substructure search branch.
                compounds = self._substructure_search(smiles)
                paginator = Paginator(compounds, 10)

                # Always serve page 1; if that page is empty, fall back to
                # the paginator's last page.
                try:
                    paginator_compounds = paginator.page(1)
                except PageNotAnInteger:
                    paginator_compounds = paginator.page(1)
                except EmptyPage:
                    paginator_compounds = paginator.page(paginator.num_pages)

                respone = render(
                    request, template_name="result/compounds_result.html",
                    context={
                        "compounds": paginator_compounds,
                             }
                )
                return StreamingHttpResponse(respone.content)

            elif not is_sub and smiles:
                # Similarity search branch, thresholded by ``tanimoto``.
                compounds = self._similarity_search(smiles, tanimoto=tanimoto)
                paginator = Paginator(compounds, 10)

                # Same first-page handling as in the substructure branch.
                try:
                    paginator_compounds = paginator.page(1)
                except PageNotAnInteger:
                    paginator_compounds = paginator.page(1)
                except EmptyPage:
                    paginator_compounds = paginator.page(paginator.num_pages)

                response = render(
                    request,
                    template_name="result/compounds_result.html",
                    context={
                        "compounds": paginator_compounds,
                    }
                )
                return StreamingHttpResponse(response.content)