def generate_structures(vae, smi, char_to_index, limit=1e4, write=False):
    """
    Sample unique structures from a VAE until sampling stops producing new ones.

    On every pass a temperature is drawn uniformly from [0, 2), a candidate
    string is decoded with ``decode_smiles`` (only the text before the first
    space is kept), and the candidate is parsed, hydrogenated, embedded,
    UFF-optimized and converted back to SMILES.  Candidates for which any of
    those steps raises are discarded.  A SMILES that has not been seen before
    is recorded together with the temperature and the iteration number at
    which it was found.

    Parameters
    ----------
    vae, smi, char_to_index
        Passed through unchanged to ``decode_smiles``.
    limit : number
        The loop ends once more than ``limit`` consecutive iterations have
        passed without a new unique structure.
    write : str or False
        When truthy, the current table is saved to ``'<write>.csv'`` whenever
        its contents change (and once on the first iteration).

    Returns
    -------
    df : pandas.DataFrame
        Columns ``smiles``, ``temperature`` and ``iteration``; empty when no
        structure was accepted.
    """
    rdkit_mols = []
    temps = []
    iterations = []
    seen = set()  # O(1) membership test; rdkit_mols keeps discovery order

    def _table():
        # Same construction as the table that is printed and saved.
        frame = pd.DataFrame([rdkit_mols, temps, iterations]).T
        frame.columns = ['smiles', 'temperature', 'iteration']
        return frame

    # Defined up front so the final return is valid even with zero hits.
    df = _table()
    pending_write = bool(write)
    iteration = limit_counter = 0
    while True:
        iteration += 1
        limit_counter += 1
        t = random.random() * 2
        candidate = decode_smiles(vae, smi, char_to_index,
                                  temp=t).split(" ")[0]
        try:
            sampled = Chem.MolFromSmiles(candidate)
            cation = Chem.AddHs(sampled)
            Chem.EmbedMolecule(cation, Chem.ETKDG())
            Chem.UFFOptimizeMolecule(cation)
            cation = Chem.RemoveHs(cation)
            candidate = Chem.MolToSmiles(cation)
        except Exception:
            # Best effort: a candidate that cannot be processed is skipped.
            # KeyboardInterrupt/SystemExit are deliberately not caught so
            # the loop can be stopped.
            candidate = None

        if candidate is not None and candidate not in seen:
            seen.add(candidate)
            temps.append(t)
            iterations.append(iteration)
            rdkit_mols.append(candidate)
            limit_counter = 0
            df = _table()
            print(df)
            pending_write = bool(write)

        if limit_counter > limit:
            break
        if pending_write:
            # The table only changes when a structure is added, so there is
            # nothing new to save on the other iterations.
            df.to_csv('{}.csv'.format(write), index=False)
            pending_write = False
    return df
Example #2
0
def get_descriptors(smiles):
    """
    Compute a fixed set of RDKit descriptors for one molecule.

    Parameters
    ----------
    smiles : str
        The SMILES string of the chemical of interest

    Returns
    -------
    descriptors : dict
        Maps a descriptor label (e.g. ``'molwt'``, ``'chi0'``, ``'log_p'``)
        to the value returned by the corresponding RDKit function.

    Notes: Developed with RDKit 2019.03.4, although doc pages listed 2019.03.1
    """
    # Parse, add explicit hydrogens and embed a conformer before any
    # descriptor is evaluated.
    mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
    Chem.EmbedMolecule(mol, Chem.ETKDG())

    # (label, calculator) pairs, evaluated and stored in this order.
    calculators = (
        # Simple descriptors
        # https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors.html
        ('molwt', Descriptors.ExactMolWt),
        # Partial charge metrics
        ('max_abs_partial_charge', Descriptors.MaxAbsPartialCharge),
        ('max_partial_charge', Descriptors.MaxPartialCharge),
        ('min_abs_partial_charge', Descriptors.MinAbsPartialCharge),
        ('min_partial_charge', Descriptors.MinPartialCharge),
        # Basic electron counts
        ('num_radical_electrons', Descriptors.NumRadicalElectrons),
        ('num_valence_electrons', Descriptors.NumValenceElectrons),
        # 3-D descriptors
        # https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors3D.html
        # The original author noted that rdMolDescriptors.CalcAsphericity /
        # CalcEccentricity gave the same results in some basic tests.
        ('asphericity', Descriptors3D.Asphericity),
        ('eccentricity', Descriptors3D.Eccentricity),
        ('inertial_shape_factor', Descriptors3D.InertialShapeFactor),
        ('radius_of_gyration', Descriptors3D.RadiusOfGyration),
        ('spherocity_index', Descriptors3D.SpherocityIndex),
        # Graph descriptors
        # https://www.rdkit.org/docs/source/rdkit.Chem.GraphDescriptors.html
        ('balaban_j', GraphDescriptors.BalabanJ),
        ('bertz_ct', GraphDescriptors.BertzCT),
        ('chi0', GraphDescriptors.Chi0),
        ('chi0n', GraphDescriptors.Chi0n),
        ('chi0v', GraphDescriptors.Chi0v),
        ('chi1', GraphDescriptors.Chi1),
        ('chi1n', GraphDescriptors.Chi1n),
        ('chi1v', GraphDescriptors.Chi1v),
        ('chi2n', GraphDescriptors.Chi2n),
        ('chi2v', GraphDescriptors.Chi2v),
        ('chi3n', GraphDescriptors.Chi3n),
        ('chi3v', GraphDescriptors.Chi3v),
        ('chi4n', GraphDescriptors.Chi4n),
        ('chi4v', GraphDescriptors.Chi4v),
        ('hall_kier_alpha', GraphDescriptors.HallKierAlpha),
        ('kappa1', GraphDescriptors.Kappa1),
        ('kappa2', GraphDescriptors.Kappa2),
        ('kappa3', GraphDescriptors.Kappa3),
        # Predicted properties from Wildman and Crippen
        ('log_p', Descriptors.MolLogP),
        ('refractivity', Descriptors.MolMR),
    )

    return {label: compute(mol) for label, compute in calculators}
Example #3
0
def summ_search(mol,
                name,
                args,
                log,
                dup_data,
                dup_data_idx,
                coord_Map=None,
                alg_Map=None,
                mol_template=None):
    """Embed conformers, minimize and filter them, then rotate torsions.

    Steps, in order:

    1. Sanitize ``mol``, add hydrogens and embed ``initial_confs``
       conformers (optionally constrained by ``coord_Map``).
    2. Minimize every conformer with the force field named by ``args.ff``
       and record its energy.
    3. Drop duplicates, first by energy alone and then by energy + RMSD.
    4. Pass each surviving conformer to ``genConformer_r`` together with
       the SD writer for ``name + '_rdkit' + args.output``.
    5. If torsions were rotated, re-read that file, filter it again and
       write the survivors to ``name + '_rdkit_rotated' + args.output``.

    Parameters
    ----------
    mol : RDKit molecule
        Molecule to sample; a hydrogen-added copy is used internally.
    name : str
        Base name for output files and conformer titles.
    args : namespace
        Options object.  Attributes read here: ``output``, ``sample``,
        ``auto_sample``, ``nodihedrals``, ``heavyonly``, ``max_torsions``,
        ``etkdg``, ``seed``, ``verbose``, ``ff``, ``opt_steps_RDKit``,
        ``initial_energy_threshold``, ``energy_threshold``,
        ``rms_threshold``, ``max_matches_RMSD``, ``degree``,
        ``metal_complex`` and ``metal``.  ``args.ff`` is overwritten with
        ``"UFF"`` when the molecule contains an atom with atomic number
        above 36.
    log : object
        Anything with a ``write(str)`` method.
    dup_data, dup_data_idx
        Table (indexed through ``.at``) and row label where the bookkeeping
        counters are stored.
    coord_Map, alg_Map, mol_template : optional
        Template inputs.  When all three are ``None`` the conformers are
        embedded and minimized freely; otherwise ``coord_Map`` constrains
        the embedding, the atoms matching ``mol_template`` are restrained
        during minimization, and each conformer is aligned onto
        ``mol_template`` using ``alg_Map``.

    Returns
    -------
    status : int
        ``1`` when conformers were generated, ``-1`` when the molecule was
        skipped because it has more than ``args.max_torsions`` torsions.
    """

    sdwriter = Chem.SDWriter(name + '_' + 'rdkit' + args.output)

    Chem.SanitizeMol(mol)
    mol = Chem.AddHs(mol)
    mol.SetProp("_Name", name)

    # The template workflow is used as soon as any template input is given.
    template_free = (coord_Map is None and alg_Map is None
                     and mol_template is None)

    def _force_field_for(conf_id):
        # Build the force field selected by args.ff for one conformer;
        # the run is aborted when the name is not recognised.
        if args.ff == "MMFF":
            return Chem.MMFFGetMoleculeForceField(
                mol, Chem.MMFFGetMoleculeProperties(mol), confId=conf_id)
        if args.ff == "UFF":
            return Chem.UFFGetMoleculeForceField(mol, confId=conf_id)
        log.write('   Force field {} not supported!'.format(args.ff))
        sys.exit()

    def _restore_metal(record):
        # Give iodine atoms with 2-6 bonds the atomic number of args.metal.
        # NOTE(review): callers pass ``rdmols[i]``; if SDMolSupplier indexing
        # returns a fresh molecule on every access, this edit never reaches
        # the record that is written afterwards - confirm the intent.
        for atom in record.GetAtoms():
            if atom.GetSymbol() == 'I' and 2 <= len(atom.GetBonds()) <= 6:
                for el in elementspt:
                    if el.symbol == args.metal:
                        atomic_number = el.number
                atom.SetAtomicNum(atomic_number)

    # detects and applies auto-detection of initial number of conformers
    if args.sample == 'auto':
        initial_confs = int(auto_sampling(args.auto_sample, mol, log))
    else:
        initial_confs = int(args.sample)

    dup_data.at[dup_data_idx, 'Molecule'] = name
    dup_data.at[dup_data_idx, 'RDKIT-Initial-samples'] = initial_confs

    if args.nodihedrals == False:
        rotmatches = getDihedralMatches(mol, args.heavyonly, log)
    else:
        rotmatches = []

    if len(rotmatches) > args.max_torsions:
        log.write("x  Too many torsions (%d). Skipping %s" %
                  (len(rotmatches), (name + args.output)))
        status = -1
    else:
        if template_free:
            if args.etkdg:
                ps = Chem.ETKDG()
                ps.randomSeed = args.seed
                ps.ignoreSmoothingFailures = True
                ps.numThreads = 0
                cids = rdDistGeom.EmbedMultipleConfs(mol,
                                                     initial_confs,
                                                     params=ps)
            else:
                cids = rdDistGeom.EmbedMultipleConfs(
                    mol,
                    initial_confs,
                    ignoreSmoothingFailures=True,
                    randomSeed=args.seed,
                    numThreads=0)
            # Fall back to random starting coordinates when embedding
            # produced nothing, or only one conformer although more than
            # one was requested.
            if len(cids) == 0 or (len(cids) == 1 and initial_confs != 1):
                log.write(
                    "o  conformers initially sampled with random coordinates")
                cids = rdDistGeom.EmbedMultipleConfs(
                    mol,
                    initial_confs,
                    randomSeed=args.seed,
                    useRandomCoords=True,
                    boxSizeMult=10.0,
                    ignoreSmoothingFailures=True,
                    numZeroFail=1000,
                    numThreads=0)
            if args.verbose:
                log.write("o  " + str(len(cids)) +
                          " conformers initially sampled")
        # case of embed for templates
        else:
            if args.etkdg:
                ps = Chem.ETKDG()
                ps.randomSeed = args.seed
                ps.coordMap = coord_Map
                ps.ignoreSmoothingFailures = True
                ps.numThreads = 0
                cids = rdDistGeom.EmbedMultipleConfs(mol,
                                                     initial_confs,
                                                     params=ps)
            else:
                cids = rdDistGeom.EmbedMultipleConfs(
                    mol,
                    initial_confs,
                    randomSeed=args.seed,
                    ignoreSmoothingFailures=True,
                    coordMap=coord_Map,
                    numThreads=0)
            if len(cids) == 0 or (len(cids) == 1 and initial_confs != 1):
                log.write(
                    "o  conformers initially sampled with random coordinates")
                cids = rdDistGeom.EmbedMultipleConfs(
                    mol,
                    initial_confs,
                    randomSeed=args.seed,
                    useRandomCoords=True,
                    boxSizeMult=10.0,
                    numZeroFail=1000,
                    ignoreSmoothingFailures=True,
                    coordMap=coord_Map,
                    numThreads=0)
            if args.verbose:
                log.write("o  " + str(len(cids)) +
                          " conformers initially sampled")

        # energy minimize all to get more realistic results
        # identify the atoms and decide Force Field
        for atom in mol.GetAtoms():
            if atom.GetAtomicNum() > 36:  # upto Kr for MMFF, if not use UFF
                # This overwrites the caller's option object.
                args.ff = "UFF"
        if args.verbose:
            log.write("o  Optimizing " + str(len(cids)) +
                      " initial conformers with" + args.ff)
        if args.verbose:
            if args.nodihedrals == False:
                log.write("o  Found " + str(len(rotmatches)) +
                          " rotatable torsions")
            else:
                log.write("o  Systematic torsion rotation is set to OFF")

        cenergy, outmols = [], []
        bar = IncrementalBar('o  Minimizing', max=len(cids))
        for conf in cids:
            if template_free:
                forcefield = _force_field_for(conf)
                forcefield.Initialize()
                forcefield.Minimize(maxIts=args.opt_steps_RDKit)
                energy = forcefield.CalcEnergy()
            # id template realign before doing calculations
            else:
                num_atom_match = mol.GetSubstructMatch(mol_template)
                forcefield = _force_field_for(conf)

                # clean up the conformation: restrain every pair of matched
                # atoms to the distance found in coord_Map
                for k, idxI in enumerate(num_atom_match):
                    for l in range(k + 1, len(num_atom_match)):
                        idxJ = num_atom_match[l]
                        d = coord_Map[idxI].Distance(coord_Map[idxJ])
                        forcefield.AddDistanceConstraint(
                            idxI, idxJ, d, d, 10000)
                forcefield.Initialize()
                # reassigned n from 4 to 10 for better embed and minimization
                n = 10
                more = forcefield.Minimize()
                while more and n:
                    more = forcefield.Minimize()
                    n -= 1
                energy = forcefield.CalcEnergy()
                # rotate the embedded conformation onto the core_mol:
                rdMolAlign.AlignMol(mol,
                                    mol_template,
                                    prbCid=conf,
                                    atomMap=alg_Map,
                                    reflect=True,
                                    maxIters=100)
            cenergy.append(energy)

            # outmols holds one PropertyMol copy of mol per conformer so that
            # a name and an energy can be attached to each of them (the
            # original authors could not set per-conformer properties on a
            # single parent molecule and measured no extra cost for copying).
            pmol = PropertyMol.PropertyMol(mol)
            outmols.append(pmol)
            bar.next()
        bar.finish()

        for i, cid in enumerate(cids):
            outmols[cid].SetProp('_Name', name + ' conformer ' + str(i + 1))
            outmols[cid].SetProp('Energy', cenergy[cid])

        cids = list(range(len(outmols)))
        sortedcids = sorted(cids, key=lambda cid: cenergy[cid])

        log.write("\n\no  Filters after intial embedding of " +
                  str(initial_confs) + " conformers")
        # Both counters start at -1: the first conformer is appended before
        # the comparison loop and is then compared against itself.
        selectedcids, selectedcids_initial, eng_dup, eng_rms_dup = [], [], -1, -1
        bar = IncrementalBar('o  Filtering based on energy (pre-filter)',
                             max=len(sortedcids))
        for i, conf in enumerate(sortedcids):
            # This keeps track of whether or not your conformer is unique
            excluded_conf = False
            # include the first conformer in the list to start the filtering process
            if i == 0:
                selectedcids_initial.append(conf)
            # check energy
            for seenconf in selectedcids_initial:
                E_diff = abs(cenergy[conf] - cenergy[seenconf])  # in kcal/mol
                if E_diff < args.initial_energy_threshold:
                    eng_dup += 1
                    excluded_conf = True
                    break
            if excluded_conf == False:
                if conf not in selectedcids_initial:
                    selectedcids_initial.append(conf)
            bar.next()
        bar.finish()

        if args.verbose == True:
            log.write("o  " + str(eng_dup) +
                      " Duplicates removed  pre-energy filter (E < " +
                      str(args.initial_energy_threshold) + " kcal/mol )")

        # reduce to unique set
        if args.verbose:
            log.write("o  Removing duplicate conformers ( RMSD < " +
                      str(args.rms_threshold) + " and E difference < " +
                      str(args.energy_threshold) + " kcal/mol)")

        bar = IncrementalBar('o  Filtering based on energy and rms',
                             max=len(selectedcids_initial))
        # check rmsd
        for i, conf in enumerate(selectedcids_initial):

            # set torsions to same value
            for m in rotmatches:
                rdMolTransforms.SetDihedralDeg(
                    outmols[conf].GetConformer(conf), *m, 180.0)

            # This keeps track of whether or not your conformer is unique
            excluded_conf = False
            # include the first conformer in the list to start the filtering process
            if i == 0:
                selectedcids.append(conf)
            # check rmsd
            for seenconf in selectedcids:
                E_diff = abs(cenergy[conf] - cenergy[seenconf])  # in kcal/mol
                if E_diff < args.energy_threshold:
                    # NOTE(review): both molecule arguments are
                    # outmols[conf]; confirm whether the first one was meant
                    # to be outmols[seenconf].
                    rms = get_conf_RMS(outmols[conf], outmols[conf], seenconf,
                                       conf, args.heavyonly,
                                       args.max_matches_RMSD, log)
                    if rms < args.rms_threshold:
                        excluded_conf = True
                        eng_rms_dup += 1
                        break
            if excluded_conf == False:
                if conf not in selectedcids:
                    selectedcids.append(conf)
            bar.next()
        bar.finish()

        if args.verbose == True:
            log.write("o  " + str(eng_rms_dup) +
                      " Duplicates removed (RMSD < " +
                      str(args.rms_threshold) + " / E < " +
                      str(args.energy_threshold) + " kcal/mol) after rotation")
        if args.verbose:
            log.write("o  " + str(len(selectedcids)) +
                      " unique (ignoring torsions) starting conformers remain")

        dup_data.at[dup_data_idx, 'RDKit-energy-duplicates'] = eng_dup
        dup_data.at[dup_data_idx,
                    'RDKit-RMS-and-energy-duplicates'] = eng_rms_dup
        dup_data.at[dup_data_idx,
                    'RDKIT-Unique-conformers'] = len(selectedcids)

        # now exhaustively drive torsions of selected conformers
        n_confs = int(len(selectedcids) * (360 / args.degree)**len(rotmatches))
        if args.verbose and len(rotmatches) != 0:
            log.write("\n\no  Systematic generation of " + str(n_confs) +
                      " confomers")
            bar = IncrementalBar(
                'o  Generating conformations based on dihedral rotation',
                max=len(selectedcids))
        else:
            bar = IncrementalBar('o  Generating conformations',
                                 max=len(selectedcids))

        total = 0
        for conf in selectedcids:
            total += genConformer_r(outmols[conf], conf, 0, rotmatches,
                                    args.degree, sdwriter, args,
                                    outmols[conf].GetProp('_Name'), log)
            bar.next()
        bar.finish()
        if args.verbose and len(rotmatches) != 0:
            log.write("o  %d total conformations generated" % total)
        status = 1
    sdwriter.close()

    # getting the energy from and mols after rotations
    # Only done for molecules that were actually processed: a skipped
    # molecule (status == -1) has no conformers on disk and no ``total``.
    if status == 1 and len(rotmatches) != 0:
        rdmols = Chem.SDMolSupplier(name + '_' + 'rdkit' + args.output,
                                    removeHs=False)
        if rdmols is None:
            log.write("Could not open " + name + args.output)
            sys.exit(-1)

        bar = IncrementalBar(
            'o  Filtering based on energy and rms after rotation of dihedrals',
            max=len(rdmols))
        sdwriter = Chem.SDWriter(name + '_' + 'rdkit' + '_' + 'rotated' +
                                 args.output)

        rd_count = 0
        # rd_dup_energy starts at -1 because the first record is compared
        # against itself after being added to rd_selectedcids.
        rd_selectedcids, rd_dup_energy, rd_dup_rms_eng = [], -1, 0
        for i in range(len(rdmols)):
            # This keeps track of whether or not your conformer is unique
            excluded_conf = False
            # include the first conformer in the list to start the filtering process
            if rd_count == 0:
                rd_selectedcids.append(i)
                if args.metal_complex == True:
                    _restore_metal(rdmols[i])
                sdwriter.write(rdmols[i])
            # Only the first ID gets included
            rd_count = 1
            # check rmsd
            for j in rd_selectedcids:
                if abs(
                        float(rdmols[i].GetProp('Energy')) -
                        float(rdmols[j].GetProp('Energy'))
                ) < args.initial_energy_threshold:  # comparison in kcal/mol
                    excluded_conf = True
                    rd_dup_energy += 1
                    break
                if abs(
                        float(rdmols[i].GetProp('Energy')) -
                        float(rdmols[j].GetProp('Energy'))
                ) < args.energy_threshold:  # in kcal/mol
                    rms = get_conf_RMS(rdmols[i], rdmols[j], -1, -1,
                                       args.heavyonly, args.max_matches_RMSD,
                                       log)
                    if rms < args.rms_threshold:
                        excluded_conf = True
                        rd_dup_rms_eng += 1
                        break
            if excluded_conf == False:
                if args.metal_complex == True:
                    _restore_metal(rdmols[i])
                sdwriter.write(rdmols[i])
                if i not in rd_selectedcids:
                    rd_selectedcids.append(i)
            bar.next()
        bar.finish()
        sdwriter.close()

        if args.verbose == True:
            log.write("o  " + str(rd_dup_energy) +
                      " Duplicates removed initial energy ( E < " +
                      str(args.initial_energy_threshold) + " kcal/mol )")
        if args.verbose == True:
            log.write("o  " + str(rd_dup_rms_eng) +
                      " Duplicates removed (RMSD < " +
                      str(args.rms_threshold) + " / E < " +
                      str(args.energy_threshold) + " kcal/mol) after rotation")
        if args.verbose == True:
            log.write("o  " + str(len(rd_selectedcids)) +
                      " unique (after torsions) conformers remain")

        # filtering process after rotations
        dup_data.at[dup_data_idx, 'RDKIT-Rotated-conformers'] = total
        dup_data.at[dup_data_idx,
                    'RDKIT-Rotated-Unique-conformers'] = len(rd_selectedcids)

    return status