from rdkit.Chem import Crippen, MolFromSmiles, MolToSmiles
from tqdm import tqdm


def canonicalize(smi_list, showprogress=False):
    # Canonicalise the input SMILES and deduplicate on the canonical strings.
    mol_list = []
    if showprogress:
        print('Canonicalising mols')
        for smi in tqdm(smi_list):
            mol = MolFromSmiles(smi)
            if mol is not None:
                mol_list.append(MolToSmiles(mol))
    else:
        for smi in smi_list:
            mol = MolFromSmiles(smi)
            if mol is not None:
                mol_list.append(MolToSmiles(mol))
    mol_list = list(set(mol_list))
    # Keep only molecules with more than 17 heavy atoms and Crippen logP <= 5.
    final_list = []
    if showprogress:
        print('Size of unfiltered final library: {}'.format(len(mol_list)))
        print('Filtering by n_heavy and logP:')
        for smi in tqdm(mol_list):
            mol = MolFromSmiles(smi)
            n_heavy = mol.GetNumHeavyAtoms()
            if n_heavy > 17:
                logP = Crippen.MolLogP(mol)
                if logP <= 5:
                    final_list.append(smi)
    else:
        for smi in mol_list:
            mol = MolFromSmiles(smi)
            n_heavy = mol.GetNumHeavyAtoms()
            if n_heavy > 17:
                logP = Crippen.MolLogP(mol)
                if logP <= 5:
                    final_list.append(smi)
    return final_list
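# A minimal usage sketch, assuming the imports above; the input SMILES below
# are illustrative and not part of the original example.
raw_smiles = [
    'CCO',                                # too few heavy atoms: filtered out
    'OCC',                                # duplicate of CCO after canonicalisation
    'CC(C)NCC(O)COc1ccc(CC(N)=O)cc1',     # atenolol, 19 heavy atoms, low logP: kept
    'not_a_smiles',                       # fails to parse: skipped
]
library = canonicalize(raw_smiles, showprogress=True)
print('{} molecule(s) in the filtered library'.format(len(library)))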
Example #2
def canonicalize_and_filter(smi_list, showprogress=False):
    """
    Returns the list of unique canonical SMILES obtained from a list of input
    SMILES strings, by round-tripping each string through an RDKit molecule
    (SMILES -> Mol -> canonical SMILES) and checking the results for uniqueness.

    Also performs rudimentary filtering, dropping molecules with a Crippen logP
    above 5 (a Lipinski rule-of-5 criterion) or with 17 or fewer heavy atoms.
    """
    mol_list = []
    if showprogress:
        print('Canonicalising mols')
        for smi in tqdm(smi_list):
            mol = MolFromSmiles(smi)
            if mol is not None:
                mol_list.append(MolToSmiles(mol))
    else:
        for smi in smi_list:
            mol = MolFromSmiles(smi)
            if mol is not None:
                mol_list.append(MolToSmiles(mol))
    mol_list = list(set(mol_list))
    final_list = []
    if showprogress:
        print('Size of unfiltered final library: {}'.format(len(mol_list)))
        print('Filtering by n_heavy and logP:')
        for smi in tqdm(mol_list):
            mol = MolFromSmiles(smi)
            n_heavy = mol.GetNumHeavyAtoms()
            if n_heavy > 17:
                logP = Crippen.MolLogP(mol)
                if logP <= 5:
                    final_list.append(smi)
    else:
        for smi in mol_list:
            mol = MolFromSmiles(smi)
            n_heavy = mol.GetNumHeavyAtoms()
            if n_heavy > 17:
                logP = Crippen.MolLogP(mol)
                if logP <= 5:
                    final_list.append(smi)
    return final_list
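# A minimal usage sketch for canonicalize_and_filter(), assuming the RDKit and
# tqdm imports from the first example; the .smi file names are hypothetical.
with open('enumerated_smiles.smi') as fh:
    smiles_in = [line.strip() for line in fh if line.strip()]
filtered = canonicalize_and_filter(smiles_in, showprogress=True)
with open('filtered_library.smi', 'w') as fh:
    fh.write('\n'.join(filtered) + '\n')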
Example #3
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Crippen import MolLogP


def worker(line):
    # Annotate a "SMILES CID" line with a heavy-atom count and logP code;
    # returns None if the SMILES cannot be parsed. Assumes two module-level
    # names defined elsewhere: `remover` (an RDKit SaltRemover) and
    # `scale_logp_value` (a logP scaling helper).
    smiles, cid = line.strip().split()[:2]
    mol = MolFromSmiles(smiles)
    if mol:
        # Strip salt/solvent fragments before computing descriptors.
        if '.' in smiles:
            mol = remover.StripMol(mol)
        logp = MolLogP(mol)
        num_heavy_atoms = mol.GetNumHeavyAtoms()
        if num_heavy_atoms > 99:
            num_heavy_atoms = 99  # cap so the count fits in two digits
        sign = 'M' if logp < 0.0 else 'P'  # M = minus (negative logP), P = plus
        return f'{smiles} {cid} H{num_heavy_atoms:02}{sign}{abs(scale_logp_value(logp)):03}\n'
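# A minimal usage sketch for worker(): stream a "SMILES CID" file through a
# process pool and keep the annotated lines (worker returns None for SMILES
# that fail to parse). File names and pool size are illustrative assumptions.
from multiprocessing import Pool

if __name__ == '__main__':
    with open('compounds.smi') as fin, \
            open('annotated.smi', 'w') as fout, \
            Pool(processes=4) as pool:
        for annotated in pool.imap_unordered(worker, fin, chunksize=1000):
            if annotated is not None:
                fout.write(annotated)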
Example #4
from glob import glob
import os.path as osp

import matplotlib.pyplot as plt
import pandas as pd
from rdkit.Chem import MolFromSmiles


def dataset_distribution(f_paths):
    # Plot a histogram of heavy-atom counts over every SMILES in the CSV files
    # matching the f_paths glob pattern.
    n_heavy = []
    f_path = None

    for f_path in glob(f_paths):
        dataset = pd.read_csv(f_path)

        for smiles in dataset["SMILES"]:
            mol = MolFromSmiles(smiles)
            n_heavy.append(mol.GetNumHeavyAtoms())

    # Save the histogram next to the last CSV processed.
    f_dir = osp.dirname(f_path)
    f_base = osp.basename(f_path).split(".")[0]
    plt.hist(n_heavy, bins=range(min(n_heavy), max(n_heavy) + 1))
    plt.xlabel("num of heavy atoms")
    plt.ylabel("count")
    plt.title("dd_sol")
    plt.savefig(osp.join(f_dir, f_base + "_mmff_dist.png"))
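# A minimal usage sketch for dataset_distribution(); the glob pattern is a
# hypothetical example, and each CSV is expected to contain a "SMILES" column.
# The histogram is written as <csv_basename>_mmff_dist.png next to the CSVs.
dataset_distribution('data/dd_sol/*.csv')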