Exemple #1
0
    def test7Fragment(self):
        """Exercise FragmentRemover and LargestFragmentChooser on small ionic inputs."""
        # The default remover is expected to leave only the amine.
        amine_salt = Chem.MolFromSmiles("CN(C)C.Cl.Cl.Br")
        stripped = rdMolStandardize.FragmentRemover().remove(amine_salt)
        self.assertEqual(Chem.MolToSmiles(stripped), "CN(C)C")

        # The same ion pair is fed to two differently configured choosers.
        ion_pair = Chem.MolFromSmiles("[N+](=O)([O-])[O-].[CH3+]")

        default_chooser = rdMolStandardize.LargestFragmentChooser()
        default_pick = default_chooser.choose(ion_pair)
        self.assertEqual(Chem.MolToSmiles(default_pick), "O=[N+]([O-])[O-]")

        # With preferOrganic=True the carbon-containing cation is expected.
        organic_chooser = rdMolStandardize.LargestFragmentChooser(
            preferOrganic=True)
        organic_pick = organic_chooser.choose(ion_pair)
        self.assertEqual(Chem.MolToSmiles(organic_pick), "[CH3+]")
def main():
    """Desalt, fragment-select and normalize molecules, then write them out.

    Options come from ``UserInput()``. When no id column is given, ``'ID'`` is
    used. ``start`` and ``end`` select a positional row range of the frame
    returned by ``RDkitRead``; when they are not given the range runs from the
    first row through the last row.

    The result is written to ``<outpref>.<format>`` when ``format`` is
    ``'sdf'`` or ``'smi'``. Any other ``format`` value writes nothing.
    """
    args = UserInput()
    if args.id is None:
        args.id = 'ID'

    # An unspecified end has to mean "through the last row". Using -1 as the
    # default slice stop would silently drop the final molecule.
    first = 0 if args.start is None else int(args.start)
    last = None if args.end is None else int(args.end)

    df = RDkitRead(args.infile, args.id, removeHs=True, add_Hs=False)[first:last].dropna()

    remover = SaltRemover.SaltRemover()
    normzer = rdMolStandardize.Normalizer()
    chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)

    ## remove salts
    print('\033[34m## Desalting moleucles...\033[0m\n')
    df['mol'] = df.MOL.apply(remover.StripMol)
    ## keep a single fragment per molecule (organic fragments preferred)
    print('\033[34m## Choosing moleucles...\033[0m\n')
    df['mol2'] = df.mol.apply(chooser.choose)
    ## clean molecule (not really relevant?)
    print('\033[34m## Cleaning moleucles...\033[0m\n')
    df['mol3'] = df.mol2.apply(normzer.normalize)
    ## rewrite SMILES from the final molecule column (mol3)
    print('\033[34m## Converting moleucles...\033[0m\n')
    df['smiles'] = df.mol3.apply(Chem.MolToSmiles)

    out_path = args.outpref + '.' + args.format
    if args.format == 'sdf':
        rdpd.WriteSDF(df, out_path, molColName='mol3', idName=args.id, properties=['smiles'])
    elif args.format == 'smi':
        df.to_csv(out_path, index=False, sep=' ', columns=['smiles', args.id], header=True)
Exemple #3
0
def graph_corpus(input, output, suffix='sdf'):
    """Encode molecules from ``input`` with the graph vocabulary and save them.

    Parameters
    ----------
    input : str
        Path of the source file. With ``suffix == 'sdf'`` it is opened with
        ``gzip`` and read through ``Chem.ForwardSDMolSupplier``; otherwise it
        is read with ``pandas.read_table`` and must provide the columns
        ``Smiles`` and ``Molecule ChEMBL ID``.
    output : str
        Path of the tab-separated file that receives one encoded row per
        accepted molecule, indexed by the molecule id.
    suffix : str
        ``'sdf'`` (default) selects the SD-file reader; any other value
        selects the table reader.

    Notes
    -----
    Molecules are skipped when they have fewer than 4 or more than 62 bonds,
    contain no ``C`` atom symbol, or contain one of the listed metals.
    Molecules whose encoded form does not decode back to the expected SMILES
    are reported and skipped.
    """
    metals = {'Na', 'Zn', 'Li', 'K', 'Ca', 'Mg', 'Ag', 'Cs', 'Ra', 'Rb', 'Al', 'Sr', 'Ba', 'Bi'}
    voc = utils.VocGraph('data/voc_atom.txt')

    # The gzip handle is only needed for SD input; it is closed once the
    # (lazy) supplier has been fully consumed.
    inf = None
    if suffix == 'sdf':
        inf = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(inf)
        total = 2e6  # hard-coded progress-bar total for SD input
    else:
        mols = pd.read_table(input).drop_duplicates(subset=['Smiles']).dropna(subset=['Smiles'])
        total = len(mols)
        mols = mols.iterrows()

    # Neither dict is filled in this function; both are printed as-is at the end.
    vals = {}
    exps = {}
    codes, ids = [], []
    chooser = rdMolStandardize.LargestFragmentChooser()
    disconnector = rdMolStandardize.MetalDisconnector()
    normalizer = rdMolStandardize.Normalizer()

    try:
        for mol in tqdm(mols, total=total):
            if mol is None:
                continue
            if suffix != 'sdf':
                idx = mol[1]['Molecule ChEMBL ID']
                mol = Chem.MolFromSmiles(mol[1].Smiles)
                if mol is None:
                    # Unparsable SMILES: report and skip instead of failing
                    # later on mol.GetAtoms().
                    print('Parse Error:', idx)
                    continue
            else:
                idx = mol.GetPropsAsDict()['chembl_id']

            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
            except Exception:
                # Best effort: report the id and carry on with the molecule in
                # whatever state the failed step left it.
                print(idx)

            symb = [a.GetSymbol() for a in mol.GetAtoms()]
            # Size filter on the number of bonds.
            bonds = mol.GetBonds()
            if len(bonds) < 4 or len(bonds) >= 63:
                continue
            if {'C'}.isdisjoint(symb):
                continue
            if not metals.isdisjoint(symb):
                continue

            smile = Chem.MolToSmiles(mol)
            try:
                s0 = smile.replace('[O]', 'O').replace('[C]', 'C') \
                     .replace('[N]', 'N').replace('[B]', 'B') \
                     .replace('[2H]', '[H]').replace('[3H]', '[H]')
                s0 = Chem.CanonSmiles(s0, 0)
                code = voc.encode([smile])
                s1 = voc.decode(code)[0]
                # Explicit check rather than `assert`, so the round-trip
                # validation is not removed when Python runs with -O.
                if s0 != s1:
                    raise ValueError('round-trip mismatch: %s != %s' % (s0, s1))
                codes.append(code[0].reshape(-1).tolist())
                ids.append(idx)
            except Exception as ex:
                print(ex)
                print('Parse Error:', idx)
    finally:
        if inf is not None:
            inf.close()

    df = pd.DataFrame(codes, index=ids, columns=['C%d' % i for i in range(64*4)])
    df.to_csv(output, sep='\t', index=True)
    print(vals)
    print(exps)
Exemple #4
0
def corpus(input, output, suffix='sdf'):
    """Build a SMILES token vocabulary and corpus from ``input``.

    Parameters
    ----------
    input : str
        Path of the source file. With ``suffix == 'sdf'`` it is opened with
        ``gzip`` and read through ``Chem.ForwardSDMolSupplier``; otherwise it
        is read with ``pandas.read_table`` and its ``Smiles`` column is used.
    output : str
        Prefix of the two files written: ``<output>_voc.txt`` (sorted tokens,
        one per line) and ``<output>_corpus.txt`` (tab-separated table with
        the columns ``Smiles`` and ``Token``).
    suffix : str
        ``'sdf'`` (default) selects the SD-file reader; any other value
        selects the table reader.
    """
    # The gzip handle exists only for SD input and is closed after the (lazy)
    # supplier has been consumed.
    inf = None
    if suffix == 'sdf':
        inf = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(inf)
    else:
        df = pd.read_table(input).Smiles.dropna()
        mols = [Chem.MolFromSmiles(s) for s in df]

    voc = Voc('data/voc_smiles.txt')
    charger = rdMolStandardize.Uncharger()
    chooser = rdMolStandardize.LargestFragmentChooser()
    disconnector = rdMolStandardize.MetalDisconnector()
    normalizer = rdMolStandardize.Normalizer()
    words = set()
    canons = []
    tokens = []
    smiles = set()

    try:
        for mol in tqdm(mols):
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = charger.uncharge(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                smileR = Chem.MolToSmiles(mol, 0)
                smiles.add(Chem.CanonSmiles(smileR))
            except Exception:
                # Any molecule that fails a step is reported and left out.
                print('Parsing Error:')
    finally:
        if inf is not None:
            inf.close()

    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        # Only sequences of 11..100 tokens (including 'EOS') are kept.
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    log = pd.DataFrame()
    log['Smiles'] = canons
    log['Token'] = tokens
    # drop_duplicates returns a new frame, so the result has to be kept.
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)
Exemple #5
0
def choose_largest_fragment(mol):
    """Pick the largest fragment out of a multi-fragment molecule.

    Parameters
    ----------
    mol: rdkit.Chem.Mol
        Molecule that may consist of several fragments of different sizes.

    Returns
    -------
    mol: rdkit.Chem.Mol
        The fragment selected by a default-configured
        ``rdMolStandardize.LargestFragmentChooser``.
    """
    fragment_chooser = rdMolStandardize.LargestFragmentChooser()
    largest = fragment_chooser.choose(mol)
    return largest
Exemple #6
0
    def salt_remover(self, smiles: str) -> str:
        """
            Removes salts and counterions by keeping the fragment picked by
            LargestFragmentChooser. Non sanitizable molecules can't be processed

            SMILES without a "." separator, and SMILES that RDKit cannot parse,
            are returned unchanged.

            :param smiles: smiles string

            :return cleaned_smiles: SMILES of the chosen fragment, or the input
                string when nothing was done
        """

        # A single-fragment SMILES needs no work, so return before touching RDKit.
        if "." not in smiles:
            return smiles

        # Parse once and reuse the result: the fragment must come from the
        # molecule described by the `smiles` argument, not from unrelated
        # instance state.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return smiles

        rmv = rdMolStandardize.LargestFragmentChooser()
        return Chem.MolToSmiles(rmv.choose(mol))
    def test7Fragment(self):
        """Exercise FragmentRemover and LargestFragmentChooser, including CleanupParameters options."""

        def largest_fragment_smiles(smiles, **overrides):
            # Fresh parameters per case; overrides are applied in the order given.
            params = rdMolStandardize.CleanupParameters()
            for attr, value in overrides.items():
                setattr(params, attr, value)
            configured = rdMolStandardize.LargestFragmentChooser(params)
            picked = configured.choose(Chem.MolFromSmiles(smiles))
            return Chem.MolToSmiles(picked)

        # Default fragment removal.
        amine_salt = Chem.MolFromSmiles("CN(C)C.Cl.Cl.Br")
        stripped = rdMolStandardize.FragmentRemover().remove(amine_salt)
        self.assertEqual(Chem.MolToSmiles(stripped), "CN(C)C")

        # Keyword-configured choosers applied to the same ion pair.
        ion_pair = Chem.MolFromSmiles("[N+](=O)([O-])[O-].[CH3+]")
        default_pick = rdMolStandardize.LargestFragmentChooser().choose(ion_pair)
        self.assertEqual(Chem.MolToSmiles(default_pick), "O=[N+]([O-])[O-]")

        organic_pick = rdMolStandardize.LargestFragmentChooser(
            preferOrganic=True).choose(ion_pair)
        self.assertEqual(Chem.MolToSmiles(organic_pick), "[CH3+]")

        # With skip_if_all_match=True the atom count is expected to be unchanged.
        all_salts = Chem.MolFromSmiles("[Na+].Cl.Cl.Br")
        kept = rdMolStandardize.FragmentRemover(
            skip_if_all_match=True).remove(all_salts)
        self.assertEqual(kept.GetNumAtoms(), all_salts.GetNumAtoms())

        # Parameter-object-configured choosers: (input, overrides, expected).
        smi3 = "CNC[C@@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O.c1cc2c(cc1C(=O)O)oc(n2)c3cc(cc(c3)Cl)Cl"
        smi4 = "CC.O=[Pb]=O"
        sugar_amine = "CNC[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO"
        benzoxazole_acid = "O=C(O)c1ccc2nc(-c3cc(Cl)cc(Cl)c3)oc2c1"

        cases = [
            (smi3, {}, sugar_amine),
            (smi3, {"largestFragmentChooserCountHeavyAtomsOnly": True},
             benzoxazole_acid),
            (smi3, {"largestFragmentChooserUseAtomCount": False},
             benzoxazole_acid),
            (smi4, {}, "CC"),
            (smi4, {"largestFragmentChooserCountHeavyAtomsOnly": True},
             "O=[Pb]=O"),
            (smi4, {"largestFragmentChooserUseAtomCount": False},
             "O=[Pb]=O"),
            (smi4, {"largestFragmentChooserCountHeavyAtomsOnly": True,
                    "preferOrganic": True}, "CC"),
            (smi4, {"largestFragmentChooserUseAtomCount": False,
                    "preferOrganic": True}, "CC"),
        ]
        for smiles, overrides, expected in cases:
            self.assertEqual(largest_fragment_smiles(smiles, **overrides),
                             expected)
def main(prm_file):
    """Clean the molecules of an SD file and write them out in molecular-weight bins.

    Parameters
    ----------
    prm_file : str
        Path of the SD file to load. Everything before the first ``'.sdf'``
        in this string is used as the prefix of all output files.

    Steps: load the file, strip salts, keep one fragment per molecule (organic
    fragments preferred), regenerate SMILES, build ``ID`` as ``'CB_'`` plus the
    ``CB_ID`` column, shuffle the rows, compute ``MW`` from ``QED.properties``
    and write a ``.csv.bz2`` and a ``.smi`` file for each of the bins
    ``frag`` (150 < MW <= 300), ``lead`` (300 < MW <= 400), ``drug`` (MW > 400)
    and ``small`` (MW <= 150).
    """
    pref = prm_file.split('.sdf')[0]

    print('## Reading file...')
    prm_df = PandasTools.LoadSDF(prm_file,
                                 smilesName='SMILES',
                                 molColName='MOL',
                                 includeFingerprints=False)
    print(prm_df[:10])

    ## remove salts and rename the smiles
    print('## Cleaning moleucles...')
    remover = SaltRemover.SaltRemover()
    chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)

    prm_df['molx'] = prm_df.MOL.apply(remover.StripMol)
    prm_df['mol'] = prm_df.molx.apply(chooser.choose)
    prm_df['smiles'] = prm_df.mol.apply(Chem.MolToSmiles)

    def add_cb(inp):
        return 'CB_' + str(inp)

    prm_df['ID'] = prm_df.CB_ID.apply(add_cb)

    ## shuffle
    print('## Shuffling molecules...')
    # Keep the shuffled frame: assigning it to a separate, unused name would
    # leave every output file in the original order.
    prm_df = prm_df.sample(frac=1).reset_index(drop=True)

    print(prm_df[:10])

    ## recalculate molecular properties
    print('## Calculating properties...')
    prm_df['qed'] = prm_df.mol.apply(QED.properties)
    prm_df['MW'] = prm_df.qed.apply(lambda x: x.MW)
    print(prm_df[:10])
    print(' > number of molecules... ', len(prm_df))

    ## print out molecule properties and smiles (shuffled)
    print('## Writing results...')
    # Only ID, MW and smiles are produced in this function; the remaining
    # columns must already be present in the loaded frame.
    Cols_csv = [
        'ID', 'MW', 'HA', 'logP', 'LogS', 'HBA', 'HBD', 'PSA', 'ROTB', 'AROM',
        'SaltType', 'smiles'
    ]
    Cols_smi = ['smiles', 'ID']

    mw = prm_df.MW
    # (file label, row mask), written in this order.
    mw_bins = [
        ('frag', (mw > 150.) & (mw <= 300.)),
        ('lead', (mw > 300.) & (mw <= 400.)),
        ('drug', mw > 400.),
        ('small', mw <= 150.),
    ]
    for label, mask in mw_bins:
        subset = prm_df.loc[mask]
        subset.to_csv(pref + '.' + label + '.csv.bz2',
                      sep=',',
                      float_format='%.2f',
                      columns=Cols_csv,
                      index=False)
        subset.to_csv(pref + '.' + label + '.smi',
                      sep='\t',
                      columns=Cols_smi,
                      index=False)