Ejemplo n.º 1
0
    def read_targets(self):
        """
        Collect per-dataset labels from the SD data fields of each molecule.

        Only SD fields whose names match an entry of ``self.dataset_names``
        are used. The names of all other fields are gathered (once each, in
        first-seen order) and printed after every molecule has been read.

        Returns
        -------
        data : dict
            Nested dictionary keyed as data[dataset][SMILES]; each value is
            the list of integer targets found for that SMILES string.
        """
        smiles_engine = SmilesGenerator()
        data = {name: {} for name in self.dataset_names}
        ignored = []  # unmatched field names, in first-seen order
        for mol in self.read_data():
            smiles = smiles_engine.get_smiles(mol)
            for field in list(mol.GetPropNames()):
                if field not in data:
                    # irrelevant SD field: remember it once and move on
                    if field not in ignored:
                        ignored.append(field)
                    continue
                # convert first so an unparsable value adds no empty entry
                label = int(mol.GetProp(field))
                data[field].setdefault(smiles, []).append(label)
        print('Skipped properties:\n{}'.format('\n'.join(ignored)))
        return data
Ejemplo n.º 2
0
    def read_targets(self):
        """
        Read integer targets from SD data fields named after known datasets.

        Each molecule from ``self.read_data()`` is converted to SMILES, and
        every SD field whose name is one of ``self.dataset_names`` adds one
        integer target for that SMILES. Field names that match no dataset are
        reported on standard output once reading has finished.

        Returns
        -------
        data : dict
            Nested dictionary containing SMILES and targets for compounds in
            each dataset: data[dataset][SMILES] is a list of targets.
        """
        generator = SmilesGenerator()
        data = dict((dataset, {}) for dataset in self.dataset_names)
        unused_fields = []
        for mol in self.read_data():
            smiles = generator.get_smiles(mol)
            for name in list(mol.GetPropNames()):
                if name in data:
                    value = int(mol.GetProp(name))
                    per_dataset = data[name]
                    per_dataset.setdefault(smiles, []).append(value)
                elif name not in unused_fields:
                    # SD field unrelated to any dataset; record it once
                    unused_fields.append(name)
        report = 'Skipped properties:\n{}'.format('\n'.join(unused_fields))
        print(report)
        return data
Ejemplo n.º 3
0
class TestSmilesGenerator(SmilesTests):
    """
    Tests for SmilesGenerator.
    """
    def setUp(self):
        """
        Run the inherited fixture setup, then build a default generator.
        """
        super(TestSmilesGenerator, self).setUp()
        self.engine = SmilesGenerator()

    def test_get_smiles(self):
        """
        Generated SMILES parse back to a molecule with the same atom count.
        """
        for original in self.mols:
            roundtrip = Chem.MolFromSmiles(self.engine.get_smiles(original))
            assert roundtrip.GetNumAtoms() == original.GetNumAtoms()

    def test_get_smiles_3d(self):
        """
        SmilesGenerator.get_smiles assigns stereochemistry from 3D
        coordinates only when assign_stereo_from_3d is enabled.
        """
        def has_tetrahedral_tag(molecule):
            # True when at least one atom carries a CW/CCW tetrahedral tag
            tags = (Chem.ChiralType.CHI_TETRAHEDRAL_CW,
                    Chem.ChiralType.CHI_TETRAHEDRAL_CCW)
            return any(atom.GetChiralTag() in tags
                       for atom in molecule.GetAtoms())

        # generate conformers for ibuprofen
        mol = conformers.ConformerGenerator().generate_conformers(
            self.mols[1])
        assert mol.GetNumConformers() > 0

        # the default engine must leave the molecule without chirality
        flat_smiles = self.engine.get_smiles(mol)
        assert '@' not in flat_smiles  # no chirality marker expected
        assert not has_tetrahedral_tag(mol)

        # regenerate SMILES with stereo perception from 3D enabled
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        stereo_smiles = self.engine.get_smiles(mol)
        assert '@' in stereo_smiles  # chirality marker expected
        reparsed = Chem.MolFromSmiles(stereo_smiles)
        assert reparsed.GetNumAtoms() == self.mols[1].GetNumAtoms()

        # chirality must now also be recorded on the ibuprofen atoms
        assert has_tetrahedral_tag(mol)
Ejemplo n.º 4
0
class TestSmilesGenerator(SmilesTests):
    """
    Exercise SmilesGenerator on the shared test molecules.
    """
    def setUp(self):
        """
        Prepare the inherited fixtures and a default SmilesGenerator.
        """
        super(TestSmilesGenerator, self).setUp()
        self.engine = SmilesGenerator()

    def test_get_smiles(self):
        """
        SmilesGenerator.get_smiles output round-trips with an unchanged
        number of atoms.
        """
        for mol in self.mols:
            generated = self.engine.get_smiles(mol)
            parsed = Chem.MolFromSmiles(generated)
            assert parsed.GetNumAtoms() == mol.GetNumAtoms()

    def test_get_smiles_3d(self):
        """
        Stereochemistry appears in the SMILES (and on the atoms) only after
        switching to an engine created with assign_stereo_from_3d=True.
        """
        tetrahedral = [Chem.ChiralType.CHI_TETRAHEDRAL_CW,
                       Chem.ChiralType.CHI_TETRAHEDRAL_CCW]

        def count_chiral_atoms(molecule):
            # number of atoms currently tagged as tetrahedral stereocenters
            return len([atom for atom in molecule.GetAtoms()
                        if atom.GetChiralTag() in tetrahedral])

        # embed ibuprofen in 3D
        conformer_engine = conformers.ConformerGenerator()
        mol = conformer_engine.generate_conformers(self.mols[1])
        assert mol.GetNumConformers() > 0

        # default engine: no chirality marker and no chiral tags
        before = self.engine.get_smiles(mol)
        assert '@' not in before
        assert count_chiral_atoms(mol) == 0

        # stereo-aware engine: chirality marker present, atom count intact
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        after = self.engine.get_smiles(mol)
        assert '@' in after
        rebuilt = Chem.MolFromSmiles(after)
        assert rebuilt.GetNumAtoms() == self.mols[1].GetNumAtoms()

        # ibuprofen itself now carries at least one chiral tag
        assert count_chiral_atoms(mol) > 0
    def setUp(self):
        """
        Set up tests.

        Builds three named molecules (aspirin, ibuprofen, celecoxib), writes
        the ones labelled 1 in ``self.y`` to a temporary actives file and the
        rest to a temporary decoys file (one "SMILES<TAB>name" line each),
        reserves a temporary output file, and stores reference SMILES for
        every molecule in ``self.smiles``.
        """
        import os  # local import: only needed to close mkstemp descriptors

        smiles = ['CC(=O)OC1=CC=CC=C1C(=O)O', 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
                  'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F']
        names = ['aspirin', 'ibuprofen', 'celecoxib']
        self.y = [0, 1, 0]
        self.mols = []
        for s, n in zip(smiles, names):
            mol = Chem.MolFromSmiles(s)
            mol.SetProp('_Name', n)
            self.mols.append(mol)

        # write active and decoy files
        self.temp_dir = tempfile.mkdtemp()
        # mkstemp hands back an already-open OS-level descriptor; close it
        # right away so it does not leak for the lifetime of the test run
        fd, self.active_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                    suffix='.smi')
        os.close(fd)
        fd, self.decoy_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                   suffix='.smi')
        os.close(fd)
        # context managers guarantee both files are closed even if a write
        # fails part-way through
        with open(self.active_filename, 'wb') as active:
            with open(self.decoy_filename, 'wb') as decoy:
                for this_smiles, name, y in zip(smiles, names, self.y):
                    data = '{}\t{}\n'.format(this_smiles, name)
                    if y:
                        active.write(data)
                    else:
                        decoy.write(data)
        fd, self.output_filename = tempfile.mkstemp(dir=self.temp_dir)
        os.close(fd)

        # get SMILES
        self.engine = SmilesGenerator()
        self.smiles = [self.engine.get_smiles(mol) for mol in self.mols]
    def test_stereo_to_3d(self):
        """
        Test main with --stereo-from-3d.

        The actives file is rewritten as an SDF holding 3D conformers. With
        default arguments no output SMILES may contain a chirality marker;
        with --stereo-from-3d at least one must.
        """
        import os  # local import: only needed to close the mkstemp descriptor

        # generate conformers for ibuprofen
        engine = conformers.ConformerGenerator()
        self.mols[1] = engine.generate_conformers(self.mols[1])
        assert self.mols[1].GetNumConformers() > 0

        # rewrite actives file with 3D coordinates
        fd, self.active_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                    suffix='.sdf')
        os.close(fd)  # mkstemp returns an open descriptor; do not leak it
        with serial.MolWriter().open(self.active_filename) as writer:
            for mol, y in zip(self.mols, self.y):
                if y:
                    # write the molecule being iterated rather than a
                    # hard-coded index, so every active is written correctly
                    writer.write([mol])

        # check for absence of chirality using default arguments
        smiles, _ = self.check_output([
            '-a', self.active_filename, '-d', self.decoy_filename, '-o',
            self.output_filename
        ])
        assert not any('@' in this_smiles for this_smiles in smiles)

        # update reference SMILES
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        self.smiles[1] = self.engine.get_smiles(self.mols[1])

        # check for presence of chirality using --stereo-from-3d
        smiles, _ = self.check_output([
            '-a', self.active_filename, '-d', self.decoy_filename, '-o',
            self.output_filename, '--stereo-from-3d'
        ])
        assert any('@' in this_smiles for this_smiles in smiles)
Ejemplo n.º 7
0
    def initialize(self):
        """
        Create the temporary configuration file and the SMILES engine.

        Kept separate from __init__ because doing this work there breaks
        IPython.parallel.
        """
        descriptor, path = tempfile.mkstemp()
        self.config_filename = path
        # only the path is needed; the file is reopened by name below
        os.close(descriptor)
        with open(path, "wb") as config_file:
            config_file.write(self.get_config())
        self.smiles_engine = SmilesGenerator(**self.smiles_engine_kwargs)
        self.initialized = True
    def setUp(self):
        """
        Set up tests.

        Creates the three test molecules, splits their SMILES into temporary
        active (label 1) and decoy (label 0) files, reserves a temporary
        output file, and computes reference SMILES with a default
        SmilesGenerator.
        """
        import os  # local import: only needed to close mkstemp descriptors

        smiles = [
            'CC(=O)OC1=CC=CC=C1C(=O)O', 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
            'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F'
        ]
        names = ['aspirin', 'ibuprofen', 'celecoxib']
        self.y = [0, 1, 0]
        self.mols = []
        for s, n in zip(smiles, names):
            mol = Chem.MolFromSmiles(s)
            mol.SetProp('_Name', n)
            self.mols.append(mol)

        # write active and decoy files
        self.temp_dir = tempfile.mkdtemp()
        # each mkstemp call returns an open OS-level descriptor that must be
        # closed explicitly; only the generated path is used afterwards
        fd, self.active_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                    suffix='.smi')
        os.close(fd)
        fd, self.decoy_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                   suffix='.smi')
        os.close(fd)
        # use context managers so the files are closed on any error path
        with open(self.active_filename, 'wb') as active:
            with open(self.decoy_filename, 'wb') as decoy:
                for this_smiles, name, y in zip(smiles, names, self.y):
                    data = '{}\t{}\n'.format(this_smiles, name)
                    if y:
                        active.write(data)
                    else:
                        decoy.write(data)
        fd, self.output_filename = tempfile.mkstemp(dir=self.temp_dir)
        os.close(fd)

        # get SMILES
        self.engine = SmilesGenerator()
        self.smiles = [self.engine.get_smiles(mol) for mol in self.mols]
    def test_stereo_to_3d(self):
        """
        Test main with --stereo-from-3d.

        Writes the active molecules with 3D coordinates to an SDF file and
        checks that chirality markers appear in the output SMILES only when
        --stereo-from-3d is passed.
        """
        import os  # local import: only needed to close the mkstemp descriptor

        # generate conformers for ibuprofen
        engine = conformers.ConformerGenerator()
        self.mols[1] = engine.generate_conformers(self.mols[1])
        assert self.mols[1].GetNumConformers() > 0

        # rewrite actives file with 3D coordinates
        fd, self.active_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                    suffix='.sdf')
        os.close(fd)  # mkstemp returns an open descriptor; do not leak it
        with serial.MolWriter().open(self.active_filename) as writer:
            for mol, y in zip(self.mols, self.y):
                if y:
                    # use the loop variable instead of a fixed index so the
                    # written molecule always matches its label
                    writer.write([mol])

        # check for absence of chirality using default arguments
        smiles, _ = self.check_output(
            ['-a', self.active_filename, '-d', self.decoy_filename, '-o',
             self.output_filename])
        assert not any('@' in this_smiles for this_smiles in smiles)

        # update reference SMILES
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        self.smiles[1] = self.engine.get_smiles(self.mols[1])

        # check for presence of chirality using --stereo-from-3d
        smiles, _ = self.check_output(
            ['-a', self.active_filename, '-d', self.decoy_filename, '-o',
             self.output_filename, '--stereo-from-3d'])
        assert any('@' in this_smiles for this_smiles in smiles)
Ejemplo n.º 10
0
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, include_smiles=False, scaffolds=False,
         chiral_scaffolds=False, mol_id_prefix=None):
    """
    Featurize the molecules in input_filename and write the results.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class; instantiated with featurizer_kwargs.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename, passed to write_output_file. For names ending in
        .csv or .csv.gz, multi-dimensional features are flattened to one
        space-separated string per molecule.
    target_filename : str, optional
        Pickle containing target values. Either array_like with one entry
        per molecule, or a dict with 'mol_id' and 'y' keys that is matched
        against the molecules read from input_filename.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to the featurizer constructor.
    parallel : bool, optional (default False)
        Passed to featurizer.featurize; presumably enables featurization
        with IPython.parallel.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level handed to write_output_file.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    include_smiles : bool, optional (default False)
        Include SMILES in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    mol_id_prefix : str, optional
        Prefix for molecule IDs.
    """
    mols, mol_ids = read_mols(input_filename, mol_id_prefix=mol_id_prefix)

    # attach targets, aligning them with the molecules when IDs are given
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if not isinstance(targets, dict):
            assert len(targets) == len(mols)
        else:
            mol_indices, target_indices = collate_mols(
                mols, mol_ids, targets['y'], targets['mol_id'])
            mols = mols[mol_indices]
            mol_ids = mol_ids[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        data['y'] = targets

    # run the featurizer
    print("Featurizing molecules...")
    init_kwargs = {} if featurizer_kwargs is None else featurizer_kwargs
    featurizer = featurizer_class(**init_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs, view_flags)

    # collect the output columns
    print("Saving results...")
    data['mol_id'] = mol_ids
    data['features'] = features

    # every column must have exactly one row per molecule
    n_mols = len(mols)
    assert data['features'].shape[0] == n_mols, (
        "Features do not match molecules.")
    assert data['mol_id'].shape[0] == n_mols, (
        "Molecule IDs do not match molecules.")

    # optional SMILES and scaffold columns
    if include_smiles:
        smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
        data['smiles'] = np.asarray([smiles.get_smiles(mol) for mol in mols])
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)

    # multi-dimensional features become one DataFrame cell per molecule
    try:
        if data['features'].ndim > 1:
            if output_filename.endswith(('.csv', '.csv.gz')):
                # numpy arrays are "summarized" when written as strings, so
                # go through a list; [1:-1] strips the surrounding brackets
                # and commas are dropped (spaces kept) to avoid clashing
                # with the csv delimiter
                data['features'] = [
                    str(row.tolist())[1:-1].replace(', ', ' ')
                    for row in data['features']]
            else:
                data['features'] = list(data['features'])
    except AttributeError:
        pass
    df = pd.DataFrame(data)

    # write output file
    write_output_file(df, output_filename, compression_level)
class TestClassificationTargets(unittest.TestCase):
    """
    Tests for classification_targets.py.
    """
    def setUp(self):
        """
        Set up tests.

        Builds three named molecules, writes the active (label 1) and decoy
        (label 0) SMILES to separate temporary files, reserves a temporary
        output file, and stores reference SMILES in ``self.smiles``.
        """
        import os  # local import: only needed to close mkstemp descriptors

        smiles = ['CC(=O)OC1=CC=CC=C1C(=O)O', 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
                  'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F']
        names = ['aspirin', 'ibuprofen', 'celecoxib']
        self.y = [0, 1, 0]
        self.mols = []
        for s, n in zip(smiles, names):
            mol = Chem.MolFromSmiles(s)
            mol.SetProp('_Name', n)
            self.mols.append(mol)

        # write active and decoy files
        self.temp_dir = tempfile.mkdtemp()
        # mkstemp returns an already-open OS-level descriptor; close it so it
        # does not leak (an open handle can also block rmtree in tearDown on
        # some platforms)
        fd, self.active_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                    suffix='.smi')
        os.close(fd)
        fd, self.decoy_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                   suffix='.smi')
        os.close(fd)
        # context managers close both files even if a write fails
        with open(self.active_filename, 'wb') as active:
            with open(self.decoy_filename, 'wb') as decoy:
                for this_smiles, name, y in zip(smiles, names, self.y):
                    data = '{}\t{}\n'.format(this_smiles, name)
                    if y:
                        active.write(data)
                    else:
                        decoy.write(data)
        fd, self.output_filename = tempfile.mkstemp(dir=self.temp_dir)
        os.close(fd)

        # get SMILES
        self.engine = SmilesGenerator()
        self.smiles = [self.engine.get_smiles(mol) for mol in self.mols]

    def tearDown(self):
        """
        Clean up tests by removing the temporary directory.
        """
        shutil.rmtree(self.temp_dir)

    def check_output(self, input_args):
        """
        Run main and check its output against the reference data.

        Parameters
        ----------
        input_args : list
            Command-line arguments.

        Returns
        -------
        smiles, targets
            The 'smiles' and 'targets' entries read from the output pickle.
        """
        args = parse_args(input_args)
        main(args.actives, args.decoys, args.output, args.stereo_from_3d)
        data = read_pickle(self.output_filename)
        for smiles, target in zip(data['smiles'], data['targets']):
            # every output SMILES must be known and carry its original label
            assert smiles in self.smiles
            assert target == self.y[self.smiles.index(smiles)]
        return data['smiles'], data['targets']

    def test_defaults(self):
        """
        Test main with default parameters.
        """
        args = ['-a', self.active_filename, '-d', self.decoy_filename, '-o',
                self.output_filename]
        self.check_output(args)

    def test_stereo_to_3d(self):
        """
        Test main with --stereo-from-3d.

        Chirality markers must be absent from the output SMILES with default
        arguments and present when --stereo-from-3d is given.
        """
        import os  # local import: only needed to close the mkstemp descriptor

        # generate conformers for ibuprofen
        engine = conformers.ConformerGenerator()
        self.mols[1] = engine.generate_conformers(self.mols[1])
        assert self.mols[1].GetNumConformers() > 0

        # rewrite actives file with 3D coordinates
        fd, self.active_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                    suffix='.sdf')
        os.close(fd)  # mkstemp returns an open descriptor; do not leak it
        with serial.MolWriter().open(self.active_filename) as writer:
            for mol, y in zip(self.mols, self.y):
                if y:
                    # write the molecule being iterated, not a fixed index
                    writer.write([mol])

        # check for absence of chirality using default arguments
        smiles, _ = self.check_output(
            ['-a', self.active_filename, '-d', self.decoy_filename, '-o',
             self.output_filename])
        assert not any('@' in this_smiles for this_smiles in smiles)

        # update reference SMILES
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        self.smiles[1] = self.engine.get_smiles(self.mols[1])

        # check for presence of chirality using --stereo-from-3d
        smiles, _ = self.check_output(
            ['-a', self.active_filename, '-d', self.decoy_filename, '-o',
             self.output_filename, '--stereo-from-3d'])
        assert any('@' in this_smiles for this_smiles in smiles)
Ejemplo n.º 12
0
 def setUp(self):
     """
     Set up tests.

     Runs the parent class setUp first, then stores a SmilesGenerator built
     with default arguments as ``self.engine``.
     """
     super(TestSmilesGenerator, self).setUp()
     self.engine = SmilesGenerator()
Ejemplo n.º 13
0
class MoleculeDatabase(object):
    """
    Molecule database.

    Molecules are keyed by SMILES.

    Parameters
    ----------
    kwargs : dict, optional
        Keyword arguments for SmilesGenerator.
    """
    def __init__(self, **kwargs):
        self.engine = SmilesGenerator(**kwargs)
        self.smiles = set()

    def __len__(self):
        """Return the number of distinct SMILES strings stored."""
        return len(self.smiles)

    def __iter__(self):
        """Iterate over the stored SMILES strings (in set order)."""
        for smiles in self.smiles:
            yield smiles

    def __contains__(self, item):
        """Return True if the SMILES string ``item`` is in the database."""
        return item in self.smiles

    def add_mol(self, mol):
        """
        Add a molecule to the database.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        self.smiles.add(self.engine.get_smiles(mol))

    def load(self, filename):
        """
        Load an existing database.

        Each line of the file is read as one SMILES string and added to the
        database after checking that RDKit can parse it.

        Parameters
        ----------
        filename : str
            Existing database filename. Names ending in '.gz' are read
            through gzip.

        Raises
        ------
        ValueError
            If a line cannot be parsed as SMILES.
        """
        if filename.endswith('.gz'):
            f = gzip.open(filename)
        else:
            f = open(filename)
        # try/finally so the handle is released even when an unreadable
        # line aborts loading with ValueError
        try:
            for line in f:
                smiles = line.strip()
                mol = Chem.MolFromSmiles(smiles)  # sanity check
                if mol is None:
                    raise ValueError(
                        'Database is unreadable: "{}".'.format(smiles))
                self.smiles.add(smiles)
        finally:
            f.close()

    def save(self, filename):
        """
        Save the database to disk, one SMILES string per line.

        Parameters
        ----------
        filename : str
            Filename. Names ending in '.gz' are written through gzip.
        """
        if filename.endswith('.gz'):
            f = gzip.open(filename, 'wb')
        else:
            f = open(filename, 'wb')
        # try/finally so the handle is closed (and flushed) on write errors
        try:
            for smiles in self.smiles:
                f.write('{}\n'.format(smiles))
        finally:
            f.close()
class TestClassificationTargets(unittest.TestCase):
    """
    Tests for classification_targets.py.
    """
    def setUp(self):
        """
        Set up tests.

        Creates the three test molecules, splits their SMILES into temporary
        active (label 1) and decoy (label 0) files, reserves a temporary
        output file, and computes reference SMILES with a default
        SmilesGenerator.
        """
        import os  # local import: only needed to close mkstemp descriptors

        smiles = [
            'CC(=O)OC1=CC=CC=C1C(=O)O', 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
            'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F'
        ]
        names = ['aspirin', 'ibuprofen', 'celecoxib']
        self.y = [0, 1, 0]
        self.mols = []
        for s, n in zip(smiles, names):
            mol = Chem.MolFromSmiles(s)
            mol.SetProp('_Name', n)
            self.mols.append(mol)

        # write active and decoy files
        self.temp_dir = tempfile.mkdtemp()
        # each mkstemp call returns an open OS-level descriptor that must be
        # closed explicitly; only the generated path is used afterwards
        fd, self.active_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                    suffix='.smi')
        os.close(fd)
        fd, self.decoy_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                   suffix='.smi')
        os.close(fd)
        # use context managers so the files are closed on any error path
        with open(self.active_filename, 'wb') as active:
            with open(self.decoy_filename, 'wb') as decoy:
                for this_smiles, name, y in zip(smiles, names, self.y):
                    data = '{}\t{}\n'.format(this_smiles, name)
                    if y:
                        active.write(data)
                    else:
                        decoy.write(data)
        fd, self.output_filename = tempfile.mkstemp(dir=self.temp_dir)
        os.close(fd)

        # get SMILES
        self.engine = SmilesGenerator()
        self.smiles = [self.engine.get_smiles(mol) for mol in self.mols]

    def tearDown(self):
        """
        Clean up tests by removing the temporary directory.
        """
        shutil.rmtree(self.temp_dir)

    def check_output(self, input_args):
        """
        Run main and check its output against the reference data.

        Parameters
        ----------
        input_args : list
            Command-line arguments.

        Returns
        -------
        smiles, targets
            The 'smiles' and 'targets' entries read from the output pickle.
        """
        args = parse_args(input_args)
        main(args.actives, args.decoys, args.output, args.stereo_from_3d)
        data = read_pickle(self.output_filename)
        for smiles, target in zip(data['smiles'], data['targets']):
            # every output SMILES must be known and carry its original label
            assert smiles in self.smiles
            assert target == self.y[self.smiles.index(smiles)]
        return data['smiles'], data['targets']

    def test_defaults(self):
        """
        Test main with default parameters.
        """
        args = [
            '-a', self.active_filename, '-d', self.decoy_filename, '-o',
            self.output_filename
        ]
        self.check_output(args)

    def test_stereo_to_3d(self):
        """
        Test main with --stereo-from-3d.

        Chirality markers must be absent from the output SMILES with default
        arguments and present when --stereo-from-3d is given.
        """
        import os  # local import: only needed to close the mkstemp descriptor

        # generate conformers for ibuprofen
        engine = conformers.ConformerGenerator()
        self.mols[1] = engine.generate_conformers(self.mols[1])
        assert self.mols[1].GetNumConformers() > 0

        # rewrite actives file with 3D coordinates
        fd, self.active_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                    suffix='.sdf')
        os.close(fd)  # mkstemp returns an open descriptor; do not leak it
        with serial.MolWriter().open(self.active_filename) as writer:
            for mol, y in zip(self.mols, self.y):
                if y:
                    # write the molecule being iterated, not a fixed index
                    writer.write([mol])

        # check for absence of chirality using default arguments
        smiles, _ = self.check_output([
            '-a', self.active_filename, '-d', self.decoy_filename, '-o',
            self.output_filename
        ])
        assert not any('@' in this_smiles for this_smiles in smiles)

        # update reference SMILES
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        self.smiles[1] = self.engine.get_smiles(self.mols[1])

        # check for presence of chirality using --stereo-from-3d
        smiles, _ = self.check_output([
            '-a', self.active_filename, '-d', self.decoy_filename, '-o',
            self.output_filename, '--stereo-from-3d'
        ])
        assert any('@' in this_smiles for this_smiles in smiles)
Ejemplo n.º 15
0
 def __init__(self, **kwargs):
     """
     Create an empty collection of SMILES strings.

     Parameters
     ----------
     kwargs : dict, optional
         Keyword arguments forwarded to SmilesGenerator.
     """
     self.engine = SmilesGenerator(**kwargs)
     self.smiles = set()
Ejemplo n.º 16
0
 def setUp(self):
     """
     Set up tests.

     Calls the parent class setUp, then creates the default SmilesGenerator
     used by the tests as ``self.engine``.
     """
     super(TestSmilesGenerator, self).setUp()
     self.engine = SmilesGenerator()
Ejemplo n.º 17
0
class Dragon(object):
    """
    Wrapper for dragon6shell.

    Molecules are converted to SMILES, piped to the ``dragon6shell``
    executable, and the descriptor table it prints is parsed into arrays.

    Parameters
    ----------
    subset : str, optional (default '2d')
        Descriptor subset. Only '2d' has a configuration defined here.
    kwargs : dict, optional
        Keyword arguments for SmilesGenerator.
    """

    def __init__(self, subset="2d", **kwargs):
        self.subset = subset
        # set by initialize(), which runs lazily on first use
        self.initialized = False
        self.config_filename, self.smiles_engine = None, None
        self.smiles_engine_kwargs = kwargs

    def initialize(self):
        """
        Initialize.

        Writes the configuration returned by get_config to a temporary file
        and creates the SMILES engine.

        This is not part of __init__ because it breaks IPython.parallel.
        """
        fd, self.config_filename = tempfile.mkstemp()
        # only the path is needed; the file is reopened by name below
        os.close(fd)
        with open(self.config_filename, "wb") as f:
            f.write(self.get_config())
        self.smiles_engine = SmilesGenerator(**self.smiles_engine_kwargs)
        self.initialized = True

    def __del__(self):
        """
        Cleanup.

        Removes the temporary configuration file if one was created.
        """
        if self.config_filename is not None:
            os.unlink(self.config_filename)

    def get_config(self):
        """
        Get configuration file.

        Returns
        -------
        config : str
            XML configuration for dragon6shell. It reads SMILES from stdin
            and writes results to stdout.

        Raises
        ------
        NotImplementedError
            If ``self.subset`` is anything other than '2d'.
        """
        if self.subset == "2d":
            return """<?xml version="1.0" encoding="utf-8"?>
<DRAGON version="6.0.36" script_version="1" generation_date="2014/11/17">
  <OPTIONS>
    <CheckUpdates value="true"/>
    <SaveLayout value="true"/>
    <ShowWorksheet value="false"/>
    <Decimal_Separator value="."/>
    <Missing_String value="NaN"/>
    <DefaultMolFormat value="1"/>
    <HelpBrowser value="/usr/bin/xdg-open"/>
    <RejectUnusualValence value="false"/>
    <Add2DHydrogens value="false"/>
    <MaxSRforAllCircuit value="19"/>
    <MaxSR value="35"/>
    <MaxSRDetour value="30"/>
    <MaxAtomWalkPath value="2000"/>
    <LogPathWalk value="true"/>
    <LogEdge value="true"/>
    <Weights>
      <weight name="Mass"/>
      <weight name="VdWVolume"/>
      <weight name="Electronegativity"/>
      <weight name="Polarizability"/>
      <weight name="Ionization"/>
      <weight name="I-State"/>
    </Weights>
    <SaveOnlyData value="false"/>
    <SaveLabelsOnSeparateFile value="false"/>
    <SaveFormatBlock value="%b - %n.txt"/>
    <SaveFormatSubBlock value="%b-%s - %n - %m.txt"/>
    <SaveExcludeMisVal value="false"/>
    <SaveExcludeAllMisVal value="false"/>
    <SaveExcludeConst value="false"/>
    <SaveExcludeNearConst value="false"/>
    <SaveExcludeStdDev value="false"/>
    <SaveStdDevThreshold value="0.0001"/>
    <SaveExcludeCorrelated value="false"/>
    <SaveCorrThreshold value="0.95"/>
    <SaveExclusionOptionsToVariables value="false"/>
    <SaveExcludeMisMolecules value="false"/>
    <SaveExcludeRejectedMolecules value="false"/>
  </OPTIONS>
  <DESCRIPTORS>
    <block id="1" SelectAll="true"/>
    <block id="2" SelectAll="true"/>
    <block id="3" SelectAll="true"/>
    <block id="4" SelectAll="true"/>
    <block id="5" SelectAll="true"/>
    <block id="6" SelectAll="true"/>
    <block id="7" SelectAll="true"/>
    <block id="8" SelectAll="true"/>
    <block id="9" SelectAll="true"/>
    <block id="10" SelectAll="true"/>
    <block id="11" SelectAll="true"/>
    <block id="12" SelectAll="true"/>
    <block id="21" SelectAll="true"/>
    <block id="22" SelectAll="true"/>
    <block id="23" SelectAll="true"/>
    <block id="24" SelectAll="true"/>
    <block id="25" SelectAll="true"/>
    <block id="28" SelectAll="true"/>
    <block id="29" SelectAll="true"/>
  </DESCRIPTORS>
  <MOLFILES>
    <molInput value="stdin"/>
    <molInputFormat value="SMILES"/>
  </MOLFILES>
  <OUTPUT>
    <SaveStdOut value="true"/>
    <SaveProject value="false"/>
    <SaveFile value="false"/>
    <logMode value="stderr"/>
  </OUTPUT>
</DRAGON>
"""
        else:
            raise NotImplementedError

    def get_descriptors(self, mols):
        """
        Calculate Dragon descriptors for a set of molecules.

        Parameters
        ----------
        mols : array_like
            Molecules.

        Returns
        -------
        features : ndarray
            Object array with one entry per molecule: the descriptor row
            for that molecule, or None when its SMILES is absent from the
            dragon6shell output.

        Raises
        ------
        RuntimeError
            If dragon6shell produces no standard output; the message is its
            standard error.
        """
        if not self.initialized:
            self.initialize()
        smiles = [self.smiles_engine.get_smiles(mol) for mol in mols]
        args = ["dragon6shell", "-s", self.config_filename]
        p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # one SMILES per line on stdin
        stdout, stderr = p.communicate("\n".join(smiles))
        if not stdout:
            raise RuntimeError(stderr)
        data, names = self.parse_descriptors(stdout)

        # adjust for skipped molecules
        # descriptors are in same order as smiles
        missing = np.setdiff1d(smiles, names)
        features = np.zeros(len(smiles), dtype=object)
        idx = 0  # index into calculated features
        for i, this_smiles in enumerate(smiles):
            if this_smiles in missing:
                features[i] = None
            else:
                assert this_smiles == names[idx]  # confirm match
                features[i] = data[idx]
                idx += 1
        assert len(features) == len(mols)
        return features

    def parse_descriptors(self, string):
        """
        Parse Dragon descriptors.

        Parameters
        ----------
        string : str
            Output from dragon6shell.

        Returns
        -------
        data : ndarray
            Descriptor values as floats, one row per molecule in the output.
        names : ndarray
            Values of the NAME column, in the same row order as ``data``.
        """
        df = pd.read_table(StringIO(string))
        # these three columns are dropped for the 2d subset
        # NOTE(review): reason not visible here -- confirm with the author
        if self.subset == "2d":
            del df["nHBonds"], df["Psi_e_1d"], df["Psi_e_1s"]

        # extract names
        names = df["NAME"].values

        # delete No. and NAME columns
        del df["No."], df["NAME"]

        return np.asarray(df, dtype=float), names
Ejemplo n.º 18
0
def main(featurizer_class,
         input_filename,
         output_filename,
         target_filename=None,
         featurizer_kwargs=None,
         parallel=False,
         client_kwargs=None,
         view_flags=None,
         compression_level=3,
         smiles_hydrogens=False,
         include_smiles=False,
         scaffolds=False,
         chiral_scaffolds=False,
         mol_id_prefix=None):
    """
    Featurize molecules in input_filename using the given featurizer.

    Molecules are read, optionally matched against target values,
    featurized, collected into a pandas DataFrame, and written to
    output_filename.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. Should end with .pkl or .pkl.gz. When it ends with
        .csv or .csv.gz, multidimensional feature rows are stored as
        space-separated strings.
    target_filename : str, optional
        Pickle containing target values. Should either be array_like or a dict
        containing 'mol_id' and 'y' keys, corresponding to molecule IDs and
        target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional
        Whether to featurize in parallel using IPython.parallel
        (default False). Passed through to the featurizer.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    include_smiles : bool, optional (default False)
        Include SMILES in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    mol_id_prefix : str, optional
        Prefix for molecule IDs.
    """
    mols, mol_ids = read_mols(input_filename, mol_id_prefix=mol_id_prefix)

    # get targets
    # NOTE(review): the target file is unpickled; presumably only trusted
    # files should be supplied here.
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # keep only molecules and targets that can be paired up
            mol_indices, target_indices = collate_mols(mols, mol_ids,
                                                       targets['y'],
                                                       targets['mol_id'])
            mols = mols[mol_indices]
            mol_ids = mol_ids[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols)
        data['y'] = targets

    # featurize molecules
    # print(...) with a single string behaves the same on Python 2 and 3
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs, view_flags)

    # fill in data container
    print("Saving results...")
    data['mol_id'] = mol_ids
    data['features'] = features

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['mol_id'].shape[0] == len(mols), (
        "Molecule IDs do not match molecules.")

    # smiles, scaffolds, args
    if include_smiles:
        smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
        data['smiles'] = np.asarray([smiles.get_smiles(mol) for mol in mols])
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)

    # construct a DataFrame
    # Features without an ndim attribute are left untouched; only the
    # attribute lookup is guarded so that errors raised while converting
    # rows are not silently discarded.
    if getattr(data['features'], 'ndim', 1) > 1:
        if output_filename.endswith(('.csv', '.csv.gz')):
            # numpy arrays will be "summarized" when written as strings
            # use str(row.tolist())[1:-1] to remove the surrounding brackets
            # remove commas (keeping spaces) to avoid conflicts with csv
            data['features'] = [
                str(row.tolist())[1:-1].replace(', ', ' ')
                for row in data['features']
            ]
        else:
            # one array per molecule so each fits in a single DataFrame cell
            data['features'] = list(data['features'])
    df = pd.DataFrame(data)

    # write output file
    write_output_file(df, output_filename, compression_level)
Ejemplo n.º 19
0
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, names=False, scaffolds=False,
         chiral_scaffolds=False):
    """
    Featurize molecules in input_filename using the given featurizer.

    Molecules are read, optionally matched against target values,
    featurized, and written to output_filename together with their SMILES
    and a record of the arguments used.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. Should end with .pkl or .pkl.gz.
    target_filename : str, optional
        Pickle containing target values. Should either be array_like or a dict
        containing 'names' and 'y' keys, corresponding to molecule names and
        target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional
        Whether to featurize in parallel using IPython.parallel
        (default False). Passed through to the featurizer.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    names : bool, optional (default False)
        Whether to include molecule names in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    """
    mols, mol_names = read_mols(input_filename)

    # get targets
    # NOTE(review): the target file is unpickled; presumably only trusted
    # files should be supplied here.
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # keep only molecules and targets that can be paired up
            mol_indices, target_indices = collate_mols(
                mols, mol_names, targets['y'], targets['names'])
            mols = mols[mol_indices]
            mol_names = mol_names[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols)
        data['y'] = targets

    # featurize molecules
    # print(...) with a single string behaves the same on Python 2 and 3
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs, view_flags)

    # fill in data container
    print("Saving results...")
    data['features'] = features

    # calculate SMILES
    smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
    data['smiles'] = np.asarray([smiles.get_smiles(mol) for mol in mols])

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['smiles'].shape[0] == len(mols), (
        "SMILES do not match molecules.")

    # names, scaffolds, args
    if names:
        data['names'] = mol_names
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)
    # record how the features were produced alongside the results
    data['args'] = {'featurizer_class': featurizer_class.__name__,
                    'input_filename': input_filename,
                    'target_filename': target_filename,
                    'featurizer_kwargs': featurizer_kwargs,
                    'chiral_scaffolds': chiral_scaffolds}

    # write output file
    write_output_file(data, output_filename, compression_level)