Beispiel #1
0
 def string(self, val):
     """Load the molecule from ``val`` and store its SMILES.

     ``val`` must not be the path of an existing file; the error message
     asks for a SMILES or InChI string instead. The stored string is the
     SMILES generated from the loaded molecule, not ``val`` itself.

     Raises:
         ValueError: if ``val`` names an existing filesystem path.
     """
     # Reject filenames up front so the happy path reads straight through.
     if _os.path.exists(val):
         raise ValueError(
             "Need a SMILES or InChI string instead of a filename")

     self._molecule = _rdkit.openAsRdkit(val, minimise=self.minimise)
     self._string = _rdmolfiles.MolToSmiles(self._molecule)
def test_sdf_file_parser_target_index(sdf_file, test_mols):
    """Parsing with ``target_index`` keeps only the selected molecules.

    Checks the feature arrays, the 'Fitness' labels and the returned SMILES
    array against values computed directly from ``test_mols``.
    """
    selected = [0, 2]
    preprocessor = EGCNPreprocessor(max_atoms=49, out_size=49)
    parser = SDFFileParser(preprocessor, labels='Fitness')
    parsed = parser.parse(sdf_file, return_smiles=True, target_index=selected)
    dataset, smiles = parsed['dataset'], parsed['smiles']
    assert len(dataset) == 4

    # Every entry except the last one holds input features; compare each
    # selected example against a freshly computed reference.
    n_features = len(dataset) - 1
    for feat_idx in range(n_features):
        for row, mol_idx in enumerate(selected):
            reference = preprocessor.get_input_feats(test_mols[mol_idx])
            np.testing.assert_array_almost_equal(
                dataset[feat_idx][row], reference[feat_idx], decimal=3)

    # The entry at index 3 holds the parsed labels.
    expected_labels = np.array([
        preprocessor.get_labels(test_mols[mol_idx], 'Fitness')
        for mol_idx in selected
    ])
    np.testing.assert_array_almost_equal(
        dataset[3], expected_labels, decimal=3)

    # The SMILES come back as a 1-D array aligned with the dataset rows.
    assert type(smiles) is np.ndarray
    assert smiles.ndim == 1
    assert len(smiles) == dataset[0].shape[0]
    expected_smiles = np.array([
        rdmolfiles.MolToSmiles(test_mols[mol_idx]) for mol_idx in selected
    ])
    np.testing.assert_array_equal(smiles, expected_smiles)
Beispiel #3
0
def _get_smiles(inchi_term):
    '''Get smiles.'''
    try:
        mol = inchi.MolFromInchi(inchi_term, treatWarningAsError=True)
        return rdmolfiles.MolToSmiles(mol)
    except Exception:
        return None
Beispiel #4
0
    def prepare_mol(self, mol: rdchem.Mol) -> Tuple[str, rdchem.Mol]:
        """Standardize a molecule via a round trip through canonical SMILES.

        Call this before `get_input_feats`.

        Params:
        -------
        mol: rdkit.Chem.rdchem.Mol
            Molecule of interest.

        Returns:
        --------
        canonical_smiles: str
            Canonical SMILES generated from the input molecule.

        mol: rdkit.Chem.rdchem.Mol
            Molecule rebuilt from ``canonical_smiles``, with hydrogens added
            when ``self.add_Hs`` is set and kekulized when ``self.kekulize``
            is set.
        """
        canonical_smiles = rdmolfiles.MolToSmiles(mol, canonical=True)
        # Rebuild from the canonical string so the returned molecule matches
        # the returned SMILES.
        standardized = rdmolfiles.MolFromSmiles(canonical_smiles)

        if self.add_Hs:
            standardized = rdmolops.AddHs(standardized)
        if self.kekulize:
            # The return value is ignored, so this relies on Kekulize
            # modifying the molecule it is given.
            rdmolops.Kekulize(standardized)

        return canonical_smiles, standardized
    def smiles_from_seq(self, seq):
        """Calculates the SMILES of a given peptide dendrimer sequence

        Arguments:
            seq {string} -- peptide dendrimer sequence
        Returns:
            string -- molecule_smile - SMILES of the peptide, or '' when the
                sequence does not yield any building block
        """

        gs, bs, terminal, capping = self.split_seq_components(seq)

        # Start from the C-terminal modification if there is one; otherwise
        # '' acts as the "nothing built yet" sentinel.
        if terminal:
            molecule = rdmolfiles.MolFromSmiles(self.T_SMILES[terminal[0]])
        else:
            molecule = ''

        # Build the dendrimer: the residues of one generation, followed by
        # the next branching unit.
        for gen in gs:
            for aa in gen:
                if aa == '-':
                    # '-' is a marker, not a residue: record it on the
                    # instance and move on.
                    self.metbond = True
                    continue
                if molecule == '':
                    molecule = rdmolfiles.MolFromSmiles(self.AA_SMILES[aa])
                else:
                    molecule = self.connect_mol(
                        molecule, rdmolfiles.MolFromSmiles(self.AA_SMILES[aa]))

            if bs:
                if bs[0] == '-':
                    self.metbond = True
                    bs.pop(0)
                if molecule == '':
                    molecule = rdmolfiles.MolFromSmiles(self.B_SMILES[bs[0]])
                else:
                    molecule = self.connect_mol(
                        molecule,
                        rdmolfiles.MolFromSmiles(self.B_SMILES[bs[0]]))
                # Each branching unit is consumed once it has been attached.
                bs.pop(0)

        # Add the capping to the N-terminal. A different attach function is
        # used because the capping SMILES are already listed without OH, so
        # no atom has to be removed after forming the new bond.
        if capping:
            molecule = attach_capping(
                molecule, rdmolfiles.MolFromSmiles(self.C_SMILES[capping[0]]))

        # Nothing was assembled: return an empty SMILES instead of failing
        # on ''.GetAtoms() below.
        if molecule == '':
            return ''

        # Clean the SMILES of all atom-map tags.
        for atom in molecule.GetAtoms():
            atom.SetAtomMapNum(0)

        molecule_smile = rdmolfiles.MolToSmiles(molecule,
                                                isomericSmiles=True).replace(
                                                    '[N]',
                                                    'N').replace('[C]', 'C')
        return molecule_smile
Beispiel #6
0
 def protonated_filename(self, val):
     """Set or clear the protonated structure file.

     With ``val`` set to ``None`` the stored filename is cleared and the
     object is flagged as not protonated. Otherwise ``val`` is passed
     through ``_fileio.checkFileExists``, the molecule is loaded from it
     with ``removeHs=False``, the stored SMILES is refreshed and the object
     is flagged as protonated. Everything runs inside ``self.workdir``.
     """
     with self.workdir:
         if val is not None:
             checked = _fileio.checkFileExists(val)
             self._protonated_filename = checked
             self._molecule = _rdkit.openAsRdkit(checked,
                                                 removeHs=False,
                                                 minimise=self.minimise)
             self._string = _rdmolfiles.MolToSmiles(self.molecule)
             self._protonated = True
         else:
             self._protonated_filename = None
             self._protonated = False
def test_sdf_file_parser_return_smiles(sdf_file, test_mols):
    """Parsing with ``return_smiles=True`` yields features and SMILES.

    Checks every feature array and the returned SMILES array against values
    computed directly from ``test_mols``.
    """
    preprocessor = EGCNPreprocessor(max_atoms=49, out_size=49)
    parser = SDFFileParser(preprocessor)
    parsed = parser.parse(sdf_file, return_smiles=True)
    dataset, smiles = parsed['dataset'], parsed['smiles']
    assert len(dataset) == 3

    # Compare each stored feature of each example with a fresh reference.
    for feat_idx in range(len(dataset)):
        for mol_idx, mol in enumerate(test_mols):
            reference = preprocessor.get_input_feats(mol)
            np.testing.assert_array_almost_equal(
                dataset[feat_idx][mol_idx], reference[feat_idx], decimal=3)

    # The SMILES come back as a 1-D array aligned with the dataset rows.
    assert type(smiles) is np.ndarray
    assert smiles.ndim == 1
    assert len(smiles) == dataset[0].shape[0]
    expected_smiles = np.array(
        [rdmolfiles.MolToSmiles(mol) for mol in test_mols])
    np.testing.assert_array_equal(smiles, expected_smiles)
Beispiel #8
0
 def molecule(self, val):
     """Store an RDKit molecule and refresh the cached SMILES string.

     Raises:
         TypeError: if ``val`` is not an ``_rdchem.Mol`` instance.
     """
     if not isinstance(val, _rdchem.Mol):
         raise TypeError("Need an object of type RDKit Mol")

     self._molecule = val
     self._string = _rdmolfiles.MolToSmiles(self._molecule)
Beispiel #9
0
def smiles_from_seq_cyclic(seq):
    """Build the SMILES of the given peptide sequence and cyclize it

        Arguments:
            seq {string} -- peptide dendrimer sequence
        Returns:
            string -- SMILES of the peptide, or '' if no building block
                was assembled
    """
    # An 'X' in the sequence selects cyclization mode 1; in that case all
    # NT and CT tokens are stripped from the sequence first.
    cy = 1 if 'X' in seq else 0
    if cy:
        for token in NT:
            seq = seq.replace(token, '')
        for token in CT:
            seq = seq.replace(token, '')

    gs, bs, terminal, capping = split_seq_components(seq)

    # Start from the C-terminal modification if present; '' means
    # "nothing built yet".
    molecule = (rdmolfiles.MolFromSmiles(T_SMILES[terminal[0]])
                if terminal else '')

    if bs and verbose:
        print(
            'dendrimer, cyclization not possible, branching unit will not be considered'
        )

    # Build the linear peptide. 'X' and '-' are markers rather than
    # residues and are skipped; branching units are ignored entirely.
    for gen in gs:
        for aa in gen:
            if aa == 'X' or aa == '-':
                continue
            residue = rdmolfiles.MolFromSmiles(AA_SMILES[aa])
            if molecule == '':
                molecule = residue
            else:
                molecule = connect_mol(molecule, residue)

    # Add the capping to the N-terminal. A different attach function is
    # used because the capping SMILES are already listed without OH, so no
    # atom has to be removed after forming the new bond.
    if capping:
        molecule = attach_capping(
            molecule, rdmolfiles.MolFromSmiles(C_SMILES[capping[0]]))

    if molecule == '':
        return ''

    molecule = cyclize(molecule, cy)

    # Clean the SMILES of all atom-map tags.
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)

    raw_smiles = rdmolfiles.MolToSmiles(molecule, isomericSmiles=True)
    return raw_smiles.replace('[N]', 'N').replace('[C]', 'C')
Beispiel #10
0
def smiles_from_seq(seq, cyclize):
    """Calculates the smiles of a given peptide dendrimer sequence

    Arguments:
        seq {string} -- peptide dendrimer sequence
        cyclize {bool} -- build a linear peptide and cyclize it; 'X'
            markers are skipped and branching units are ignored
    Returns:
        tuple -- (molecule_smile, seq): SMILES of the peptide ('' if no
            building block was assembled) and the input sequence
    """
    gs, bs, terminal, capping = split_seq_components(seq)

    # Start from the C-terminal modification if present; '' means
    # "nothing built yet".
    if terminal:
        molecule = rdmolfiles.MolFromSmiles(T_SMILES[terminal[0]])
    else:
        molecule = ''

    if cyclize and bs:
        print(
            'dendrimer, cyclization not possible, branching unit will not be considered'
        )

    for gen in gs:
        # A '-' marker sets this flag for the next connection made; it is
        # reset at the start of every generation.
        metbond = False
        for aa in gen:
            if cyclize and aa == 'X':
                continue
            if aa == '-':
                metbond = True
                continue
            residue = rdmolfiles.MolFromSmiles(AA_SMILES[aa])
            if molecule == '':
                # The first building block has nothing to connect to, so
                # the flag is left as it is.
                molecule = residue
                continue
            molecule = utils.connect_mol(molecule, residue, metbond)
            metbond = False

        # Branching units only take part in the dendrimer (non-cyclic)
        # build.
        if cyclize or not bs:
            continue
        if bs[0] == '-':
            metbond = True
            bs.pop(0)
        branch = rdmolfiles.MolFromSmiles(B_SMILES[bs[0]])
        if molecule == '':
            molecule = branch
        else:
            molecule = utils.connect_mol(molecule, branch, metbond)
            metbond = False
        # Each branching unit is consumed once it has been attached.
        bs.pop(0)

    if molecule == '':
        return '', seq

    # Add the capping to the N-terminal. A different attach function is
    # used because the capping SMILES are already listed without OH, so no
    # atom has to be removed after forming the new bond.
    if capping:
        molecule = utils.attach_capping(
            molecule, rdmolfiles.MolFromSmiles(C_SMILES[capping[0]]))

    if cyclize:
        cy = 1 if is_cyclic(seq) else 0
        molecule = utils.cyclize(molecule, cy)

    # Clean the SMILES of all atom-map tags.
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)

    raw_smiles = rdmolfiles.MolToSmiles(molecule, isomericSmiles=True)
    molecule_smile = raw_smiles.replace('[N]', 'N').replace('[C]', 'C')
    return molecule_smile, seq