Beispiel #1
0
def test_comp_descriptor_1():
    """Check ``Compositions`` fit/transform calls and the width of their output."""
    calculator = Compositions(n_jobs=1)

    # Smoke calls: Series inputs under two different names; results are unused.
    calculator.fit_transform(pd.Series([{'H': 2}], name='composition'))
    calculator.fit_transform(pd.Series([{'H': 2}], name='other'))

    # Same entry supplied as a named Series (with the ``composition`` keyword)
    # and as a plain list.
    other_series = pd.Series([{'H': 2}], name='other')
    from_series = calculator.fit_transform(other_series, composition='other')
    from_list = calculator.fit_transform([{'H': 2}])

    assert from_series.shape == (1, 290)
    assert isinstance(from_series, pd.DataFrame)
    assert isinstance(from_list, pd.DataFrame)

    # Both input forms must produce element-wise identical values.
    assert np.all(from_series.values == from_list.values)

    # Expected number of columns for each ``featurizers`` selection.
    width_by_selection = (
        (['WeightedAverage'], 58),
        ('all', 500),
        (Compositions.classic, 290),
    )
    for selection, width in width_by_selection:
        frame = calculator.transform([{'H': 2}], featurizers=selection)
        assert frame.shape == (1, width)
class OrganicCompDescriptor(BaseFeaturizer):
    """Featurizer producing XenonPy compositional descriptors for molecules.

    Every input molecule (SMILES string or RDKit ``Mol``) gets explicit
    hydrogens added, is reduced to an ``{element symbol: atom count}``
    dictionary, and the list of dictionaries is passed to a wrapped
    ``Compositions`` calculator.
    """

    def __init__(self,
                 n_jobs=-1,
                 *,
                 featurizers='all',
                 on_errors='raise',
                 return_type='any'):
        """
        A featurizer for extracting XenonPy compositional descriptors from SMILES or MOL

        Parameters
        ----------
        n_jobs
            Forwarded to the wrapped ``Compositions`` calculator.
        featurizers
            Forwarded to the wrapped ``Compositions`` calculator.
        on_errors
            Forwarded to both ``BaseFeaturizer`` and ``Compositions``.
        return_type
            Forwarded to ``BaseFeaturizer``.
        """

        # fix n_jobs to be 0 to skip automatic wrapper in XenonPy BaseFeaturizer class
        super().__init__(n_jobs=0,
                         on_errors=on_errors,
                         return_type=return_type)
        self._cal = Compositions(n_jobs=n_jobs,
                                 featurizers=featurizers,
                                 on_errors=on_errors)

    @staticmethod
    def _to_mol(entry):
        """Return ``entry`` as an RDKit ``Mol``, parsing it as SMILES if it is not one.

        Raises
        ------
        ValueError
            If RDKit returns ``None`` for the SMILES string.
        """
        if isinstance(entry, Chem.rdchem.Mol):
            return entry
        mol = Chem.MolFromSmiles(entry)
        if mol is None:
            raise ValueError('can not convert Mol from SMILES %s' % entry)
        return mol

    def featurize(self, x):
        """Compute compositional descriptors for one or more molecules.

        Parameters
        ----------
        x
            A single SMILES string / RDKit ``Mol``, or a ``list``, ``tuple``
            or ``pandas.Series`` of them.  SMILES strings and ``Mol`` objects
            may be mixed: each entry is checked individually.

        Returns
        -------
        The result of ``Compositions.transform`` on the element-count
        dictionaries (``feature_labels`` reads its ``columns``, so presumably
        a ``pandas.DataFrame`` -- confirm against XenonPy).  The result is
        also stored on ``self.output``.

        Raises
        ------
        ValueError
            If an entry cannot be parsed as SMILES.
        """
        # Normalise the input to a plain list of entries.
        if isinstance(x, pd.Series):
            x = x.tolist()
        elif isinstance(x, tuple):
            x = list(x)
        elif not isinstance(x, list):
            x = [x]

        # Decide SMILES-vs-Mol per entry rather than from x[0] only, so mixed
        # lists work and an empty list no longer fails on the index lookup.
        # NOTE(review): an empty list is now forwarded to the calculator as
        # [] -- confirm Compositions.transform accepts that.
        mols = [Chem.AddHs(self._to_mol(entry)) for entry in x]

        # convert to counting dictionary (hydrogens included via AddHs)
        d_list = [
            dict(Counter(atom.GetSymbol() for atom in mol.GetAtoms()))
            for mol in mols
        ]

        self.output = self._cal.transform(d_list)

        return self.output

    @property
    def feature_labels(self):
        """Column labels of the most recent ``featurize`` result.

        Raises
        ------
        AttributeError
            If ``featurize`` has not been called yet.
        """
        try:
            output = self.output
        except AttributeError:
            # Same exception type as before, but say why the attribute is missing.
            raise AttributeError(
                'feature_labels is only available after featurize() '
                'has been called') from None
        return output.columns