def test_comp_descriptor_1():
    """Exercise ``Compositions`` with Series and list inputs and check output shapes."""

    def sample():
        # Build a fresh composition dict for every call so no input is shared.
        return {'H': 2}

    calculator = Compositions(n_jobs=1)

    # Series inputs under two different names, without an explicit column mapping.
    calculator.fit_transform(pd.Series([sample()], name='composition'))
    calculator.fit_transform(pd.Series([sample()], name='other'))

    # Same Series name, this time with the ``composition`` keyword given.
    other_series = pd.Series([sample()], name='other')
    from_series = calculator.fit_transform(other_series, composition='other')

    # Plain list input.
    from_list = calculator.fit_transform([sample()])

    assert from_series.shape == (1, 290)
    assert isinstance(from_series, pd.DataFrame)
    assert isinstance(from_list, pd.DataFrame)
    assert np.all(from_series.values == from_list.values)

    def transformed_shape(selection):
        # Shape of the transform result for the given ``featurizers`` selection.
        return calculator.transform([sample()], featurizers=selection).shape

    assert transformed_shape(['WeightedAverage']) == (1, 58)
    assert transformed_shape('all') == (1, 500)
    assert transformed_shape(Compositions.classic) == (1, 290)
class OrganicCompDescriptor(BaseFeaturizer):
    """Compositional descriptors for molecules given as SMILES or RDKit Mol.

    Each molecule is turned into an element-count dictionary (explicit
    hydrogens included) and handed to a wrapped ``Compositions`` calculator.
    """

    def __init__(self, n_jobs=-1, *, featurizers='all', on_errors='raise', return_type='any'):
        """
        A featurizer for extracting XenonPy compositional descriptors from SMILES or MOL

        Parameters
        ----------
        n_jobs
            Forwarded to the wrapped ``Compositions`` calculator.
        featurizers
            Forwarded to the wrapped ``Compositions`` calculator.
        on_errors
            Forwarded to both the base class and ``Compositions``.
        return_type
            Forwarded to the base class.
        """
        # fix n_jobs to be 0 to skip automatic wrapper in XenonPy BaseFeaturizer class
        super().__init__(n_jobs=0, on_errors=on_errors, return_type=return_type)
        self._cal = Compositions(n_jobs=n_jobs, featurizers=featurizers, on_errors=on_errors)

    @staticmethod
    def _to_mol(entry):
        """Return ``entry`` as an RDKit Mol, parsing it as SMILES when it is not one already."""
        if isinstance(entry, Chem.rdchem.Mol):
            return entry
        parsed = Chem.MolFromSmiles(entry)
        if parsed is None:
            raise ValueError('can not convert Mol from SMILES %s' % entry)
        return parsed

    def featurize(self, x):
        """Compute compositional descriptors for one or more molecules.

        Parameters
        ----------
        x
            A single entry, a list, or a ``pandas.Series`` of entries. Every
            entry is either an RDKit Mol or is treated as a SMILES string.

        Returns
        -------
        The result of ``Compositions.transform`` on the element-count
        dictionaries; it is also stored on ``self.output``.

        Raises
        ------
        ValueError
            If ``Chem.MolFromSmiles`` returns ``None`` for an entry.
        """
        # normalise the input to a plain list
        if isinstance(x, pd.Series):
            x = x.tolist()
        if not isinstance(x, list):
            x = [x]

        # Decide Mol vs. SMILES per entry rather than from x[0] only, so that
        # mixed lists work and an empty list no longer fails on indexing.
        # NOTE(review): an empty list is forwarded to the wrapped calculator
        # unchanged - confirm it handles empty input as desired.
        x_mol = [self._to_mol(z) for z in x]

        # convert to counting dictionary (hydrogens made explicit first)
        mol = [Chem.AddHs(z) for z in x_mol]
        d_list = [
            dict(Counter(atom.GetSymbol() for atom in z.GetAtoms()))
            for z in mol
        ]

        self.output = self._cal.transform(d_list)
        return self.output

    @property
    def feature_labels(self):
        """Columns of the most recent ``featurize`` result.

        ``self.output`` is only assigned by ``featurize``, so that method must
        have been called first.
        """
        return self.output.columns