def similarity(_parents, target):
    """Score how similar each parent structure is to ``target``.

    All structures are featurized with a fixed set of structure and
    composition featurizers, the features are transformed with the pickled
    scaler, and the distance between every parent and the target is mapped
    onto a grid of 101 evenly spaced values between 0 and 1 by locating the
    closest entry in the pickled ``quantiles``.

    Args:
        _parents: Collection of structures to compare against ``target``.
        target: The structure every parent is compared with.

    Returns:
        numpy.ndarray with one entry per parent: the grid value selected for
        that parent, or ``-1`` for parents whose feature row contained a
        null value.
    """
    featurizer = MultipleFeaturizer([
        SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
        StructuralHeterogeneity(),
        ChemicalOrdering(),
        MaximumPackingEfficiency(),
        SiteStatsFingerprint.from_preset(
            "LocalPropertyDifference_ward-prb-2017"),
        StructureComposition(Stoichiometry()),
        StructureComposition(ElementProperty.from_preset("magpie")),
        StructureComposition(ValenceOrbital(props=["frac"])),
        StructureComposition(IonProperty(fast=True)),
    ])

    # HACK celery doesn't work with multiprocessing (used by matminer)
    try:
        from celery import current_task
        if current_task:
            featurizer.set_n_jobs(1)
    except ImportError:
        # celery is optional; without it there is nothing to work around.
        pass

    x_target = pd.DataFrame.from_records(
        [featurizer.featurize(target)],
        columns=featurizer.feature_labels(),
    )
    x_parent = pd.DataFrame.from_records(
        featurizer.featurize_many(_parents, ignore_errors=True, pbar=False),
        columns=featurizer.feature_labels(),
    )

    # Remember which parents have a null feature before the nulls are
    # replaced by a sentinel; those rows are reported as -1 at the end.
    nulls = x_parent[x_parent.isnull().any(axis=1)].index.values
    x_parent.fillna(100000, inplace=True)

    # Put the columns of both frames in the same (sorted) order.
    x_target = x_target.reindex(sorted(x_target.columns), axis=1)
    x_parent = x_parent.reindex(sorted(x_parent.columns), axis=1)

    # NOTE(review): pickle.load executes arbitrary code from the file, so
    # these files must only ever come from a trusted location.
    with open(os.path.join(settings.rxn_files, "scaler2.pickle"), "rb") as f:
        scaler = pickle.load(f)
    with open(os.path.join(settings.rxn_files, "quantiles.pickle"), "rb") as f:
        quantiles = pickle.load(f)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the same
    # rows in the same order (parents first, target last).
    X = scaler.transform(pd.concat([x_parent, x_target]))

    # One distance computation for all parents instead of one call per row.
    # The guard keeps an empty parent list returning an empty result rather
    # than handing a zero-row array to pairwise_distances.
    parent_rows, target_row = X[:-1], X[-1:]
    if len(parent_rows):
        distances = pairwise_distances(parent_rows, target_row)[:, 0]
    else:
        distances = []

    grid = np.linspace(0, 1, 101)
    _res = np.array(
        [grid[np.abs(quantiles - d).argmin()] for d in distances]
    )
    _res[nulls] = -1
    return _res
def test_multifeatures(self):
    """Check MultipleFeaturizer with two multi-argument featurizers.

    The second featurizer is defined locally and wraps its result in an
    extra list, so the combined featurizer has to cope with nested and flat
    outputs at the same time.
    """
    # Make a test dataset with two input variables
    data = self.make_test_data()
    data['x2'] = [4, 5, 6]

    # Create a second featurizer
    class MultiArgs2(SingleFeaturizerMultiArgs):
        def featurize(self, *x):
            # Making a 2D array to test whether MultipleFeaturizer
            # can handle featurizers that have both 1D vectors with
            # singleton dimensions (e.g., shape==(4,1)) and those
            # without (e.g., shape==(4,))
            return [super(MultiArgs2, self).featurize(*x)]

        def feature_labels(self):
            return ['y2']

    multiargs2 = MultiArgs2()

    # Create featurizer
    multi_f = MultipleFeaturizer([self.multiargs, multiargs2])
    multi_f.set_n_jobs(1)

    # Test featurize with multiple arguments
    features = multi_f.featurize(0, 2)
    self.assertArrayAlmostEqual([2, 2], features)

    # Test dataframe
    data = multi_f.featurize_dataframe(data, ['x', 'x2'])
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(['y', 'y2'], multi_f.feature_labels())
    self.assertArrayAlmostEqual([[5, 5], [7, 7], [9, 9]],
                                data[['y', 'y2']])
def test_multifeatures_multiargs(self):
    """Check MultipleFeaturizer with multi-argument featurizers.

    The same assertions are run once with ``iterate_over_entries=True`` and
    once with ``iterate_over_entries=False``.
    """
    second_featurizer = MultiArgs2()
    expected_rows = [[5, 5], [7, 7], [9, 9]]

    # test iterating over both entries and featurizers
    for iter_entries in (True, False):
        # Fresh two-column test dataset for every pass
        frame = self.make_test_data()
        frame['x2'] = [4, 5, 6]

        combined = MultipleFeaturizer(
            [self.multiargs, second_featurizer],
            iterate_over_entries=iter_entries,
        )

        # Featurizing a single entry with multiple arguments
        self.assertArrayAlmostEqual([2, 2], combined.featurize(0, 2))

        # Featurizing a whole dataframe
        frame = combined.featurize_dataframe(frame, ['x', 'x2'])
        self.assertEqual(['y', 'y2'], combined.feature_labels())
        self.assertArrayAlmostEqual(expected_rows, frame[['y', 'y2']])

        # Featurizing with multiindex column labels
        frame = combined.featurize_dataframe(frame, ['x', 'x2'],
                                             multiindex=True)
        for column in (("MultiArgs2", "y2"),
                       ("SingleFeaturizerMultiArgs", "y")):
            self.assertIn(column, frame.columns)
        self.assertArrayAlmostEqual(
            expected_rows,
            frame[[("SingleFeaturizerMultiArgs", "y"), ("MultiArgs2", "y2")]],
        )
def get_structure_properties(structure: Structure, mode: str = 'all') -> dict:
    """Compute a dictionary of featurizer values for ``structure``.

    Args:
        structure: The structure to featurize.
        mode: ``'all'`` selects the full featurizer set; any other value
            selects the reduced set that does not need a Voronoi
            tessellation.

    Returns:
        dict mapping every feature label to its value, plus a ``'volume'``
        entry holding ``structure.volume``.
    """
    if mode != 'all':
        # Calculate only those which do not need a Voronoi tessellation
        calculators = [
            DensityFeatures(),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset('magpie')),
            StructureComposition(ValenceOrbital(props=['frac'])),
        ]
    else:
        calculators = [
            SiteStatsFingerprint.from_preset(
                'CoordinationNumber_ward-prb-2017'),
            StructuralHeterogeneity(),
            ChemicalOrdering(),
            DensityFeatures(),
            MaximumPackingEfficiency(),
            SiteStatsFingerprint.from_preset(
                'LocalPropertyDifference_ward-prb-2017'),
            StructureComposition(Stoichiometry()),
            StructureComposition(ElementProperty.from_preset('magpie')),
            StructureComposition(ValenceOrbital(props=['frac'])),
        ]

    featurizer = MultipleFeaturizer(calculators)
    values = featurizer.featurize(structure)
    labels = featurizer.feature_labels()

    properties = {label: value for label, value in zip(labels, values)}
    properties['volume'] = structure.volume
    return properties
def test_multifeatures(self):
    """Check MultipleFeaturizer with two multi-argument featurizers."""
    # Make a test dataset with two input variables
    data = self.make_test_data()
    data['x2'] = [4, 5, 6]

    multiargs2 = MultiArgs2()

    # Create featurizer
    multi_f = MultipleFeaturizer([self.multiargs, multiargs2])

    # Test featurize with multiple arguments
    features = multi_f.featurize(0, 2)
    self.assertArrayAlmostEqual([2, 2], features)

    # Test dataframe
    data = multi_f.featurize_dataframe(data, ['x', 'x2'])
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(['y', 'y2'], multi_f.feature_labels())
    self.assertArrayAlmostEqual([[5, 5], [7, 7], [9, 9]],
                                data[['y', 'y2']])
def test_multitype_multifeat(self):
    """Test Multifeaturizer when a featurizer returns a non-numeric type"""
    # Combined featurizer, restricted to a single job
    combined = MultipleFeaturizer([SingleFeaturizer(), MultiTypeFeaturizer()])
    combined.set_n_jobs(1)

    # Build the test data and append the feature columns to it
    frame = combined.featurize_dataframe(self.make_test_data(), 'x')

    # The generated columns must carry the expected dtypes
    feature_columns = combined.feature_labels()
    observed_dtypes = frame[feature_columns].dtypes.astype(str).tolist()
    self.assertArrayEqual(['int64', 'object', 'int64'], observed_dtypes)
    self.assertArrayAlmostEqual(frame['y'], [2, 3, 4])
def test_multitype_multifeat(self):
    """Test Multifeaturizer when a featurizer returns a non-numeric type"""
    expected_dtypes = ['int64', 'object', 'int64']

    # test both iteration over entries and featurizers
    for iter_entries in (True, False):
        # Combined featurizer, restricted to a single job
        combined = MultipleFeaturizer(
            [SingleFeaturizer(), MultiTypeFeaturizer()],
            iterate_over_entries=iter_entries,
        )
        combined.set_n_jobs(1)

        # Build the test data and append the feature columns to it
        frame = combined.featurize_dataframe(self.make_test_data(), 'x')

        # The generated columns must carry the expected dtypes
        feature_columns = combined.feature_labels()
        observed_dtypes = frame[feature_columns].dtypes.astype(str).tolist()
        self.assertArrayEqual(expected_dtypes, observed_dtypes)
        self.assertArrayAlmostEqual(frame['y'], [2, 3, 4])
(original_count - len(data), original_count)) # 用逻辑和的方式筛选[-20,5]范围内的delta_e original_count = len(data) data = data[np.logical_and(data['delta_e'] >= -20, data['delta_e'] <= 5)] print('Removed %d/%d entries' % (original_count - len(data), original_count)) print(data.head(3)) # 设定化学计算规范:使用MagpieData数据源初始化元素属性,返回各层轨道电子数量信息,假设元素以单一氧化态存在 feature_calculators = MultipleFeaturizer([ cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"), cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=False) ]) # 获得特征名 feature_labels = feature_calculators.feature_labels() # 计算特征量 data = feature_calculators.featurize_dataframe(data, col_id='composition_obj') print('Generated %d features' % len(feature_labels)) print('Training set size:', 'x'.join([str(x) for x in data[feature_labels].shape])) # 去除空值缺省值 original_count = len(data) data = data[~data[feature_labels].isnull().any(axis=1)] print('Removed %d/%d entries' % (original_count - len(data), original_count)) print(data.head(3)) # 调用随机森林 # “随机森林”算法通过训练许多不同的决策树模型来工作, # 其中每个模型都在数据集的不同子集上进行训练。