def __init__(self, exclude=None):
    """Initialise the three groups of composition featurizers.

    Args:
        exclude: Passed through unchanged to the parent initialiser.
    """
    super(CompositionFeaturizers, self).__init__(exclude=exclude)

    # Group stored as ``_fast_featurizers``.
    fast = [cf.AtomicOrbitals()]
    fast.extend(
        cf.ElementProperty.from_preset(preset)
        for preset in ("matminer", "magpie")
    )
    fast.extend([
        cf.ElementFraction(),
        cf.Stoichiometry(),
        cf.TMetalFraction(),
        cf.BandCenter(),
        cf.ValenceOrbital(),
    ])
    self._fast_featurizers = fast

    # Group stored as ``_slow_featurizers``.
    slow = [
        cf.Miedema(),
        cf.AtomicPackingEfficiency(),  # noted as slower than the rest
        cf.CohesiveEnergy(),  # noted as requiring an mpid to be present
    ]
    self._slow_featurizers = slow

    # Group stored as ``_need_oxi_featurizers``.
    need_oxi = [
        cf.CationProperty.from_preset(preset_name='deml'),
        cf.OxidationStates.from_preset(preset_name='deml'),
        cf.ElectronAffinity(),
        cf.ElectronegativityDiff(),
        cf.YangSolidSolution(),
        cf.IonProperty(),
    ]
    self._need_oxi_featurizers = need_oxi
def express(self):
    """Return ``self._get_featurizers`` applied to a fixed six-featurizer list.

    The list holds, in order: Magpie-preset element properties, deml-preset
    oxidation states, electron affinity, ion property, Yang solid solution
    and Miedema featurizers.
    """
    return self._get_featurizers([
        cf.ElementProperty.from_preset("magpie"),
        cf.OxidationStates.from_preset(preset_name='deml'),
        cf.ElectronAffinity(),
        cf.IonProperty(),
        cf.YangSolidSolution(),
        cf.Miedema(),
    ])
def __init__(self):
    """Create the composition featurizer bundle and the string converter.

    Sets ``feature_calculators`` to a ``MultipleFeaturizer`` wrapping six
    composition featurizers and ``str2composition`` to a fresh
    ``StrToComposition`` instance.
    """
    calculators = [
        cf.ElementProperty.from_preset(preset_name="magpie"),
        cf.Stoichiometry(),
        cf.ValenceOrbital(props=['frac']),
        cf.IonProperty(fast=True),
        cf.BandCenter(),
        cf.ElementFraction(),
    ]
    self.feature_calculators = MultipleFeaturizer(calculators)
    self.str2composition = StrToComposition()
def all(self):
    """Return ``self._get_featurizers`` applied to the full featurizer list.

    The list is assembled in the same order as it is declared below; the
    last two entries carry the original notes about speed and mpid.
    """
    featurizers = [cf.AtomicOrbitals()]
    # Element-property featurizers, one per preset, in this fixed order.
    featurizers.extend(
        cf.ElementProperty.from_preset(preset)
        for preset in ("matminer", "magpie", "matscholar_el", "deml")
    )
    featurizers.extend([
        cf.ElementFraction(),
        cf.Stoichiometry(),
        cf.TMetalFraction(),
        cf.BandCenter(),
        cf.ValenceOrbital(),
        cf.YangSolidSolution(),
        cf.CationProperty.from_preset(preset_name='deml'),
        cf.OxidationStates.from_preset(preset_name='deml'),
        cf.ElectronAffinity(),
        cf.ElectronegativityDiff(),
        cf.IonProperty(),
        cf.Miedema(),
        cf.AtomicPackingEfficiency(),  # noted as slower than the rest
        cf.CohesiveEnergy(),  # noted as requiring an mpid to be present
    ])
    return self._get_featurizers(featurizers)
data['composition'] = data['structure'].apply(lambda x: x.composition) data['integer_formula'] = data['composition'].apply( lambda x: x.get_integer_formula_and_factor()[0]) data.sort_values('e_above_hull', ascending=True, inplace=True) data.drop_duplicates('integer_formula', keep='first', inplace=True) print('Reduced dataset to {} unique compositions.'.format(len(data))) data.reset_index(inplace=True, drop=True) # Create the featurizer, which will take the composition as input featurizer = MultipleFeaturizer([ cf.Stoichiometry(), cf.ElementProperty.from_preset('magpie'), cf.ValenceOrbital(props=['frac']), cf.IonProperty(fast=True) ]) # Compute the features featurizer.set_n_jobs(1) X = featurizer.featurize_many(data['composition']) # Make the model model = Pipeline([('imputer', Imputer()), ('model', RandomForestRegressor())]) model.fit(X, data['formation_energy_per_atom']) print('Trained a RandomForest model') # Save the model, featurizer, and data using pickle with open('model.pkl', 'wb') as fp: pkl.dump(model, fp) with open('featurizer.pkl', 'wb') as fp:
def load_data_zT():
    """Load the feature and response matrices, using a CSV cache if present.

    The function first tries to read two cached CSV files from the results
    directory. If a file is missing, it queries Citrination for the
    dataset, tabularizes the returned PIFs, featurizes the chemical formulas
    with matminer composition featurizers, keeps only the rows where all
    three responses are present, squares the Seebeck column, and writes both
    matrices back to the cache.

    Relies on module-level names defined outside this function:
    ``setResDir``, ``file_features``, ``file_responses``, ``prefix``,
    ``pifs2df`` and ``get_compostion``.

    Returns:
        tuple: ``(X_all, Y_all, sign, keys_response, prefix)`` where
        ``X_all`` is the feature array, ``Y_all`` the response array with
        columns in ``keys_response`` order, ``sign`` an array of
        ``+1 / -1 / -1`` (one entry per response; presumably the desired
        optimization direction -- confirm with callers), and ``prefix`` the
        module-level value of that name.

    Raises:
        KeyError: If the cache is missing and the ``CITRINATION_API_KEY``
            environment variable is not set.
    """
    results_dir = setResDir()

    ## Metadata
    # Column names as they appear in the downloaded data ...
    keys_original = [
        'Seebeck coefficient',
        'Electrical resistivity',
        'Thermal conductivity'
    ]
    # ... and as they are reported after the Seebeck column is squared.
    keys_response = [
        'Seebeck coefficient; squared',
        'Electrical resistivity',
        'Thermal conductivity'
    ]
    sign = np.array([
        +1,  # Seebeck
        -1,  # Electric resistivity
        -1   # Thermal conductivity
    ])

    ## Load data, if possible
    # --------------------------------------------------
    try:
        # The first CSV column is the index that ``to_csv`` wrote when the
        # cache was created below, so it is dropped on the way back in.
        df_X_all = pd.read_csv(results_dir + file_features)
        X_all = df_X_all.drop(df_X_all.columns[0], axis=1).values

        df_Y_all = pd.read_csv(results_dir + file_responses)
        Y_all = df_Y_all.drop(df_Y_all.columns[0], axis=1).values
        print("Cached data loaded.")

    except FileNotFoundError:
        ## Data Import
        # --------------------------------------------------
        # Initialize client
        print("Accessing data from Citrination...")
        site = 'https://citrination.com'  # Citrination
        client = CitrinationClient(api_key=os.environ['CITRINATION_API_KEY'],
                                   site=site)
        search_client = client.search

        dataset_id = 178480  # ucsb_te_roomtemp_seebeck
        system_query = PifSystemReturningQuery(
            size=1000,
            query=DataQuery(dataset=DatasetQuery(id=Filter(
                equal=str(dataset_id)))))
        query_result = search_client.pif_search(system_query)
        print(" Found {} PIFs in dataset {}.".format(
            query_result.total_num_hits, dataset_id))

        ## Wrangle
        # --------------------------------------------------
        pifs = [x.system for x in query_result.hits]
        # Utility function will tabularize PIFs
        df_response = pifs2df(pifs)
        # Down-select columns to play well with to_numeric
        df_response = df_response[keys_original]
        df_response = df_response.apply(pd.to_numeric)

        # Parse chemical compositions
        formulas = [pif.chemical_formula for pif in pifs]
        df_comp = pd.DataFrame(columns=['chemical_formula'], data=formulas)

        # Join
        df_data = pd.concat([df_comp, df_response], axis=1)
        print(" Accessed data.")

        # Featurize
        print("Featurizing data...")
        df_data['composition'] = df_data['chemical_formula'].apply(
            get_compostion)

        f = MultipleFeaturizer([
            cf.Stoichiometry(),
            cf.ElementProperty.from_preset("magpie"),
            cf.ValenceOrbital(props=['avg']),
            cf.IonProperty(fast=True)
        ])

        X = np.array(f.featurize_many(df_data['composition']))

        # Keep only the rows where every response value is present.
        index_valid_all = df_data[keys_original].dropna().index.values
        X_all = X[index_valid_all, :]
        Y_all = df_data[keys_original].iloc[index_valid_all].values

        # Manipulate columns for proper objective values
        Y_all[:, 0] = Y_all[:, 0]**2  # Squared seebeck
        print(" Data prepared; {0:} valid observations.".format(
            X_all.shape[0]))

        # Cache data
        pd.DataFrame(data=X_all).to_csv(results_dir + file_features)
        pd.DataFrame(data=Y_all,
                     columns=keys_response).to_csv(results_dir +
                                                   file_responses)
        print("Data cached in results directory.")

    return X_all, Y_all, sign, keys_response, prefix
# Sort by delta_e ascending and keep the first (lowest delta_e) row for each
# composition value.
data.sort_values('delta_e', ascending=True, inplace=True)
data.drop_duplicates('composition', keep='first', inplace=True)
print('Removed %d/%d entries' % (original_count - len(data), original_count))

# Keep only rows whose delta_e lies in the closed range [-20, 5]
# (both bounds combined with a logical AND).
original_count = len(data)
data = data[(data['delta_e'] >= -20) & (data['delta_e'] <= 5)]
print('Removed %d/%d entries' % (original_count - len(data), original_count))
print(data.head(3))

# Composition featurizers: stoichiometry, Magpie-preset element properties,
# average valence-orbital attributes, and ion properties.
# NOTE(review): the original comment said elements are assumed to be in a
# single oxidation state, but IonProperty is given fast=False here -- confirm
# the intended setting against the matminer documentation.
feature_calculators = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset("magpie"),
    cf.ValenceOrbital(props=['avg']),
    cf.IonProperty(fast=False),
])

# Names of the generated feature columns.
feature_labels = feature_calculators.feature_labels()

# Compute the features from the 'composition_obj' column.
data = feature_calculators.featurize_dataframe(data,
                                               col_id='composition_obj')
print('Generated %d features' % len(feature_labels))
print('Training set size:', 'x'.join(map(str, data[feature_labels].shape)))

# Drop every row that has a missing value in any feature column.
original_count = len(data)
data = data[data[feature_labels].notnull().all(axis=1)]
print('Removed %d/%d entries' % (original_count - len(data), original_count))
print(data.head(3))