def generate(fake_df, ignore_errors=False):
    """Featurize a single input value and return the resulting table.

    The input is wrapped in a one-element array and turned into a one-row
    DataFrame whose only column is ``full_formula``. That column is run
    through ``StrToComposition``, rows containing NaN are dropped, and the
    module-level ``feature_calculators`` featurizer is applied to the
    ``composition`` column. Finally an ``NComp`` column holding
    ``len(composition)`` is appended.

    Args:
        fake_df: The value placed in the ``full_formula`` cell. Despite the
            name, this is presumably a single formula string rather than a
            DataFrame -- TODO confirm against callers.
        ignore_errors: Forwarded unchanged to both ``featurize_dataframe``
            calls.

    Returns:
        The featurized DataFrame, including the added ``NComp`` column.
    """
    formula_frame = pd.DataFrame(np.array([fake_df]), columns=['full_formula'])

    # StrToComposition is expected to add the "composition" column that the
    # following steps read.
    to_composition = StrToComposition()
    with_composition = to_composition.featurize_dataframe(
        formula_frame, "full_formula", ignore_errors=ignore_errors
    ).dropna()

    featurized = feature_calculators.featurize_dataframe(
        with_composition, col_id='composition', ignore_errors=ignore_errors
    )

    # len() of each composition object; presumably the number of distinct
    # elements in it -- verify against the composition type in use.
    featurized["NComp"] = featurized["composition"].apply(len)
    return featurized
# Featurize the compositions in `df` (defined earlier in the file), then fit
# a linear regression to the yield tensile strength and print training
# statistics.
import numpy as np
from matminer.featurizers.composition import ElementProperty, OxidationStates
from matminer.featurizers.conversions import CompositionToOxidComposition
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Element-property features from the "magpie" preset, computed from the
# "composition" column.
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id="composition")

# Oxidation-state features. The conversion step is expected to add the
# "composition_oxid" column that OxidationStates reads.
df = CompositionToOxidComposition().featurize_dataframe(df, "composition")
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, "composition_oxid")

# A second element-property pass using PymatgenData as the data source, with
# an explicit list of properties and statistics.
dataset = PymatgenData()
descriptors = [
    'row',
    'group',
    'atomic_mass',
    'atomic_radius',
    'boiling_point',
    'melting_point',
    'X',
]
stats = ["mean", "std_dev"]
ep = ElementProperty(data_source=dataset, features=descriptors, stats=stats)
df = ep.featurize_dataframe(df, "composition")

# Remove rows containing NaN values.
df = df.dropna()

# Regression target (an earlier variant used df['elasticity.K_VRH']).
y = df['Tensile Strength, Yield'].values

# Columns removed from the feature matrix; every label must exist in `df`,
# otherwise drop() raises. ("elastic_anisotropy" is not excluded.)
excluded = [
    "elasticity.G_VRH",
    "elasticity.K_VRH",
    "pretty_formula",
    'volume',
    'nsites',
    'spacegroup.symbol',
    'e_above_hull',
    'Tensile Strength, Yield',
    'Elongation at Break ',
    'Tensile Strength,Ultimate',
    "poisson_ratio",
    "composition",
    "composition_oxid",
]
X = df.drop(columns=excluded)

descriptor_count = X.shape[1]
print("There are {} possible descriptors:\n\n{}".format(descriptor_count, X.columns.values))

lr = LinearRegression()
lr.fit(X, y)

# Fit statistics, evaluated on the same data the model was trained on.
train_r2 = round(lr.score(X, y), 3)
print('training R2 = ' + str(train_r2))

train_rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X)))
print('training RMSE = %.3f' % train_rmse)