Example #1
0
def generate(fake_df, ignore_errors=False):
    """Featurize ``fake_df`` and return the resulting DataFrame.

    ``fake_df`` is wrapped in a one-element NumPy array and loaded into a
    DataFrame whose only column is named ``'full_formula'``.  That column is
    run through ``StrToComposition``, rows containing NaN are dropped, and
    the module-level ``feature_calculators`` featurizer is applied to the
    ``'composition'`` column.  Finally an ``"NComp"`` column holding
    ``len()`` of each ``'composition'`` entry is added.

    Args:
        fake_df: The value to featurize.  Despite the name it is wrapped in
            a list before becoming a DataFrame, so it is presumably a single
            formula string -- TODO confirm against callers.
        ignore_errors: Forwarded unchanged to both ``featurize_dataframe``
            calls.

    Returns:
        The featurized DataFrame, including the added ``"NComp"`` column.
    """
    formulas = pd.DataFrame(np.array([fake_df]), columns=['full_formula'])

    # Formula text -> composition objects; discard any row left with NaN.
    composed = StrToComposition().featurize_dataframe(
        formulas, "full_formula", ignore_errors=ignore_errors
    ).dropna()

    featurized = feature_calculators.featurize_dataframe(
        composed, col_id='composition', ignore_errors=ignore_errors
    )
    featurized["NComp"] = featurized["composition"].apply(len)
    return featurized
# --- Composition-derived features -------------------------------------------
from matminer.featurizers.composition import ElementProperty, OxidationStates
from matminer.featurizers.conversions import CompositionToOxidComposition

# Element-property features from the "magpie" preset, computed from the
# "composition" column of df.
ep_feat = ElementProperty.from_preset("magpie")
df = ep_feat.featurize_dataframe(df, "composition")

# Convert "composition" to its oxidation-state form, then featurize the
# "composition_oxid" column (presumably the column the conversion step adds
# -- confirm against the matminer version in use).
df = CompositionToOxidComposition().featurize_dataframe(df, col_id="composition")
os_feat = OxidationStates()
df = os_feat.featurize_dataframe(df, col_id="composition_oxid")
# --- Element-property statistics backed by PymatgenData ---------------------
# NOTE(review): PymatgenData is not imported in the visible code; presumably
# it comes from matminer.utils.data -- confirm it is imported elsewhere.
dataset = PymatgenData()
descriptors = [
    'row',
    'group',
    'atomic_mass',
    'atomic_radius',
    'boiling_point',
    'melting_point',
    'X',
]
stats = ["mean", "std_dev"]
ep = ElementProperty(data_source=dataset, features=descriptors, stats=stats)

# Featurize the "composition" column, then discard every row that still
# contains a NaN value.
df = ep.featurize_dataframe(df, col_id="composition").dropna()

# --- Target vector and feature matrix ---------------------------------------
# Target is the 'Tensile Strength, Yield' column; an earlier variant of this
# script used df['elasticity.K_VRH'] instead.
y = df['Tensile Strength, Yield'].values

# Columns removed from the feature matrix.  "elastic_anisotropy" was
# commented out of this list in the original and is therefore kept in X.
# NOTE(review): 'Elongation at Break ' carries a trailing space and
# 'Tensile Strength,Ultimate' has no space after the comma -- verify both
# match the real column names, since drop() raises KeyError on a miss.
excluded = [
    "elasticity.G_VRH",
    "elasticity.K_VRH",
    "pretty_formula",
    'volume',
    'nsites',
    'spacegroup.symbol',
    'e_above_hull',
    'Tensile Strength, Yield',
    'Elongation at Break ',
    'Tensile Strength,Ultimate',
    "poisson_ratio",
    "composition",
    "composition_oxid",
]
X = df.drop(columns=excluded)
print(f"There are {X.shape[1]} possible descriptors:\n\n{X.columns.values}")
# --- Linear regression baseline ---------------------------------------------
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X, y)

# Fit statistics, evaluated on the same data the model was trained on.
print("training R2 = {!s}".format(round(lr.score(X, y), 3)))
print(
    "training RMSE = {:.3f}".format(
        np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X)))
    )
)