Esempio n. 1
0
    def __init__(self, exclude=None):
        """Initialise the parent class and build the three featurizer groups.

        Args:
            exclude: Passed through unchanged to the parent initializer.
        """
        super(CompositionFeaturizers, self).__init__(exclude=exclude)

        # Group 1: stored as the "fast" featurizers.
        fast = [
            cf.AtomicOrbitals(),
            cf.ElementProperty.from_preset("matminer"),
            cf.ElementProperty.from_preset("magpie"),
            cf.ElementFraction(),
            cf.Stoichiometry(),
            cf.TMetalFraction(),
            cf.BandCenter(),
            cf.ValenceOrbital()
        ]
        self._fast_featurizers = fast

        # Group 2: stored as the "slow" featurizers.
        slow = [
            cf.Miedema(),
            # Noted by the original author as slower than the others.
            cf.AtomicPackingEfficiency(),
            # Noted by the original author as needing an mpid to be present.
            cf.CohesiveEnergy()
        ]
        self._slow_featurizers = slow

        # Group 3: stored as the featurizers that need oxidation states.
        need_oxi = [
            cf.CationProperty.from_preset(preset_name='deml'),
            cf.OxidationStates.from_preset(preset_name='deml'),
            cf.ElectronAffinity(),
            cf.ElectronegativityDiff(),
            cf.YangSolidSolution(),
            cf.IonProperty()
        ]
        self._need_oxi_featurizers = need_oxi
Esempio n. 2
0
    def __init__(self):
        """Create the combined composition featurizer and the string converter.

        ``feature_calculators`` wraps six composition featurizers in a single
        ``MultipleFeaturizer``; ``str2composition`` holds a
        ``StrToComposition`` instance.
        """
        # The order of this list is the order handed to MultipleFeaturizer.
        calculators = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(calculators)

        self.str2composition = StrToComposition()
Esempio n. 3
0
 def all(self):
     """Return every composition featurizer, passed through ``_get_featurizers``.

     The full candidate list is built here and handed to
     ``self._get_featurizers``; whatever that method returns is returned
     unchanged.
     """
     candidates = [
         cf.AtomicOrbitals(),
         # Element-property featurizers, one per preset.
         cf.ElementProperty.from_preset("matminer"),
         cf.ElementProperty.from_preset("magpie"),
         cf.ElementProperty.from_preset("matscholar_el"),
         cf.ElementProperty.from_preset("deml"),
         cf.ElementFraction(),
         cf.Stoichiometry(),
         cf.TMetalFraction(),
         cf.BandCenter(),
         cf.ValenceOrbital(),
         cf.YangSolidSolution(),
         cf.CationProperty.from_preset(preset_name='deml'),
         cf.OxidationStates.from_preset(preset_name='deml'),
         cf.ElectronAffinity(),
         cf.ElectronegativityDiff(),
         cf.IonProperty(),
         cf.Miedema(),
         # Noted by the original author as slower than the others.
         cf.AtomicPackingEfficiency(),
         # Noted by the original author as needing an mpid to be present.
         cf.CohesiveEnergy()
     ]
     return self._get_featurizers(candidates)
Esempio n. 4
0

# Every featurizer below reads the "composition" column of df_ft and is
# called with ignore_errors=True, so the shared keyword arguments live here.
featurize_kwargs = {"col_id": "composition", "ignore_errors": True}

# Element properties (magpie preset)
ep_feat = composition.ElementProperty.from_preset(preset_name="magpie")
df_ft = ep_feat.featurize_dataframe(df_ft, **featurize_kwargs)

# Atomic orbitals
ao_feat = composition.AtomicOrbitals()
df_ft = ao_feat.featurize_dataframe(df_ft, **featurize_kwargs)

# Band center
bc_feat = composition.BandCenter()
df_ft = bc_feat.featurize_dataframe(df_ft, **featurize_kwargs)

# Miedema
m_feat = composition.Miedema()
df_ft = m_feat.featurize_dataframe(df_ft, **featurize_kwargs)

# Stoichiometry
s_feat = composition.Stoichiometry()
df_ft = s_feat.featurize_dataframe(df_ft, **featurize_kwargs)

# Transition-metal fraction
tmf_feat = composition.TMetalFraction()
df_ft = tmf_feat.featurize_dataframe(df_ft, **featurize_kwargs)

# The three featurizers below are currently disabled; they follow the same
# construct-then-apply pattern and are kept here for reference.
# # Valence orbital
# vo_feat = composition.ValenceOrbital()
# df_ft = vo_feat.featurize_dataframe(df_ft, **featurize_kwargs)
# # Yang solid solution
# yss_feat = composition.YangSolidSolution()
# df_ft = yss_feat.featurize_dataframe(df_ft, **featurize_kwargs)
# # Atomic packing efficiency
# ape_feat = composition.AtomicPackingEfficiency()
# df_ft = ape_feat.featurize_dataframe(df_ft, **featurize_kwargs)

df_ft.shape
Esempio n. 5
0
      data.columns.tolist())

# Derive a composition and an integer formula for every structure so that
# entries describing the same composition can be grouped.
data['composition'] = data['structure'].apply(
    lambda structure: structure.composition)
data['integer_formula'] = data['composition'].apply(
    lambda comp: comp.get_integer_formula_and_factor()[0])

# Sort by energy above hull first, so that keeping the first duplicate of
# each formula retains the row with the smallest 'e_above_hull'.
data.sort_values(by='e_above_hull', ascending=True, inplace=True)
data.drop_duplicates(subset='integer_formula', keep='first', inplace=True)
print('Reduced dataset to {} unique compositions.'.format(len(data)))

# Discard the old index left over from sorting / de-duplicating.
data.reset_index(drop=True, inplace=True)

# Featurizers that take a composition as their input
composition_featurizers = [
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
    cf.ValenceOrbital(props=['frac']),
    cf.IonProperty(fast=True)
]
featurizer = MultipleFeaturizer(composition_featurizers)

# Compute the feature matrix with a single job
featurizer.set_n_jobs(1)
X = featurizer.featurize_many(data['composition'])

# Imputation followed by a random-forest regressor.
# NOTE(review): `Imputer` was removed from scikit-learn in 0.22 in favour of
# `sklearn.impute.SimpleImputer` -- confirm the pinned scikit-learn version.
model = Pipeline(steps=[
    ('imputer', Imputer()),
    ('model', RandomForestRegressor()),
])
model.fit(X, data['formation_energy_per_atom'])
print('Trained a RandomForest model')

# Save the model, featurizer, and data using pickle
Esempio n. 6
0
def load_data_zT():
    """Load thermoelectric features and responses, using a CSV cache if present.

    The function first tries to read the cached feature and response tables
    from the results directory.  If either file is missing it instead:

    1. queries Citrination for the PIFs of the hard-coded dataset,
    2. tabulates the three response columns and converts them to numbers,
    3. featurizes each chemical formula,
    4. keeps only the rows where all three responses are present,
    5. squares the Seebeck coefficient column, and
    6. writes both tables to the results directory for next time.

    Returns:
        tuple: ``(X_all, Y_all, sign, keys_response, prefix)`` where
        ``X_all`` is the 2-D feature array, ``Y_all`` is the matching
        response array whose columns follow ``keys_response``, ``sign`` is an
        array of ``+1``/``-1`` per response column (presumably the desired
        optimisation direction -- confirm with callers), ``keys_response``
        is the list of response column names, and ``prefix`` is a name that
        is not defined in this function (presumably a module-level global --
        verify).

    Raises:
        KeyError: If the cache is missing and the ``CITRINATION_API_KEY``
            environment variable is not set.
    """
    results_dir = setResDir()

    ## Metadata
    keys_response = [
        'Seebeck coefficient; squared', 'Electrical resistivity',
        'Thermal conductivity'
    ]
    sign = np.array([
        +1,  # Seebeck
        -1,  # Electric resistivity
        -1  # Thermal conductivity
    ])

    ## Load data, if possible
    # --------------------------------------------------
    try:
        # The first CSV column is the index written by ``to_csv`` below, so it
        # is dropped before taking the values.
        df_X_all = pd.read_csv(results_dir + file_features)
        X_all = df_X_all.drop(df_X_all.columns[0], axis=1).values

        df_Y_all = pd.read_csv(results_dir + file_responses)
        Y_all = df_Y_all.drop(df_Y_all.columns[0], axis=1).values
        print("Cached data loaded.")

    except FileNotFoundError:
        # Raw response column names as they appear in the tabulated PIFs;
        # defined once and reused for both the down-select and the filtering.
        keys_original = [
            'Seebeck coefficient', 'Electrical resistivity',
            'Thermal conductivity'
        ]

        ## Data Import
        # --------------------------------------------------
        # Initialize client
        print("Accessing data from Citrination...")
        site = 'https://citrination.com'  # Citrination
        client = CitrinationClient(api_key=os.environ['CITRINATION_API_KEY'],
                                   site=site)
        search_client = client.search
        # Room-temperature Seebeck dataset
        dataset_id = 178480  # ucsb_te_roomtemp_seebeck
        # NOTE(review): size=1000 presumably caps the number of returned hits;
        # confirm the dataset never exceeds that.
        system_query = PifSystemReturningQuery(
            size=1000,
            query=DataQuery(dataset=DatasetQuery(id=Filter(
                equal=str(dataset_id)))))

        query_result = search_client.pif_search(system_query)
        print("    Found {} PIFs in dataset {}.".format(
            query_result.total_num_hits, dataset_id))

        ## Wrangle
        # --------------------------------------------------
        pifs = [x.system for x in query_result.hits]
        # Utility function will tabularize PIFs
        df_response = pifs2df(pifs)
        # Down-select columns to play well with to_numeric
        df_response = df_response[keys_original]
        df_response = df_response.apply(pd.to_numeric)

        # Parse chemical compositions
        formulas = [pif.chemical_formula for pif in pifs]

        df_comp = pd.DataFrame(columns=['chemical_formula'], data=formulas)

        # Join
        df_data = pd.concat([df_comp, df_response], axis=1)
        print("    Accessed data.")

        # Featurize
        print("Featurizing data...")
        df_data['composition'] = df_data['chemical_formula'].apply(
            get_compostion)

        f = MultipleFeaturizer([
            cf.Stoichiometry(),
            cf.ElementProperty.from_preset("magpie"),
            cf.ValenceOrbital(props=['avg']),
            cf.IonProperty(fast=True)
        ])

        X = np.array(f.featurize_many(df_data['composition']))

        # Keep only rows where every response is present.  A positional
        # boolean mask is used because ``X`` is a plain NumPy array: indexing
        # it with DataFrame index labels is only correct when the frame
        # happens to carry a default 0..n-1 index.
        responses = df_data[keys_original]
        valid_rows = responses.notnull().all(axis=1).values
        X_all = X[valid_rows, :]
        # Boolean indexing copies, so the in-place squaring below does not
        # touch ``df_data``.
        Y_all = responses.values[valid_rows]

        # Manipulate columns for proper objective values
        Y_all[:, 0] = Y_all[:, 0]**2  # Squared seebeck
        print("    Data prepared; {0:} valid observations.".format(
            X_all.shape[0]))

        # Cache data
        pd.DataFrame(data=X_all).to_csv(results_dir + file_features)
        pd.DataFrame(data=Y_all, columns=keys_response).to_csv(results_dir +
                                                               file_responses)
        print("Data cached in results directory.")

    return X_all, Y_all, sign, keys_response, prefix