Beispiel #1
0
    def __init__(self, exclude=None):
        """Build the three groups of composition featurizers.

        The groups are stored on the instance as ``_fast_featurizers``,
        ``_slow_featurizers`` and ``_need_oxi_featurizers``.

        Args:
            exclude: Passed through unchanged to the parent initializer.
        """
        super(CompositionFeaturizers, self).__init__(exclude=exclude)

        # Element-property presets that belong to the fast group, in order.
        element_presets = ("matminer", "magpie")
        fast = [cf.AtomicOrbitals()]
        fast += [
            cf.ElementProperty.from_preset(name) for name in element_presets
        ]
        fast += [
            cf.ElementFraction(),
            cf.Stoichiometry(),
            cf.TMetalFraction(),
            cf.BandCenter(),
            cf.ValenceOrbital(),
        ]
        self._fast_featurizers = fast

        self._slow_featurizers = [
            cf.Miedema(),
            cf.AtomicPackingEfficiency(),  # slower than the rest
            cf.CohesiveEnergy(),  # requires mpid present
        ]

        # The two preset-based featurizers below share the 'deml' preset.
        # NOTE(review): going by the attribute name, this group presumably
        # needs oxidation-state-decorated compositions -- confirm.
        deml = {'preset_name': 'deml'}
        self._need_oxi_featurizers = [
            cf.CationProperty.from_preset(**deml),
            cf.OxidationStates.from_preset(**deml),
            cf.ElectronAffinity(),
            cf.ElectronegativityDiff(),
            cf.YangSolidSolution(),
            cf.IonProperty(),
        ]
Beispiel #2
0
 def express(self):
     """Return the featurizers selected for ``express``.

     A fixed list of six composition featurizers is built and handed to
     ``self._get_featurizers``; that call's result is returned as-is.
     """
     deml = {'preset_name': 'deml'}
     candidates = [
         cf.ElementProperty.from_preset("magpie"),
         cf.OxidationStates.from_preset(**deml),
         cf.ElectronAffinity(),
         cf.IonProperty(),
         cf.YangSolidSolution(),
         cf.Miedema(),
     ]
     selected = self._get_featurizers(candidates)
     return selected
Beispiel #3
0
    def __init__(self):
        """Create the feature calculators and the string converter.

        Sets ``self.feature_calculators`` to a ``MultipleFeaturizer`` that
        wraps six composition featurizers, and ``self.str2composition`` to
        a ``StrToComposition`` instance.
        """
        calculators = [
            cf.ElementProperty.from_preset(preset_name="magpie"),
            cf.Stoichiometry(),
            cf.ValenceOrbital(props=['frac']),
            cf.IonProperty(fast=True),
            cf.BandCenter(),
            cf.ElementFraction(),
        ]
        self.feature_calculators = MultipleFeaturizer(calculators)

        self.str2composition = StrToComposition()
Beispiel #4
0
 def all(self):
     """Return the featurizers selected for ``all``.

     The complete candidate list is assembled in a fixed order and handed
     to ``self._get_featurizers``; that call's result is returned as-is.
     """
     # Element-property presets, in the order they appear in the list.
     element_presets = ("matminer", "magpie", "matscholar_el", "deml")
     deml = {'preset_name': 'deml'}

     candidates = [cf.AtomicOrbitals()]
     candidates += [
         cf.ElementProperty.from_preset(name) for name in element_presets
     ]
     candidates += [
         cf.ElementFraction(),
         cf.Stoichiometry(),
         cf.TMetalFraction(),
         cf.BandCenter(),
         cf.ValenceOrbital(),
         cf.YangSolidSolution(),
         cf.CationProperty.from_preset(**deml),
         cf.OxidationStates.from_preset(**deml),
         cf.ElectronAffinity(),
         cf.ElectronegativityDiff(),
         cf.IonProperty(),
         cf.Miedema(),
         cf.AtomicPackingEfficiency(),  # slower than the rest
         cf.CohesiveEnergy(),  # requires mpid present
     ]
     return self._get_featurizers(candidates)
Beispiel #5
0
# Derive each entry's composition and the integer formula used for grouping
data['composition'] = data['structure'].apply(
    lambda struct: struct.composition)
data['integer_formula'] = data['composition'].apply(
    lambda comp: comp.get_integer_formula_and_factor()[0])

# Keep one row per integer formula: after the ascending sort, the first
# occurrence of each formula is the one with the lowest 'e_above_hull'
data.sort_values('e_above_hull', ascending=True, inplace=True)
data.drop_duplicates('integer_formula', keep='first', inplace=True)
print('Reduced dataset to {} unique compositions.'.format(len(data)))

# Renumber the rows after sorting and de-duplicating
data.reset_index(inplace=True, drop=True)

# Featurizer that takes the composition as input
composition_featurizers = [
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
    cf.ValenceOrbital(props=['frac']),
    cf.IonProperty(fast=True),
]
featurizer = MultipleFeaturizer(composition_featurizers)

# Compute the features with a single job
featurizer.set_n_jobs(1)
X = featurizer.featurize_many(data['composition'])

# Model: imputation step followed by a random forest regressor.
# NOTE(review): `Imputer` looks like the legacy scikit-learn class; newer
# releases provide `SimpleImputer` instead -- confirm the pinned version.
pipeline_steps = [('imputer', Imputer()), ('model', RandomForestRegressor())]
model = Pipeline(pipeline_steps)
model.fit(X, data['formation_energy_per_atom'])
print('Trained a RandomForest model')

# Persist the fitted model with pickle
with open('model.pkl', 'wb') as fp:
    pkl.dump(model, fp)
with open('featurizer.pkl', 'wb') as fp:
Beispiel #6
0
def load_data_zT():
    """Load featurized composition data and three measured responses.

    The function first tries to read two cached CSV files
    (``results_dir + file_features`` and ``results_dir + file_responses``,
    where ``results_dir`` comes from ``setResDir()`` and the file names are
    names defined outside this function). If either file is missing, the
    data are queried from Citrination, tabularized, featurized, filtered to
    rows where all three responses are present, and written back to the
    same two CSV files.

    Returns:
        tuple: ``(X_all, Y_all, sign, keys_response, prefix)`` where

        * ``X_all`` -- 2-D array of features, one row per valid observation.
        * ``Y_all`` -- 2-D array of responses in the column order of
          ``keys_response``; the first column is the squared Seebeck
          coefficient.
        * ``sign`` -- ``np.array([+1, -1, -1])``, one entry per response.
          NOTE(review): presumably the optimization direction of each
          response -- confirm with callers.
        * ``keys_response`` -- list of the three response column names.
        * ``prefix`` -- a name defined outside this function, returned
          unchanged.

    Raises:
        KeyError: If the cache is missing and the ``CITRINATION_API_KEY``
            environment variable is not set.
    """
    results_dir = setResDir()

    ## Metadata
    keys_response = [
        'Seebeck coefficient; squared', 'Electrical resistivity',
        'Thermal conductivity'
    ]
    sign = np.array([
        +1,  # Seebeck
        -1,  # Electric resistivity
        -1  # Thermal conductivity
    ])

    ## Load data, if possible
    # --------------------------------------------------
    try:
        # The first CSV column is the index written by `to_csv`; drop it.
        df_X_all = pd.read_csv(results_dir + file_features)
        X_all = df_X_all.drop(df_X_all.columns[0], axis=1).values

        df_Y_all = pd.read_csv(results_dir + file_responses)
        Y_all = df_Y_all.drop(df_Y_all.columns[0], axis=1).values
        print("Cached data loaded.")

    except FileNotFoundError:
        # Raw response columns as they appear in the tabularized PIFs.
        # Defined once so the column down-selection and the valid-row
        # filter below cannot drift apart.
        keys_original = [
            'Seebeck coefficient', 'Electrical resistivity',
            'Thermal conductivity'
        ]

        ## Data Import
        # --------------------------------------------------
        # Initialize client
        print("Accessing data from Citrination...")
        site = 'https://citrination.com'  # Citrination
        client = CitrinationClient(api_key=os.environ['CITRINATION_API_KEY'],
                                   site=site)
        search_client = client.search
        # Dataset to query
        dataset_id = 178480  # ucsb_te_roomtemp_seebeck
        # NOTE(review): the query asks for at most 1000 results; confirm
        # the dataset never exceeds that, otherwise hits are truncated.
        system_query = PifSystemReturningQuery(
            size=1000,
            query=DataQuery(dataset=DatasetQuery(id=Filter(
                equal=str(dataset_id)))))

        query_result = search_client.pif_search(system_query)
        print("    Found {} PIFs in dataset {}.".format(
            query_result.total_num_hits, dataset_id))

        ## Wrangle
        # --------------------------------------------------
        pifs = [x.system for x in query_result.hits]
        # Utility function will tabularize PIFs
        df_response = pifs2df(pifs)
        # Down-select columns to play well with to_numeric
        df_response = df_response[keys_original]
        df_response = df_response.apply(pd.to_numeric)

        # Parse chemical compositions
        formulas = [pif.chemical_formula for pif in pifs]

        df_comp = pd.DataFrame(columns=['chemical_formula'], data=formulas)

        # Join
        df_data = pd.concat([df_comp, df_response], axis=1)
        print("    Accessed data.")

        # Featurize
        print("Featurizing data...")
        df_data['composition'] = df_data['chemical_formula'].apply(
            get_compostion)

        f = MultipleFeaturizer([
            cf.Stoichiometry(),
            cf.ElementProperty.from_preset("magpie"),
            cf.ValenceOrbital(props=['avg']),
            cf.IonProperty(fast=True)
        ])

        X = np.array(f.featurize_many(df_data['composition']))

        # Keep only the rows in which all three responses are present
        index_valid_all = df_data[keys_original].dropna().index.values
        X_all = X[index_valid_all, :]
        Y_all = df_data[keys_original].iloc[index_valid_all].values

        # Manipulate columns for proper objective values
        Y_all[:, 0] = Y_all[:, 0]**2  # Squared seebeck
        print("    Data prepared; {0:} valid observations.".format(
            X_all.shape[0]))

        # Cache data
        pd.DataFrame(data=X_all).to_csv(results_dir + file_features)
        pd.DataFrame(data=Y_all, columns=keys_response).to_csv(results_dir +
                                                               file_responses)
        print("Data cached in results directory.")

    return X_all, Y_all, sign, keys_response, prefix
Beispiel #7
0
 # Sort ascending by delta_e so that, for duplicated compositions, the
 # row that is kept is the one with the lowest delta_e
 data.sort_values('delta_e', ascending=True, inplace=True)
 data.drop_duplicates('composition', keep='first', inplace=True)
 print('Removed %d/%d entries' %
       (original_count - len(data), original_count))
 # Keep only rows whose delta_e lies in the range [-20, 5], combining
 # the two bounds with a logical AND
 original_count = len(data)
 above_lower_bound = data['delta_e'] >= -20
 below_upper_bound = data['delta_e'] <= 5
 data = data[np.logical_and(above_lower_bound, below_upper_bound)]
 print('Removed %d/%d entries' %
       (original_count - len(data), original_count))
 print(data.head(3))
 # Set up the featurizers. Per the original note: element properties are
 # initialized from the MagpieData source, orbital electron counts are
 # returned, and elements are assumed to be in a single oxidation state
 composition_featurizers = [
     cf.Stoichiometry(),
     cf.ElementProperty.from_preset("magpie"),
     cf.ValenceOrbital(props=['avg']),
     cf.IonProperty(fast=False),
 ]
 feature_calculators = MultipleFeaturizer(composition_featurizers)
 # Get the feature names
 feature_labels = feature_calculators.feature_labels()
 # Compute the features
 data = feature_calculators.featurize_dataframe(
     data, col_id='composition_obj')
 print('Generated %d features' % len(feature_labels))
 print('Training set size:',
       'x'.join(map(str, data[feature_labels].shape)))
 # Drop rows that have a null value in any feature column
 original_count = len(data)
 has_missing_feature = data[feature_labels].isnull().any(axis=1)
 data = data[~has_missing_feature]
 print('Removed %d/%d entries' %
       (original_count - len(data), original_count))
 print(data.head(3))