def featurize_bandstructure(self, df: pd.DataFrame) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with band structure features
    from matminer, as specified by the extendedMODFeaturizer preset.

    Arguments:
        df: the input dataframe with a "bandstructure" column containing
            pymatgen.electronic_structure.bandstructure.BandStructure objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.
    """
    if not self.band_featurizers:
        return pd.DataFrame([])

    LOG.info("Applying bandstructure featurizers...")
    df = df.copy()
    try:
        df = self._fit_apply_featurizers(df, self.band_featurizers,
                                         "bandstructure")
    except Exception:
        # Fall back to a direct featurize call if the fit-apply step fails.
        df = self.band_featurizers.featurize_dataframe(df=df,
                                                       col_id="bandstructure",
                                                       multiindex=True,
                                                       ignore_errors=True)
    # Flatten the multiindex into "Featurizer|feature" column names.
    df = df.rename(columns={"Input Data": ""})
    df.columns = df.columns.map("|".join).str.strip("|")
    return df
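# A minimal usage sketch for featurize_bandstructure (not part of the
# original pipeline). It assumes the preset object PRESET_HEBNES_2021 used in
# run_featurizer() below exposes the method defined above, and fetches a
# single band structure through pymatgen's legacy MPRester; the material id
# "mp-149" (silicon) is purely illustrative.
def _example_featurize_bandstructure(api_key: str) -> pd.DataFrame:
    from pymatgen.ext.matproj import MPRester

    with MPRester(api_key) as mpr:
        bs = mpr.get_bandstructure_by_material_id("mp-149")
    featurizer = preset.PRESET_HEBNES_2021()
    return featurizer.featurize_bandstructure(
        pd.DataFrame({"bandstructure": [bs]}))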
def get_featurized_data() -> pd.DataFrame:
    """Check whether featurized data is present in the data folder; if not,
    download and store it. Returns the featurized data as a dataframe.
    """
    featurized_data_path = (Path(__file__).resolve().parents[2] / "data" /
                            "raw" / "featurized")
    featurized_file_path = featurized_data_path / "featurized-11-04-2021.pkl"

    if not does_file_exist(featurized_file_path):
        # Unique url id for the figshare endpoint
        url = "https://ndownloader.figshare.com/files/26777699"
        file = wget.download(url)

        # Read and load the pickled dataframe
        with open(file, "rb") as f:
            df = pickle.load(f)

        # Make the directory if not present, then cache the dataframe
        featurized_data_path.mkdir(parents=True, exist_ok=True)
        df.to_pickle(featurized_file_path)
        os.remove(file)
    else:
        LOG.info("Reading data...")
        df = pd.read_pickle(featurized_file_path)
    return df
def featurize_dos(self, df: pd.DataFrame) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with density-of-states features
    from matminer, as specified by the extendedMODFeaturizer preset.

    Arguments:
        df: the input dataframe with a "dos" column containing
            pymatgen.electronic_structure.dos.Dos objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.
    """
    if not self.dos_featurizers:
        return pd.DataFrame([])

    LOG.info("Applying dos featurizers...")
    df = df.copy()
    try:
        df = self._fit_apply_featurizers(df, self.dos_featurizers, "dos")
    except Exception:
        # Fall back to a direct featurize call if the fit-apply step fails.
        df = self.dos_featurizers.featurize_dataframe(df, "dos",
                                                      multiindex=True,
                                                      ignore_errors=True,
                                                      fit_to_df=True)
    # Flatten the multiindex into "Featurizer|feature" column names.
    df = df.rename(columns={"Input Data": ""})
    df.columns = df.columns.map("|".join).str.strip("|")
    return df
def _sort(self, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
    bandgaps_tbmbj = np.empty(len(entries))
    bandgaps_tbmbj[:] = np.nan
    bandgaps_opt = np.copy(bandgaps_tbmbj)
    spillage = np.copy(bandgaps_tbmbj)

    LOG.info("total iterations: {}".format(len(entries)))
    for i, mp_icsd_list in tqdm(enumerate(entries["icsd_ids"])):
        for j, jarvis_icsd_list in enumerate(df["icsd"]):
            for icsd_mp in mp_icsd_list:
                for icsd_jarvis in jarvis_icsd_list:
                    if icsd_mp == int(icsd_jarvis):
                        bandgaps_tbmbj[i] = float(df["mbj_bandgap"].iloc[j])
                        bandgaps_opt[i] = float(
                            df["optb88vdw_bandgap"].iloc[j])
                        spillage[i] = float(df["spillage"].iloc[j])

    sorted_df = pd.DataFrame({
        "jarvis_bg_tbmbj": bandgaps_tbmbj,
        "jarvis_bg_opt": bandgaps_opt,
        "jarvis_spillage": spillage,
        "material_id": entries["material_id"],
    })
    sorted_df.to_pickle(self.interim_data_path)
    return sorted_df
def _sort(self, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
    bandgaps = np.empty(len(entries))
    bandgaps[:] = np.nan
    spacegroups = np.copy(bandgaps)
    ICSDs = np.copy(bandgaps)

    LOG.info("total iterations: {}".format(len(entries)))
    for i, icsd_list in tqdm(enumerate(entries["icsd_ids"])):
        for j, oqmd_icsd in enumerate(
                df["crystal_structure.cross_reference.icsd"]):
            for icsd in icsd_list:
                if icsd == oqmd_icsd:
                    spacegroups[i] = int(
                        df["crystal_structure.space_group_number"].iloc[j])
                    bandgaps[i] = df["oqmd.band_gap.value"].iloc[j]
                    ICSDs[i] = int(oqmd_icsd)

    sorted_df = pd.DataFrame({
        "oqmd_bg": bandgaps,
        "oqmd_sg": spacegroups,
        "oqmd_icsd": ICSDs,
        "material_id": entries["material_id"],
    })
    sorted_df.to_pickle(self.interim_data_path)
    return sorted_df
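# Toy illustration of the ICSD matching performed by _sort above: an OQMD
# row is copied onto the MP entry whose icsd_ids list contains its ICSD id.
# The two dataframes below are fabricated for shape only.
def _example_icsd_match() -> pd.DataFrame:
    entries = pd.DataFrame({
        "material_id": ["mp-1", "mp-2"],
        "icsd_ids": [[101, 102], [999]],
    })
    oqmd = pd.DataFrame({
        "crystal_structure.cross_reference.icsd": [102, 555],
        "oqmd.band_gap.value": [1.3, 2.0],
    })
    bandgaps = np.full(len(entries), np.nan)
    for i, icsd_list in enumerate(entries["icsd_ids"]):
        for j, oqmd_icsd in enumerate(
                oqmd["crystal_structure.cross_reference.icsd"]):
            if oqmd_icsd in icsd_list:
                bandgaps[i] = oqmd["oqmd.band_gap.value"].iloc[j]
    # Expected result: mp-1 matches ICSD 102 (gap 1.3), mp-2 stays NaN.
    return pd.DataFrame({"material_id": entries["material_id"],
                         "oqmd_bg": bandgaps})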
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame:
    # Query the Materials Data Facility for converged OQMD entries.
    mdf = MDFDataRetrieval(anonymous=True)
    df = mdf.get_dataframe(
        {
            "source_names": ["oqmd"],
            "match_fields": {"oqmd.converged": True},
        },
        unwind_arrays=False)

    # Keep only the needed columns and filter out zero band gaps.
    df = df[[
        "crystal_structure.space_group_number",
        "dft.exchange_correlation_functional", "material.composition",
        "crystal_structure.cross_reference.icsd", "oqmd.band_gap.value",
        "dc.relatedIdentifiers"
    ]]
    df = df[df["oqmd.band_gap.value"] > 0]

    df["crystal_structure.cross_reference.icsd"] = df[
        "crystal_structure.cross_reference.icsd"].fillna(0)
    df["crystal_structure.space_group_number"] = df[
        "crystal_structure.space_group_number"].astype(int)
    df["crystal_structure.cross_reference.icsd"] = df[
        "crystal_structure.cross_reference.icsd"].astype(int)
    df = df.reset_index(drop=True)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.raw_data_path)
    return df
def _sort(self, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
    bandgap = np.empty(len(entries))
    bandgap[:] = np.nan
    bandgap_fitted = np.copy(bandgap)
    spacegroup_orig = np.copy(bandgap)
    spacegroup_relax = np.copy(bandgap)
    ICSDs = np.copy(bandgap)

    LOG.info("total iterations: {}".format(len(entries)))
    for i, icsd_list in tqdm(enumerate(entries["icsd_ids"])):
        for j, aflow_icsd in enumerate(df["prototype"]):
            for icsd in eval(str(icsd_list)):
                if icsd == int(aflow_icsd.split("_")[-1][:-1]):
                    spacegroup_orig[i] = int(df["spacegroup_orig"].iloc[j])
                    spacegroup_relax[i] = int(df["spacegroup_relax"].iloc[j])
                    ICSDs[i] = int(aflow_icsd.split("_")[-1][:-1])
                    bandgap[i] = df["Egap"].iloc[j]
                    bandgap_fitted[i] = df["Egap_fit"].iloc[j]

    sorted_df = pd.DataFrame({
        "aflow_bg": bandgap,
        "aflow_bg_fit": bandgap_fitted,
        "aflow_sg_orig": spacegroup_orig,
        "aflow_sg_relax": spacegroup_relax,
        "aflow_icsd": ICSDs,
        "material_id": entries["material_id"],
    })
    sorted_df.to_pickle(self.data_dir / "interim" / "AFLOW" / "AFLOW.pkl")
    return sorted_df
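# The AFLOW ICSD catalog encodes the ICSD number at the end of the prototype
# string, and _sort above recovers it with split("_")[-1][:-1]. The sample
# prototype below is hypothetical; the exact trailing character stripped by
# [:-1] is an assumption inferred from that slice.
def _example_prototype_parsing() -> int:
    prototype = "Ag1Br1_ICSD_56551."
    icsd = int(prototype.split("_")[-1][:-1])
    assert icsd == 56551
    return icsd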
def get_dataframe(self, sorted: Optional[bool] = True) -> pd.DataFrame:
    if self._does_file_exist():
        df = pd.read_pickle(self.raw_data_path)
    else:
        df = self._apply_query(sorted=sorted)
    LOG.info("Done")
    return df
def _does_file_exist(self) -> bool:
    if os.path.exists(self.raw_data_path):
        LOG.info("Data path {} detected. Reading now...".format(
            self.raw_data_path))
        return True
    else:
        LOG.info("Data path {} not detected. Applying query now...".format(
            self.raw_data_path))
        return False
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame:
    # Download the JARVIS DFT-3D dataset from figshare, if not present.
    url = "https://ndownloader.figshare.com/files/22471022"
    js_tag = "jdft_3d-4-26-2020.json"
    path = str(os.path.join(os.path.dirname(__file__), js_tag))
    if not os.path.isfile(path):
        zfile = str(os.path.join(os.path.dirname(__file__), "tmp.zip"))
        r = requests.get(url)
        with open(zfile, "wb") as f:
            f.write(r.content)
        with zipfile.ZipFile(zfile, "r") as zipObj:
            zipObj.extractall(os.path.join(os.path.dirname(__file__)))
        os.remove(zfile)

    with open(path, "r") as f:
        data = json.load(f)
    if os.path.exists(path):
        os.remove(path)

    # Query: normalize missing values and drop entries without an ICSD id.
    df = pd.DataFrame(data)\
        .replace("na", np.nan)\
        .replace("None", np.nan)\
        .fillna(value=np.nan)\
        .dropna(subset=["icsd"])

    # The icsd column is not consistent in notation (stringified int,
    # stringified list or float), therefore we normalize every entry
    # to a list.
    icsd_list = []
    for icsd_jarvis in df["icsd"]:
        if isinstance(icsd_jarvis, str):
            if isinstance(eval(icsd_jarvis), int):
                icsd_list.append([eval(icsd_jarvis)])
            elif isinstance(eval(icsd_jarvis), list):
                icsd_list.append(eval(icsd_jarvis))
        elif isinstance(icsd_jarvis, float):
            icsd_list.append([icsd_jarvis])
    df["icsd"] = icsd_list
    df = df[df["optb88vdw_bandgap"] > 0].reset_index(drop=True)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.raw_data_path)
    return df
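# Toy check of the icsd normalization above (illustrative only): every
# variant of the raw field should collapse to a list. The raw values below
# are fabricated examples, not taken from JARVIS.
def _example_icsd_normalization() -> None:
    for raw, expected in [("12345", [12345]), ("[111, 222]", [111, 222]),
                          (333.0, [333.0])]:
        if isinstance(raw, str):
            parsed = eval(raw)
            normalized = [parsed] if isinstance(parsed, int) else parsed
        else:
            normalized = [raw]
        assert normalized == expected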
def apply_featurizers(criterion, properties, mpdr, featurizerObject):
    LOG.info("Downloading dos and bandstructure objects...")

    timeDownloadStart = time.time()
    df_portion = mpdr.get_dataframe(criteria=criterion, properties=properties)
    timeDownloadEnd = time.time()

    LOG.info(df_portion)
    df_time, df_portion = featurizerObject.featurize(df_portion)
    df_time["download_objects"] = [timeDownloadEnd - timeDownloadStart]
    return df_time, df_portion
def does_file_exist(filepath: Path) -> bool:
    """Checks if the given file path exists."""
    if os.path.exists(filepath):
        LOG.info("Data path detected:\n{}.".format(filepath))
        return True
    else:
        LOG.info("Data path\n{}\nnot detected. Downloading now...".format(
            filepath))
        return False
def get_data_AFLOW(self,
                   compound_list: list,
                   keys: list,
                   batch_size: int,
                   catalog: str = "icsd") -> Dict:
    """Make a query to AFLOW for each compound in compound_list.

    Args
    ----------
    compound_list : list (dim:N)
        A list of strings containing the full formula, e.g. H2O1 or Si1C1.
    keys : list (dim:M)
        A list containing the features of the compound, as found in the
        AFLUX documentation, e.g. Egap.
    batch_size : int
        Number of data entries to return per HTTP request.
    catalog : str
        "icsd" for ICSD.

    Returns
    -------
    dict
        A dictionary containing the resulting matching queries. Each
        compound can match several AFLOW entries.
    """
    index = 0
    aflow_dict = {k: [] for k in keys}
    for compound in tqdm(compound_list):
        LOG.info("Current query: {}".format(compound))

        results = search(catalog=catalog, batch_size=batch_size)\
            .filter(K.compound == compound)

        if len(results) > 0:
            for result in tqdm(results):
                for key in keys:
                    try:
                        aflow_dict[key].append(getattr(result, key))
                    except Exception:
                        # Keep the columns aligned if a key is missing.
                        aflow_dict[key].append("None")

            # Checkpoint the partial result every tenth compound.
            if index % 10 == 0:
                pd.DataFrame.from_dict(aflow_dict).to_pickle(
                    self.data_dir / "raw" / "AFLOW" / "new_AFLOW.pkl")
            index += 1
        else:
            LOG.info("No compound matching the search")
            continue
    return aflow_dict
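# Hypothetical usage sketch for get_data_AFLOW: the enclosing class name
# data_AFLOW and its constructor argument are assumptions, and the keys are
# a small, illustrative subset of AFLUX keywords.
def _example_aflow_query() -> Dict:
    retriever = data_AFLOW(data_dir=Path("data"))
    return retriever.get_data_AFLOW(compound_list=["Si1C1"],
                                    keys=["Egap", "spacegroup_relax"],
                                    batch_size=100,
                                    catalog="icsd")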
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame:
    # Query experimental band gaps from Citrine.
    cdr = CitrineDataRetrieval(api_key=self.API_KEY)

    criteria = {"data_type": "EXPERIMENTAL"}
    properties = ["Band gap"]
    common_fields = [
        "uid", "chemicalFormula", "references", "Crystallinity",
        "Structure", "Crystal structure"
    ]

    df = cdr.get_dataframe(criteria=criteria,
                           properties=properties,
                           common_fields=common_fields)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.raw_data_path)
    return df
def featurize_site(self,
                   df: pd.DataFrame,
                   aliases: Optional[Dict[str, str]] = None) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with site features, as specified
    by the extendedMODFeaturizer preset.

    Arguments:
        df: the input dataframe with a "structure" column containing
            pymatgen.Structure objects.
        aliases: optional dictionary to map matminer output column names
            to new aliases, mostly used for backwards-compatibility.

    Returns:
        pandas.DataFrame: the decorated DataFrame.
    """
    if not self.site_featurizers:
        return pd.DataFrame([])

    LOG.info("Applying site featurizers...")
    df = df.copy()
    df.columns = ["Input data|" + x for x in df.columns]

    for fingerprint in self.site_featurizers:
        site_stats_fingerprint = SiteStatsFingerprint(fingerprint,
                                                      stats=self.site_stats)
        df = site_stats_fingerprint.featurize_dataframe(
            df, "Input data|structure", multiindex=False, ignore_errors=True)

        fingerprint_name = fingerprint.__class__.__name__
        if aliases:
            fingerprint_name = aliases.get(fingerprint_name, fingerprint_name)
        if "|" not in fingerprint_name:
            fingerprint_name += "|"
        # Prefix the newly added columns with the fingerprint name.
        df.columns = [
            f"{fingerprint_name}{x}" if "|" not in x else x
            for x in df.columns
        ]
    return df
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame: # Add unique url id for figshare endpoint url = "https://ndownloader.figshare.com/files/26777717" file = wget.download(url) # Read and load pkl data with open(file, 'rb') as f: df = pickle.load(f) os.remove(file) # TODO : Add option to make new queries to AFLOW. This has to be # rewritten since AFLOW does not have MPID. """ try: MP = data_MP(API_KEY = self.MAPI_KEY) except: raise ValueError("AFLOW is dependent on MP data. Add MAPI_KEY argument\ to class constructor.") entries = MP.get_dataframe() df = self.sort_with_MP(df, entries) # Find if there are new entries in MP newEntries = entries[~entries["material_id"].isin(df["material_id"])] # Update if there are new entries if newEntries.shape[0]>0: keys = list(pd.read_pickle(self.data_dir / "raw" / "AFLOW" / "AFLOW_keywords.pkl")) LOG.info("New entries identified. Generating features for AFLOW...") AFLOW_portion = self.get_dataframe_AFLOW(compound_list=list(newEntries["full_formula"]), keys=keys, batch_size = 1000, catalog="icsd") AFLOW_portion = self._sort(AFLOW_portion, entries) df = pd.concat([df, AFLOW_portion]) df = sortByMPID(df) """ LOG.info("Writing to raw data...") df.to_pickle(self.data_dir / "raw" / "AFLOW" / "AFLOW.pkl") return df
def featurize_structure(self, df: pd.DataFrame) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with structural features from
    matminer, as specified by the extendedMODFeaturizer preset.
    Currently applies the set of all matminer structure features.

    Arguments:
        df: the input dataframe with a "structure" column containing
            pymatgen.Structure objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.
    """
    if not self.structure_featurizers:
        return pd.DataFrame([])

    LOG.info("Applying structure featurizers...")
    df = df.copy()
    df = self._fit_apply_featurizers(df, self.structure_featurizers,
                                     "structure")
    df.columns = df.columns.map("|".join).str.strip("|")
    return df
def _apply_query(self, sorted: Optional[bool] = True) -> pd.DataFrame:
    with MPRester(self.API_KEY) as mpr:
        # Initial criteria
        criteria = {
            # All compounds deemed similar to a structure in ICSD
            "icsd_ids": {"$gt": 0},
            "band_gap": {"$gt": 0.1},
        }
        # Features
        props = [
            "material_id", "full_formula", "icsd_ids", "spacegroup.number",
            "spacegroup.point_group", "band_gap", "run_type", "cif",
            "structure", "pretty_formula", "total_magnetization",
            "nelements", "efermi", "oxide_type"
        ]
        # Query
        df = pd.DataFrame(mpr.query(criteria=criteria, properties=props))

    # Remove unsupported MPIDs
    df = filterIDs(df)
    LOG.info("Current shape of dataframe after filter applied: {}".format(
        df.shape))

    # Sort by ascending MPID order
    if sorted:
        df = sortByMPID(df)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.raw_data_path)
    return df
def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with composition features from
    matminer, as specified by the extendedMODFeaturizer preset.
    Currently applies the set of all matminer composition features.

    Arguments:
        df: the input dataframe with a "structure" column containing
            pymatgen.Structure objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame, or an empty DataFrame
        if no composition/oxidation featurizers exist for this class.
    """
    if not (self.composition_featurizers
            or self.oxid_composition_featurizers):
        return pd.DataFrame([])

    df = df.copy()

    if self.composition_featurizers:
        LOG.info("Applying composition featurizers...")
        df["composition"] = df["structure"].apply(lambda s: s.composition)
        df = self._fit_apply_featurizers(df, self.composition_featurizers,
                                         "composition")
        df = df.rename(columns={"Input Data": ""})
        df.columns = df.columns.map("|".join).str.strip("|")

    if self.oxid_composition_featurizers:
        LOG.info("Applying oxidation state featurizers...")
        df = CompositionToOxidComposition().featurize_dataframe(
            df, "composition")
        df = self._fit_apply_featurizers(df,
                                         self.oxid_composition_featurizers,
                                         "composition_oxid")
        df = df.rename(columns={"Input Data": ""})
        df.columns = df.columns.map("|".join).str.strip("|")
    return df
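# Minimal usage sketch for featurize_composition (illustrative only): build
# a one-row dataframe holding a pymatgen Structure and decorate it. The
# preset class name is taken from run_featurizer() below; the structure
# lookup via the legacy MPRester and the id "mp-149" are assumptions.
def _example_featurize_composition(api_key: str) -> pd.DataFrame:
    from pymatgen.ext.matproj import MPRester

    with MPRester(api_key) as mpr:
        structure = mpr.get_structure_by_material_id("mp-149")
    featurizer = preset.PRESET_HEBNES_2021()
    return featurizer.featurize_composition(
        pd.DataFrame({"structure": [structure]}))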
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame:
    # Unique url id for the figshare endpoint
    url = "https://ndownloader.figshare.com/files/26922764"
    file = wget.download(url)

    # Read and load the pickled dataframe
    with open(file, "rb") as f:
        df = pickle.load(f)
    os.remove(file)

    # Get data from Materials Project
    try:
        MP = data_MP(API_KEY=self.MAPI_KEY)
    except Exception:
        raise ValueError("AFLOW-ML is dependent on MP data. Add a MAPI_KEY "
                         "argument to the class constructor.")
    entries = MP.get_dataframe()

    # Find if there are new entries in MP
    newEntries = entries[~entries["material_id"].isin(df["material_id"])]

    # Update if there are new entries
    if newEntries.shape[0] > 0:
        LOG.info("{} new entries identified. Generating features for "
                 "AFLOW-ML...".format(newEntries.shape[0]))
        AFLOWML_portion = self.calculate_dataframe(entries=newEntries)
        df = pd.concat([df, AFLOWML_portion])
        df = sortByMPID(df)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.data_dir / "raw" / "AFLOWML" / "AFLOWML.pkl")
    return df
def run_featurizer() -> None:
    """Run, and rerun, the featurization process for a large number of
    entries. By default, we use the initial query from Materials Project.
    Initialised by "make features".

    If the program stops, identify the mistake (most likely an error in
    Materials Project; add the entry to filterIDs), remove the raw data in
    the Materials Project data folder, and rerun with the "make features"
    command.
    """
    project_dir = Path(__file__).resolve().parents[2]
    data_dir = project_dir / "data"
    dotenv.load_dotenv(project_dir / ".env")

    MAPI_KEY = os.getenv("MAPI_KEY")
    MP = data_MP(API_KEY=MAPI_KEY)
    entries = MP.get_dataframe()
    material_ids = entries["material_id"]
    del entries, MP

    featurizerObject = preset.PRESET_HEBNES_2021()

    if Path(data_dir / "raw" / "featurizer" / "featurized.pkl").is_file():
        # A previous run was interrupted; resume from the last featurized
        # MPID. If errors are met, just rerun and this branch will run.
        LOG.info("In-progress featurized data identified. Reading now...")
        entries_featurized = pd.read_pickle(data_dir / "raw" / "featurizer" /
                                            "featurized.pkl")
        time_featurized = pd.read_csv(data_dir / "raw" / "featurizer" /
                                      "timing.csv")

        LOG.info("Last featurized MPID: {}".format(
            entries_featurized.index[-1]))
        howFar = material_ids[material_ids ==
                              entries_featurized.index[-1]].index.values

        # Test if the MPID index is the same, true if using the same dataset
        assert material_ids[howFar[0]] == entries_featurized.index[-1], \
            "Are you sure this is the same dataset as earlier?"

        LOG.info("Index: {}".format(howFar))
        LOG.info("Preparing for new featurized data starting with MPID: {}".
                 format(material_ids[howFar[0]]))

        # Back up the partial results before continuing.
        entries_featurized.to_pickle(
            data_dir / "raw" / "featurizer" /
            Path("featurized-upto-" + str(howFar[0]) + ".pkl"))
        time_featurized.to_csv(
            data_dir / "raw" / "featurizer" /
            Path("timing-upto-" + str(howFar[0]) + ".csv"))
        del entries_featurized, time_featurized

        df = featurize_by_material_id(material_ids[howFar[0] + 1:],
                                      featurizerObject, MAPI_KEY)
    else:
        # First time running the featurizers. Note: `entries` has been
        # deleted above, so we featurize from `material_ids` directly.
        df = featurize_by_material_id(material_ids, featurizerObject,
                                      MAPI_KEY)
def featurize_by_material_id(material_ids: np.array,
                             featurizerObject: featurizer.extendedMODFeaturizer,
                             MAPI_KEY: str,
                             writeToFile: bool = True) -> pd.DataFrame:
    """Run all of the preset featurizers on the entries belonging to the
    given material ids.

    Arguments:
        material_ids: an array of Materials Project ids to featurize.
        featurizerObject: the preset featurizer to apply.
        MAPI_KEY: the Materials Project API key.
        writeToFile: whether to checkpoint the featurized data to disk.

    Returns:
        The featurized DataFrame.
    """
    def apply_featurizers(criterion, properties, mpdr, featurizerObject):
        LOG.info("Downloading dos and bandstructure objects...")

        timeDownloadStart = time.time()
        df_portion = mpdr.get_dataframe(criteria=criterion,
                                        properties=properties)
        timeDownloadEnd = time.time()

        LOG.info(df_portion)
        df_time, df_portion = featurizerObject.featurize(df_portion)
        df_time["download_objects"] = [timeDownloadEnd - timeDownloadStart]
        return df_time, df_portion

    properties = [
        "material_id", "full_formula", "bandstructure", "dos", "structure"
    ]

    mpdr = MPDataRetrieval(MAPI_KEY)

    steps = 1
    leftover = len(material_ids) % steps

    df = pd.DataFrame({})
    df_timers = pd.DataFrame({})

    for i in tqdm(range(0, len(material_ids), steps)):
        portionReturned = True
        if not (i + steps > len(material_ids)):
            LOG.info(list(material_ids[i:i + steps]))
            criteria = {"task_id": {"$in": list(material_ids[i:i + steps])}}

            # Retry until the current batch is downloaded and featurized.
            while portionReturned:
                try:
                    df_time, df_portion = apply_featurizers(
                        criteria, properties, mpdr, featurizerObject)
                    portionReturned = False
                except Exception:
                    LOG.info("Except - try again.")

            # Add the ID to recognize the entries afterwards
            df_portion["material_id"] = material_ids[i:i + steps]

            df = pd.concat([df, df_portion])
            df_timers = pd.concat([df_timers, df_time])

            LOG.info("CURRENT SHAPE:{}".format(df.shape))
            if writeToFile:
                # Checkpoint progress after every batch.
                df.to_pickle(
                    Path(__file__).resolve().parents[2] / "data" / "raw" /
                    "featurizer" / "featurized.pkl")
                df_timers.to_csv(
                    Path(__file__).resolve().parents[2] / "data" / "raw" /
                    "featurizer" / "timing.csv")

    if leftover:
        LOG.info(list(material_ids[i:i + leftover]))
        criteria = {"task_id": {"$in": list(material_ids[i:i + leftover])}}
        df_time, df_portion = apply_featurizers(criteria, properties, mpdr,
                                                featurizerObject)
        df_portion["material_id"] = material_ids[i:i + leftover]

        df = pd.concat([df, df_portion])
        df_timers = pd.concat([df_timers, df_time])
        if writeToFile:
            df.to_pickle(
                Path(__file__).resolve().parents[2] / "data" / "raw" /
                "featurizer" / "featurized.pkl")
            df_timers.to_csv(
                Path(__file__).resolve().parents[2] / "data" / "raw" /
                "featurizer" / "timing.csv")
    return df
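# Minimal usage sketch for featurize_by_material_id: featurize two specific
# MPIDs without touching the checkpoint files. The ids are illustrative, and
# the preset class name is taken from run_featurizer() above.
def _example_featurize_two_ids(MAPI_KEY: str) -> pd.DataFrame:
    featurizerObject = preset.PRESET_HEBNES_2021()
    return featurize_by_material_id(
        material_ids=np.array(["mp-149", "mp-1143"]),
        featurizerObject=featurizerObject,
        MAPI_KEY=MAPI_KEY,
        writeToFile=False)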
def main():
    get_featurized_data()
    LOG.info("Done")