def featurize_bandstructure(self, df: pd.DataFrame) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with band structure features
    from matminer, as specified by the extendedMODFeaturizer preset.

    Arguments:
        df: the input dataframe with a "bandstructure" column containing
            pymatgen.electronic_structure.bandstructure.BandStructure objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.
    """
    if not self.band_featurizers:
        return pd.DataFrame([])

    LOG.info("Applying bandstructure featurizers...")
    df = df.copy()
    try:
        df = self._fit_apply_featurizers(df, self.band_featurizers,
                                         "bandstructure")
    except Exception:
        # Fall back to a direct featurize call if the fit-apply step fails.
        df = self.band_featurizers.featurize_dataframe(df=df,
                                                       col_id="bandstructure",
                                                       multiindex=True,
                                                       ignore_errors=True)
    # Flatten the multiindex into "Featurizer|feature" column names.
    df = df.rename(columns={"Input Data": ""})
    df.columns = df.columns.map("|".join).str.strip("|")
    return df
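# A minimal usage sketch for featurize_bandstructure (not part of the
# original pipeline). It assumes the preset object PRESET_HEBNES_2021 used in
# run_featurizer() below exposes the method defined above, and fetches a
# single band structure through pymatgen's legacy MPRester; the material id
# "mp-149" (silicon) is purely illustrative.
def _example_featurize_bandstructure(api_key: str) -> pd.DataFrame:
    from pymatgen.ext.matproj import MPRester

    with MPRester(api_key) as mpr:
        bs = mpr.get_bandstructure_by_material_id("mp-149")
    featurizer = preset.PRESET_HEBNES_2021()
    return featurizer.featurize_bandstructure(
        pd.DataFrame({"bandstructure": [bs]}))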
def get_featurized_data() -> pd.DataFrame:
    """Check whether featurized data is present in the data folder; if not,
    download and store it. Returns the featurized data as a dataframe.
    """
    featurized_data_path = (Path(__file__).resolve().parents[2] / "data" /
                            "raw" / "featurized")
    featurized_file_path = featurized_data_path / "featurized-11-04-2021.pkl"

    if not does_file_exist(featurized_file_path):
        # Unique url id for the figshare endpoint
        url = "https://ndownloader.figshare.com/files/26777699"
        file = wget.download(url)

        # Read and load the pickled dataframe
        with open(file, "rb") as f:
            df = pickle.load(f)

        # Make the directory if not present, then cache the dataframe
        featurized_data_path.mkdir(parents=True, exist_ok=True)
        df.to_pickle(featurized_file_path)
        os.remove(file)
    else:
        LOG.info("Reading data...")
        df = pd.read_pickle(featurized_file_path)
    return df
def featurize_dos(self, df: pd.DataFrame) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with density-of-states features
    from matminer, as specified by the extendedMODFeaturizer preset.

    Arguments:
        df: the input dataframe with a "dos" column containing
            pymatgen.electronic_structure.dos.Dos objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.
    """
    if not self.dos_featurizers:
        return pd.DataFrame([])

    LOG.info("Applying dos featurizers...")
    df = df.copy()
    try:
        df = self._fit_apply_featurizers(df, self.dos_featurizers, "dos")
    except Exception:
        # Fall back to a direct featurize call if the fit-apply step fails.
        df = self.dos_featurizers.featurize_dataframe(df, "dos",
                                                      multiindex=True,
                                                      ignore_errors=True,
                                                      fit_to_df=True)
    # Flatten the multiindex into "Featurizer|feature" column names.
    df = df.rename(columns={"Input Data": ""})
    df.columns = df.columns.map("|".join).str.strip("|")
    return df
def _sort(self, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
    bandgaps_tbmbj = np.empty(len(entries))
    bandgaps_tbmbj[:] = np.nan
    bandgaps_opt = np.copy(bandgaps_tbmbj)
    spillage = np.copy(bandgaps_tbmbj)

    LOG.info("total iterations: {}".format(len(entries)))
    for i, mp_icsd_list in tqdm(enumerate(entries["icsd_ids"])):
        for j, jarvis_icsd_list in enumerate(df["icsd"]):
            for icsd_mp in mp_icsd_list:
                for icsd_jarvis in jarvis_icsd_list:
                    if icsd_mp == int(icsd_jarvis):
                        bandgaps_tbmbj[i] = float(df["mbj_bandgap"].iloc[j])
                        bandgaps_opt[i] = float(
                            df["optb88vdw_bandgap"].iloc[j])
                        spillage[i] = float(df["spillage"].iloc[j])

    sorted_df = pd.DataFrame({
        "jarvis_bg_tbmbj": bandgaps_tbmbj,
        "jarvis_bg_opt": bandgaps_opt,
        "jarvis_spillage": spillage,
        "material_id": entries["material_id"],
    })
    sorted_df.to_pickle(self.interim_data_path)
    return sorted_df
def _sort(self, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
    bandgaps = np.empty(len(entries))
    bandgaps[:] = np.nan
    spacegroups = np.copy(bandgaps)
    ICSDs = np.copy(bandgaps)

    LOG.info("total iterations: {}".format(len(entries)))
    for i, icsd_list in tqdm(enumerate(entries["icsd_ids"])):
        for j, oqmd_icsd in enumerate(
                df["crystal_structure.cross_reference.icsd"]):
            for icsd in icsd_list:
                if icsd == oqmd_icsd:
                    spacegroups[i] = int(
                        df["crystal_structure.space_group_number"].iloc[j])
                    bandgaps[i] = df["oqmd.band_gap.value"].iloc[j]
                    ICSDs[i] = int(oqmd_icsd)

    sorted_df = pd.DataFrame({
        "oqmd_bg": bandgaps,
        "oqmd_sg": spacegroups,
        "oqmd_icsd": ICSDs,
        "material_id": entries["material_id"],
    })
    sorted_df.to_pickle(self.interim_data_path)
    return sorted_df
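# Toy illustration of the ICSD matching performed by _sort above: an OQMD
# row is copied onto the MP entry whose icsd_ids list contains its ICSD id.
# The two dataframes below are fabricated for shape only.
def _example_icsd_match() -> pd.DataFrame:
    entries = pd.DataFrame({
        "material_id": ["mp-1", "mp-2"],
        "icsd_ids": [[101, 102], [999]],
    })
    oqmd = pd.DataFrame({
        "crystal_structure.cross_reference.icsd": [102, 555],
        "oqmd.band_gap.value": [1.3, 2.0],
    })
    bandgaps = np.full(len(entries), np.nan)
    for i, icsd_list in enumerate(entries["icsd_ids"]):
        for j, oqmd_icsd in enumerate(
                oqmd["crystal_structure.cross_reference.icsd"]):
            if oqmd_icsd in icsd_list:
                bandgaps[i] = oqmd["oqmd.band_gap.value"].iloc[j]
    # Expected result: mp-1 matches ICSD 102 (gap 1.3), mp-2 stays NaN.
    return pd.DataFrame({"material_id": entries["material_id"],
                         "oqmd_bg": bandgaps})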
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame:
    # Query the Materials Data Facility for converged OQMD entries.
    mdf = MDFDataRetrieval(anonymous=True)
    df = mdf.get_dataframe(
        {
            "source_names": ["oqmd"],
            "match_fields": {"oqmd.converged": True},
        },
        unwind_arrays=False)

    # Keep only the needed columns and filter out zero band gaps.
    df = df[[
        "crystal_structure.space_group_number",
        "dft.exchange_correlation_functional", "material.composition",
        "crystal_structure.cross_reference.icsd", "oqmd.band_gap.value",
        "dc.relatedIdentifiers"
    ]]
    df = df[df["oqmd.band_gap.value"] > 0]

    df["crystal_structure.cross_reference.icsd"] = df[
        "crystal_structure.cross_reference.icsd"].fillna(0)
    df["crystal_structure.space_group_number"] = df[
        "crystal_structure.space_group_number"].astype(int)
    df["crystal_structure.cross_reference.icsd"] = df[
        "crystal_structure.cross_reference.icsd"].astype(int)
    df = df.reset_index(drop=True)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.raw_data_path)
    return df
def _sort(self, df: pd.DataFrame, entries: pd.DataFrame) -> pd.DataFrame:
    bandgap = np.empty(len(entries))
    bandgap[:] = np.nan
    bandgap_fitted = np.copy(bandgap)
    spacegroup_orig = np.copy(bandgap)
    spacegroup_relax = np.copy(bandgap)
    ICSDs = np.copy(bandgap)

    LOG.info("total iterations: {}".format(len(entries)))
    for i, icsd_list in tqdm(enumerate(entries["icsd_ids"])):
        for j, aflow_icsd in enumerate(df["prototype"]):
            for icsd in eval(str(icsd_list)):
                if icsd == int(aflow_icsd.split("_")[-1][:-1]):
                    spacegroup_orig[i] = int(df["spacegroup_orig"].iloc[j])
                    spacegroup_relax[i] = int(df["spacegroup_relax"].iloc[j])
                    ICSDs[i] = int(aflow_icsd.split("_")[-1][:-1])
                    bandgap[i] = df["Egap"].iloc[j]
                    bandgap_fitted[i] = df["Egap_fit"].iloc[j]

    sorted_df = pd.DataFrame({
        "aflow_bg": bandgap,
        "aflow_bg_fit": bandgap_fitted,
        "aflow_sg_orig": spacegroup_orig,
        "aflow_sg_relax": spacegroup_relax,
        "aflow_icsd": ICSDs,
        "material_id": entries["material_id"],
    })
    sorted_df.to_pickle(self.data_dir / "interim" / "AFLOW" / "AFLOW.pkl")
    return sorted_df
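# The AFLOW ICSD catalog encodes the ICSD number at the end of the prototype
# string, and _sort above recovers it with split("_")[-1][:-1]. The sample
# prototype below is hypothetical; the exact trailing character stripped by
# [:-1] is an assumption inferred from that slice.
def _example_prototype_parsing() -> int:
    prototype = "Ag1Br1_ICSD_56551."
    icsd = int(prototype.split("_")[-1][:-1])
    assert icsd == 56551
    return icsd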
def get_dataframe(self, sorted: Optional[bool] = True) -> pd.DataFrame:
    if self._does_file_exist():
        df = pd.read_pickle(self.raw_data_path)
    else:
        df = self._apply_query(sorted=sorted)
    LOG.info("Done")
    return df
def _does_file_exist(self) -> bool:
    if os.path.exists(self.raw_data_path):
        LOG.info("Data path {} detected. Reading now...".format(
            self.raw_data_path))
        return True
    else:
        LOG.info("Data path {} not detected. Applying query now...".format(
            self.raw_data_path))
        return False
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame:
    # Download the JARVIS DFT-3D dataset from figshare, if not present.
    url = "https://ndownloader.figshare.com/files/22471022"
    js_tag = "jdft_3d-4-26-2020.json"
    path = str(os.path.join(os.path.dirname(__file__), js_tag))
    if not os.path.isfile(path):
        zfile = str(os.path.join(os.path.dirname(__file__), "tmp.zip"))
        r = requests.get(url)
        with open(zfile, "wb") as f:
            f.write(r.content)
        with zipfile.ZipFile(zfile, "r") as zipObj:
            zipObj.extractall(os.path.join(os.path.dirname(__file__)))
        os.remove(zfile)

    with open(path, "r") as f:
        data = json.load(f)
    if os.path.exists(path):
        os.remove(path)

    # Query: normalize missing values and drop entries without an ICSD id.
    df = pd.DataFrame(data)\
        .replace("na", np.nan)\
        .replace("None", np.nan)\
        .fillna(value=np.nan)\
        .dropna(subset=["icsd"])

    # The icsd column is not consistent in notation (stringified int,
    # stringified list or float), therefore we normalize every entry
    # to a list.
    icsd_list = []
    for icsd_jarvis in df["icsd"]:
        if isinstance(icsd_jarvis, str):
            if isinstance(eval(icsd_jarvis), int):
                icsd_list.append([eval(icsd_jarvis)])
            elif isinstance(eval(icsd_jarvis), list):
                icsd_list.append(eval(icsd_jarvis))
        elif isinstance(icsd_jarvis, float):
            icsd_list.append([icsd_jarvis])
    df["icsd"] = icsd_list
    df = df[df["optb88vdw_bandgap"] > 0].reset_index(drop=True)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.raw_data_path)
    return df
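# Toy check of the icsd normalization above (illustrative only): every
# variant of the raw field should collapse to a list. The raw values below
# are fabricated examples, not taken from JARVIS.
def _example_icsd_normalization() -> None:
    for raw, expected in [("12345", [12345]), ("[111, 222]", [111, 222]),
                          (333.0, [333.0])]:
        if isinstance(raw, str):
            parsed = eval(raw)
            normalized = [parsed] if isinstance(parsed, int) else parsed
        else:
            normalized = [raw]
        assert normalized == expected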
def apply_featurizers(criterion, properties, mpdr, featurizerObject):
    LOG.info("Downloading dos and bandstructure objects...")

    timeDownloadStart = time.time()
    df_portion = mpdr.get_dataframe(criteria=criterion, properties=properties)
    timeDownloadEnd = time.time()

    LOG.info(df_portion)
    df_time, df_portion = featurizerObject.featurize(df_portion)
    df_time["download_objects"] = [timeDownloadEnd - timeDownloadStart]
    return df_time, df_portion
def does_file_exist(filepath: Path) -> bool:
    """Checks if the given file path exists."""
    if os.path.exists(filepath):
        LOG.info("Data path detected:\n{}.".format(filepath))
        return True
    else:
        LOG.info("Data path\n{}\nnot detected. Downloading now...".format(
            filepath))
        return False
def get_data_AFLOW(self,
                   compound_list: list,
                   keys: list,
                   batch_size: int,
                   catalog: str = "icsd") -> Dict:
    """Make a query to AFLOW for each compound in compound_list.

    Args
    ----------
    compound_list : list (dim:N)
        A list of strings containing the full formula, e.g. H2O1 or Si1C1.
    keys : list (dim:M)
        A list containing the features of the compound, as found in the
        AFLUX documentation, e.g. Egap.
    batch_size : int
        Number of data entries to return per HTTP request.
    catalog : str
        "icsd" for ICSD.

    Returns
    -------
    dict
        A dictionary containing the resulting matching queries. Each
        compound can match several AFLOW entries.
    """
    index = 0
    aflow_dict = {k: [] for k in keys}
    for compound in tqdm(compound_list):
        LOG.info("Current query: {}".format(compound))

        results = search(catalog=catalog, batch_size=batch_size)\
            .filter(K.compound == compound)

        if len(results) > 0:
            for result in tqdm(results):
                for key in keys:
                    try:
                        aflow_dict[key].append(getattr(result, key))
                    except Exception:
                        # Keep the columns aligned if a key is missing.
                        aflow_dict[key].append("None")

            # Checkpoint the partial result every tenth compound.
            if index % 10 == 0:
                pd.DataFrame.from_dict(aflow_dict).to_pickle(
                    self.data_dir / "raw" / "AFLOW" / "new_AFLOW.pkl")
            index += 1
        else:
            LOG.info("No compound matching the search")
            continue
    return aflow_dict
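# Hypothetical usage sketch for get_data_AFLOW: the enclosing class name
# data_AFLOW and its constructor argument are assumptions, and the keys are
# a small, illustrative subset of AFLUX keywords.
def _example_aflow_query() -> Dict:
    retriever = data_AFLOW(data_dir=Path("data"))
    return retriever.get_data_AFLOW(compound_list=["Si1C1"],
                                    keys=["Egap", "spacegroup_relax"],
                                    batch_size=100,
                                    catalog="icsd")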
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame:
    # Query experimental band gaps from Citrine.
    cdr = CitrineDataRetrieval(api_key=self.API_KEY)

    criteria = {"data_type": "EXPERIMENTAL"}
    properties = ["Band gap"]
    common_fields = [
        "uid", "chemicalFormula", "references", "Crystallinity",
        "Structure", "Crystal structure"
    ]

    df = cdr.get_dataframe(criteria=criteria,
                           properties=properties,
                           common_fields=common_fields)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.raw_data_path)
    return df
def featurize_site(self,
                   df: pd.DataFrame,
                   aliases: Optional[Dict[str, str]] = None) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with site features, as specified
    by the extendedMODFeaturizer preset.

    Arguments:
        df: the input dataframe with a "structure" column containing
            pymatgen.Structure objects.
        aliases: optional dictionary to map matminer output column names
            to new aliases, mostly used for backwards-compatibility.

    Returns:
        pandas.DataFrame: the decorated DataFrame.
    """
    if not self.site_featurizers:
        return pd.DataFrame([])

    LOG.info("Applying site featurizers...")
    df = df.copy()
    df.columns = ["Input data|" + x for x in df.columns]

    for fingerprint in self.site_featurizers:
        site_stats_fingerprint = SiteStatsFingerprint(fingerprint,
                                                      stats=self.site_stats)
        df = site_stats_fingerprint.featurize_dataframe(
            df, "Input data|structure", multiindex=False, ignore_errors=True)

        fingerprint_name = fingerprint.__class__.__name__
        if aliases:
            fingerprint_name = aliases.get(fingerprint_name, fingerprint_name)
        if "|" not in fingerprint_name:
            fingerprint_name += "|"
        # Prefix the newly added columns with the fingerprint name.
        df.columns = [
            f"{fingerprint_name}{x}" if "|" not in x else x
            for x in df.columns
        ]
    return df
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame: # Add unique url id for figshare endpoint url = "https://ndownloader.figshare.com/files/26777717" file = wget.download(url) # Read and load pkl data with open(file, 'rb') as f: df = pickle.load(f) os.remove(file) # TODO : Add option to make new queries to AFLOW. This has to be # rewritten since AFLOW does not have MPID. """ try: MP = data_MP(API_KEY = self.MAPI_KEY) except: raise ValueError("AFLOW is dependent on MP data. Add MAPI_KEY argument\ to class constructor.") entries = MP.get_dataframe() df = self.sort_with_MP(df, entries) # Find if there are new entries in MP newEntries = entries[~entries["material_id"].isin(df["material_id"])] # Update if there are new entries if newEntries.shape[0]>0: keys = list(pd.read_pickle(self.data_dir / "raw" / "AFLOW" / "AFLOW_keywords.pkl")) LOG.info("New entries identified. Generating features for AFLOW...") AFLOW_portion = self.get_dataframe_AFLOW(compound_list=list(newEntries["full_formula"]), keys=keys, batch_size = 1000, catalog="icsd") AFLOW_portion = self._sort(AFLOW_portion, entries) df = pd.concat([df, AFLOW_portion]) df = sortByMPID(df) """ LOG.info("Writing to raw data...") df.to_pickle(self.data_dir / "raw" / "AFLOW" / "AFLOW.pkl") return df
def featurize_structure(self, df: pd.DataFrame) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with structural features from
    matminer, as specified by the extendedMODFeaturizer preset.
    Currently applies the set of all matminer structure features.

    Arguments:
        df: the input dataframe with a "structure" column containing
            pymatgen.Structure objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame.
    """
    if not self.structure_featurizers:
        return pd.DataFrame([])

    LOG.info("Applying structure featurizers...")
    df = df.copy()
    df = self._fit_apply_featurizers(df, self.structure_featurizers,
                                     "structure")
    df.columns = df.columns.map("|".join).str.strip("|")
    return df
def _apply_query(self, sorted: Optional[bool] = True) -> pd.DataFrame:
    with MPRester(self.API_KEY) as mpr:
        # Initial criteria
        criteria = {
            # All compounds deemed similar to a structure in ICSD
            "icsd_ids": {"$gt": 0},
            "band_gap": {"$gt": 0.1},
        }
        # Features
        props = [
            "material_id", "full_formula", "icsd_ids", "spacegroup.number",
            "spacegroup.point_group", "band_gap", "run_type", "cif",
            "structure", "pretty_formula", "total_magnetization",
            "nelements", "efermi", "oxide_type"
        ]
        # Query
        df = pd.DataFrame(mpr.query(criteria=criteria, properties=props))

    # Remove unsupported MPIDs
    df = filterIDs(df)
    LOG.info("Current shape of dataframe after filter applied: {}".format(
        df.shape))

    # Sort by ascending MPID order
    if sorted:
        df = sortByMPID(df)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.raw_data_path)
    return df
def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame:
    """Decorate the input pandas.DataFrame with composition features from
    matminer, as specified by the extendedMODFeaturizer preset.
    Currently applies the set of all matminer composition features.

    Arguments:
        df: the input dataframe with a "structure" column containing
            pymatgen.Structure objects.

    Returns:
        pandas.DataFrame: the decorated DataFrame, or an empty DataFrame
        if no composition/oxidation featurizers exist for this class.
    """
    if not (self.composition_featurizers
            or self.oxid_composition_featurizers):
        return pd.DataFrame([])

    df = df.copy()

    if self.composition_featurizers:
        LOG.info("Applying composition featurizers...")
        df["composition"] = df["structure"].apply(lambda s: s.composition)
        df = self._fit_apply_featurizers(df, self.composition_featurizers,
                                         "composition")
        df = df.rename(columns={"Input Data": ""})
        df.columns = df.columns.map("|".join).str.strip("|")

    if self.oxid_composition_featurizers:
        LOG.info("Applying oxidation state featurizers...")
        df = CompositionToOxidComposition().featurize_dataframe(
            df, "composition")
        df = self._fit_apply_featurizers(df,
                                         self.oxid_composition_featurizers,
                                         "composition_oxid")
        df = df.rename(columns={"Input Data": ""})
        df.columns = df.columns.map("|".join).str.strip("|")
    return df
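# Minimal usage sketch for featurize_composition (illustrative only): build
# a one-row dataframe holding a pymatgen Structure and decorate it. The
# preset class name is taken from run_featurizer() below; the structure
# lookup via the legacy MPRester and the id "mp-149" are assumptions.
def _example_featurize_composition(api_key: str) -> pd.DataFrame:
    from pymatgen.ext.matproj import MPRester

    with MPRester(api_key) as mpr:
        structure = mpr.get_structure_by_material_id("mp-149")
    featurizer = preset.PRESET_HEBNES_2021()
    return featurizer.featurize_composition(
        pd.DataFrame({"structure": [structure]}))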
def _apply_query(self, sorted: Optional[bool]) -> pd.DataFrame:
    # Unique url id for the figshare endpoint
    url = "https://ndownloader.figshare.com/files/26922764"
    file = wget.download(url)

    # Read and load the pickled dataframe
    with open(file, "rb") as f:
        df = pickle.load(f)
    os.remove(file)

    # Get data from Materials Project
    try:
        MP = data_MP(API_KEY=self.MAPI_KEY)
    except Exception:
        raise ValueError("AFLOW-ML is dependent on MP data. Add a MAPI_KEY "
                         "argument to the class constructor.")
    entries = MP.get_dataframe()

    # Find if there are new entries in MP
    newEntries = entries[~entries["material_id"].isin(df["material_id"])]

    # Update if there are new entries
    if newEntries.shape[0] > 0:
        LOG.info("{} new entries identified. Generating features for "
                 "AFLOW-ML...".format(newEntries.shape[0]))
        AFLOWML_portion = self.calculate_dataframe(entries=newEntries)
        df = pd.concat([df, AFLOWML_portion])
        df = sortByMPID(df)

    LOG.info("Writing to raw data...")
    df.to_pickle(self.data_dir / "raw" / "AFLOWML" / "AFLOWML.pkl")
    return df
def run_featurizer() -> None:
    """Run, and rerun, the featurization process for a large number of
    entries. By default, we use the initial query from Materials Project.
    Initialised by "make features".

    If the program stops, identify the mistake (most likely an error in
    Materials Project; add the entry to filterIDs), remove the raw data in
    the Materials Project data folder, and rerun with the "make features"
    command.
    """
    project_dir = Path(__file__).resolve().parents[2]
    data_dir = project_dir / "data"
    dotenv.load_dotenv(project_dir / ".env")

    MAPI_KEY = os.getenv("MAPI_KEY")
    MP = data_MP(API_KEY=MAPI_KEY)
    entries = MP.get_dataframe()
    material_ids = entries["material_id"]
    del entries, MP

    featurizerObject = preset.PRESET_HEBNES_2021()

    if Path(data_dir / "raw" / "featurizer" / "featurized.pkl").is_file():
        # A previous run was interrupted; resume from the last featurized
        # MPID. If errors are met, just rerun and this branch will run.
        LOG.info("In-progress featurized data identified. Reading now...")
        entries_featurized = pd.read_pickle(data_dir / "raw" / "featurizer" /
                                            "featurized.pkl")
        time_featurized = pd.read_csv(data_dir / "raw" / "featurizer" /
                                      "timing.csv")

        LOG.info("Last featurized MPID: {}".format(
            entries_featurized.index[-1]))
        howFar = material_ids[material_ids ==
                              entries_featurized.index[-1]].index.values

        # Test if the MPID index is the same, true if using the same dataset
        assert material_ids[howFar[0]] == entries_featurized.index[-1], \
            "Are you sure this is the same dataset as earlier?"

        LOG.info("Index: {}".format(howFar))
        LOG.info("Preparing for new featurized data starting with MPID: {}".
                 format(material_ids[howFar[0]]))

        # Back up the partial results before continuing.
        entries_featurized.to_pickle(
            data_dir / "raw" / "featurizer" /
            Path("featurized-upto-" + str(howFar[0]) + ".pkl"))
        time_featurized.to_csv(
            data_dir / "raw" / "featurizer" /
            Path("timing-upto-" + str(howFar[0]) + ".csv"))
        del entries_featurized, time_featurized

        df = featurize_by_material_id(material_ids[howFar[0] + 1:],
                                      featurizerObject, MAPI_KEY)
    else:
        # First time running the featurizers. Note: `entries` has been
        # deleted above, so we featurize from `material_ids` directly.
        df = featurize_by_material_id(material_ids, featurizerObject,
                                      MAPI_KEY)
def featurize_by_material_id(material_ids: np.array,
                             featurizerObject: featurizer.extendedMODFeaturizer,
                             MAPI_KEY: str,
                             writeToFile: bool = True) -> pd.DataFrame:
    """Run all of the preset featurizers on the entries belonging to the
    given material ids.

    Arguments:
        material_ids: an array of Materials Project ids to featurize.
        featurizerObject: the preset featurizer to apply.
        MAPI_KEY: the Materials Project API key.
        writeToFile: whether to checkpoint the featurized data to disk.

    Returns:
        The featurized DataFrame.
    """
    def apply_featurizers(criterion, properties, mpdr, featurizerObject):
        LOG.info("Downloading dos and bandstructure objects...")

        timeDownloadStart = time.time()
        df_portion = mpdr.get_dataframe(criteria=criterion,
                                        properties=properties)
        timeDownloadEnd = time.time()

        LOG.info(df_portion)
        df_time, df_portion = featurizerObject.featurize(df_portion)
        df_time["download_objects"] = [timeDownloadEnd - timeDownloadStart]
        return df_time, df_portion

    properties = [
        "material_id", "full_formula", "bandstructure", "dos", "structure"
    ]

    mpdr = MPDataRetrieval(MAPI_KEY)

    steps = 1
    leftover = len(material_ids) % steps

    df = pd.DataFrame({})
    df_timers = pd.DataFrame({})

    for i in tqdm(range(0, len(material_ids), steps)):
        portionReturned = True
        if not (i + steps > len(material_ids)):
            LOG.info(list(material_ids[i:i + steps]))
            criteria = {"task_id": {"$in": list(material_ids[i:i + steps])}}

            # Retry until the current batch is downloaded and featurized.
            while portionReturned:
                try:
                    df_time, df_portion = apply_featurizers(
                        criteria, properties, mpdr, featurizerObject)
                    portionReturned = False
                except Exception:
                    LOG.info("Except - try again.")

            # Add the ID to recognize the entries afterwards
            df_portion["material_id"] = material_ids[i:i + steps]

            df = pd.concat([df, df_portion])
            df_timers = pd.concat([df_timers, df_time])

            LOG.info("CURRENT SHAPE:{}".format(df.shape))
            if writeToFile:
                # Checkpoint progress after every batch.
                df.to_pickle(
                    Path(__file__).resolve().parents[2] / "data" / "raw" /
                    "featurizer" / "featurized.pkl")
                df_timers.to_csv(
                    Path(__file__).resolve().parents[2] / "data" / "raw" /
                    "featurizer" / "timing.csv")

    if leftover:
        LOG.info(list(material_ids[i:i + leftover]))
        criteria = {"task_id": {"$in": list(material_ids[i:i + leftover])}}
        df_time, df_portion = apply_featurizers(criteria, properties, mpdr,
                                                featurizerObject)
        df_portion["material_id"] = material_ids[i:i + leftover]

        df = pd.concat([df, df_portion])
        df_timers = pd.concat([df_timers, df_time])
        if writeToFile:
            df.to_pickle(
                Path(__file__).resolve().parents[2] / "data" / "raw" /
                "featurizer" / "featurized.pkl")
            df_timers.to_csv(
                Path(__file__).resolve().parents[2] / "data" / "raw" /
                "featurizer" / "timing.csv")
    return df
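# Minimal usage sketch for featurize_by_material_id: featurize two specific
# MPIDs without touching the checkpoint files. The ids are illustrative, and
# the preset class name is taken from run_featurizer() above.
def _example_featurize_two_ids(MAPI_KEY: str) -> pd.DataFrame:
    featurizerObject = preset.PRESET_HEBNES_2021()
    return featurize_by_material_id(
        material_ids=np.array(["mp-149", "mp-1143"]),
        featurizerObject=featurizerObject,
        MAPI_KEY=MAPI_KEY,
        writeToFile=False)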
def main():
    get_featurized_data()
    LOG.info("Done")