Python save_to_hdf Examples

Programming Language: Python

Namespace/Package Name: radis.io.cache_files

Method/Function: save_to_hdf

Examples at hotexamples.com: 4

Python save_to_hdf - 4 examples found. These are the top-rated real-world Python examples of radis.io.cache_files.save_to_hdf, extracted from open source projects. You can rate examples to help us improve their quality.

Example #1

Show file

File: partfunc_cdsd.py Project: BlehMaks/radis-1

    def __init__(
        self,
        energy_levels,
        isotope,
        levelsfmt,  # ='cdsd-pc',
        use_cached=True,
        use_json=None,
        verbose=True,
    ):
        """Load CO2 CDSD energy levels, using an HDF5 cache file when available.

        The levels are stored in ``self.df``. They are read either from the
        cache file ``energy_levels + ".h5"`` (through ``load_h5_cache_file``)
        or, when that returns ``None``, parsed from ``energy_levels`` with
        ``pd.read_csv`` and completed by ``self._add_degeneracies`` and
        ``self._add_levels``.

        Parameters
        ----------
        energy_levels: str
            path of the energy level file. Parsed as a whitespace-delimited
            table where ``#`` starts a comment. The cache file name is this
            path with ``.h5`` appended.
        isotope: int
            isotope number. Only ``1`` and ``2`` are accepted.
        levelsfmt: ``'cdsd-p'``, ``'cdsd-pc'``, ``'cdsd-pcN'``, ``'cdsd-hamil'`` or ``None``
            format of the energy database; selects the vibrational level
            label stored in ``self.viblvl_label``. With ``None`` the levels
            are not labelled.
        use_cached: ``True``, ``False``, ``'regen'``, ``'force'`` or ``'return'``
            cache mode, forwarded to ``load_h5_cache_file``. With
            ``'return'`` the constructor returns right after
            ``self.cachefile`` has been set, without loading any level.
        use_json: deprecated
            if not ``None``, a ``DeprecationWarning`` is emitted. The value
            is otherwise unused.
        verbose: boolean or int
            forwarded to the cache helpers. If ``>= 2``, the last
            modification time is printed.

        Raises
        ------
        ValueError
            if ``isotope`` is not 1 or 2, or if ``levelsfmt`` is not one of
            the supported formats.
        AssertionError
            if ``use_cached`` is not one of the accepted values.
        """

        # %% Init

        # Initialize PartitionFunctionCalculator for this electronic state
        ElecState = ElectronicState("CO2", isotope, "X", "1Σu+")
        super(PartFuncCO2_CDSDcalc, self).__init__(ElecState)

        # Check inputs ('return' is not mentioned in the signature: it makes the
        # constructor return right after the cache file name is set)
        assert use_cached in [True, False, "regen", "force", "return"]
        if isotope not in [1, 2]:
            raise ValueError(
                "CDSD Energies not defined for isotope: {0}".format(isotope))
        if use_json is not None:
            warn(
                DeprecationWarning(
                    "use_json replaced with faster HDF5-based use_cached"))
        # Get vibrational level definitions that match Energy Database (in particular
        # how Evib and Erot are calculated)
        # This is needed to be able to match the levels in the Line Database and
        # the levels in the Energy database
        if levelsfmt == "cdsd-p":
            viblvl_label = "p"
        elif levelsfmt == "cdsd-pc":
            viblvl_label = "pc"
        elif levelsfmt == "cdsd-pcN":
            viblvl_label = "pcN"
        elif levelsfmt == "cdsd-hamil":
            viblvl_label = "pcJN"
        elif levelsfmt is None:
            # Don't label the levels. It won't be possible to use the EnergyDatabase
            # to fetch vibrational energies for lines, however it can still be used to
            # calculate Partition functions independently from a Spectrum calculation
            viblvl_label = None
        else:
            raise ValueError(
                "Unknown Energy database format: levelsfmt = `{0}`".format(
                    levelsfmt) +
                ". Use one of: `cdsd-p`, `cdsd-pc`, `cdsd-pcN`,`cdsd-hamil`")

        # Store defaults
        self.verbose = verbose
        self.use_cached = use_cached
        self.levelsfmt = levelsfmt
        self.viblvl_label = viblvl_label
        # NOTE(review): the timestamp is read from the bundled test file
        # `co2_cdsd_hamiltonian_fragment.levels`, not from `energy_levels` --
        # confirm this is intended.
        self.last_modification = time.ctime(
            getmtime(getTestFile(r"co2_cdsd_hamiltonian_fragment.levels")))
        if verbose >= 2:
            print("Last modification time: {0}".format(self.last_modification))

        # Get variables to store in metadata  (after default values have been set)
        molecule = "CO2"  # will be stored in cache file metadata
        # Same computation as self.last_modification above, repeated as a local
        # so that it is part of locals() below.
        last_modification = time.ctime(
            getmtime(getTestFile(r"co2_cdsd_hamiltonian_fragment.levels")))

        # Names removed from the locals() snapshot before it is used as metadata
        _discard = [
            "self",
            "energy_levels",
            "verbose",
            "ElecState",
            "electronic_state",
            "use_json",
            "use_cached",
        ]
        # (dev) locals() automatically stores all variables: levelsfmt, viblvl_label, etc.
        # Consequently, renaming or adding a local variable above this line
        # changes what is handed to filter_metadata.
        metadata = filter_metadata(locals(), discard_variables=_discard)

        # %% Get levels
        # Function of use_cached value:
        # ... if True, use (and generate if doesnt exist) cache file.
        # ... if 'regen', regenerate cache file. If 'force', raise an error
        # ... if file doesnt exist.
        # If file is deprecated, regenerate it unless 'force' was used

        # Load cache file if exists

        cachefile = energy_levels + ".h5"
        self.cachefile = cachefile

        # If return, return after cachefile generated (used for tests)
        if use_cached == "return":
            return

        # A None result means no usable cache was loaded; the raw file is then
        # parsed below.
        df = load_h5_cache_file(
            cachefile,
            use_cached,
            valid_if_metadata_is=metadata,
            relevant_if_metadata_above={},
            relevant_if_metadata_below={},
            current_version=radis.__version__,
            last_compatible_version=OLDEST_COMPATIBLE_VERSION,
            verbose=verbose,
        )

        if df is None:  # Read normal file
            df = pd.read_csv(energy_levels, comment="#", delim_whitespace=True)
            df = self._add_degeneracies(df)
            df = self._add_levels(df)

        self.df = df  # Store

        # Write the cache only when caching is enabled and no cache file is on
        # disk at this point.
        # NOTE(review): for 'regen' this relies on load_h5_cache_file having
        # removed the old file -- confirm.
        if use_cached and not exists(cachefile):
            save_to_hdf(
                self.df,
                cachefile,
                metadata=metadata,
                version=radis.__version__,
                key="df",
                overwrite=True,
                verbose=verbose,
            )

Example #2

Show file

File: query.py Project: BlehMaks/radis-1

def fetch_astroquery(
    molecule, isotope, wmin, wmax, verbose=True, cache=True, expected_metadata=None
):
    """Download a HITRAN line database to a Pandas DataFrame.

    Wrapper to Astroquery [1]_ fetch function

    Parameters
    ----------
    molecule: str, or int
        molecule name or identifier
    isotope: int
        isotope number
    wmin, wmax: float  (cm-1)
        wavenumber min and max

    Other Parameters
    ----------------
    verbose: boolean
        Default ``True``
    cache: boolean or ``'regen'``
        if ``True``, tries to find a ``.h5`` cache file in the Astroquery
        :py:attr:`~astroquery.query.BaseQuery.cache_location`, that would match
        the requirements. If not found, downloads it and saves the line dataframe
        as a ``.h5`` file in the Astroquery cache location.
        If ``'regen'``, delete existing cache file to regenerate it.
    expected_metadata: dict, optional
        if ``cache=True``, check that the metadata in the cache file correspond
        to these attributes. Arguments ``molecule``, ``isotope``, ``wmin``, ``wmax``
        are already added by default. The dictionary given by the caller is
        copied, never modified. Default ``None`` (no extra attribute).

    Returns
    -------
    df: pandas DataFrame
        line database, with columns renamed from the Astroquery names to the
        RADIS names. Empty (columns only) if the server answered
        ``404 Not Found`` for the requested range.

    Raises
    ------
    KeyError
        if Astroquery raised a ``KeyError`` during the query.
    ValueError
        if the server answered with a 404 whose reason is not ``"Not Found"``,
        or with a 500.

    Notes
    -----
    The HITRAN module in Astroquery [1]_ is itself based on [HAPI]_

    References
    ----------
    .. [1] `Astroquery <https://astroquery.readthedocs.io>`_

    See Also
    --------
    :py:func:`astroquery.hitran.reader.download_hitran`,
    :py:func:`astroquery.hitran.reader.read_hitran_file`,
    :py:attr:`~astroquery.query.BaseQuery.cache_location`

    """
    # Check input
    if not is_float(molecule):
        mol_id = get_molecule_identifier(molecule)
    else:
        mol_id = molecule
        molecule = get_molecule(mol_id)
    assert is_float(isotope)

    empty_range = False

    if cache:
        # Cache file location in Astroquery cache
        # TODO: move full HITRAN databases in ~/radisdb cache like io/hitemp/fetch_hitemp ?
        fcache = join(
            Hitran.cache_location,
            CACHE_FILE_NAME.format(
                **{"molecule": molecule, "isotope": isotope, "wmin": wmin, "wmax": wmax}
            ),
        )
        # ... Update metadata with physical properties from the database.
        # Work on a private copy: updating the argument in place would modify
        # the caller's dictionary (and, with a mutable default, leak values
        # from one call into the next).
        expected_metadata = dict(expected_metadata or {})
        expected_metadata.update(
            {"molecule": molecule, "isotope": isotope, "wmin": wmin, "wmax": wmax}
        )
        if cache == "regen":
            if exists(fcache):
                if verbose:
                    print(f"Cache file {fcache} deleted to be regenerated")
                os.remove(fcache)
        else:
            # Load cache file if valid
            check_cache_file(
                fcache=fcache,
                use_cached=cache,
                expected_metadata=expected_metadata,
                verbose=verbose,
            )
            if exists(fcache):
                try:
                    return get_cache_file(fcache, verbose=verbose)
                except Exception as err:
                    # An unreadable cache file is discarded; the data is then
                    # downloaded again below.
                    if verbose:
                        printr(
                            "Problem reading cache file {0}:\n{1}\nDeleting it!".format(
                                fcache, str(err)
                            )
                        )
                    os.remove(fcache)

    # Download using the astroquery library
    try:
        response = Hitran.query_lines_async(
            molecule_number=mol_id,
            isotopologue_number=isotope,
            min_frequency=wmin / u.cm,
            max_frequency=wmax / u.cm,
        )
    except KeyError as err:
        raise KeyError(
            str(err)
            + " <<w this error occured in Astroquery. Maybe these molecule "
            + "({0}) and isotope ({1}) are not supported".format(molecule, isotope)
        ) from err

    # Deal with usual errors
    if response.status_code == 404:
        # Maybe there are just no lines for this species in this range
        # In that case we usually end up with errors like:

        # (<class 'Exception'>, Exception('Query failed: 404 Client Error:
        # Not Found for url: http://hitran.org/lbl/api?numax=25000&numin=19000&iso_ids_list=69\n',),
        # <traceback object at 0x7f0967c91708>)

        if response.reason == "Not Found":
            # Let's bet it's just that there are no lines in this range
            empty_range = True
            if verbose:
                print(
                    (
                        "No lines for {0} (id={1}), iso={2} in range {3:.2f}-{4:.2f}cm-1. ".format(
                            molecule, mol_id, isotope, wmin, wmax
                        )
                    )
                )
        else:
            raise ValueError(
                "An error occured during the download of HITRAN files "
                + "for {0} (id={1}), iso={2} between {3:.2f}-{4:.2f}cm-1. ".format(
                    molecule, mol_id, isotope, wmin, wmax
                )
                + "Are you online?\n"
                + "See details of the error below:\n\n {0}".format(response.reason)
            )
    elif response.status_code == 500:

        raise ValueError(
            "{0} while querying the HITRAN server: ".format(response.status_code)
            + "\n\n{0}".format(response.text)
        )

    # Process response

    # Rename columns from Astroquery to RADIS format
    rename_columns = {
        "molec_id": "id",
        "local_iso_id": "iso",
        "nu": "wav",
        "sw": "int",
        "a": "A",
        "gamma_air": "airbrd",
        "gamma_self": "selbrd",
        "elower": "El",
        "n_air": "Tdpair",
        "delta_air": "Pshft",
        "global_upper_quanta": "globu",
        "global_lower_quanta": "globl",
        "local_upper_quanta": "locu",
        "local_lower_quanta": "locl",
        "line_mixing_flag": "lmix",
        "gp": "gp",
        "gpp": "gpp",
    }

    if not empty_range:
        tbl = Hitran._parse_result(response)
        df = tbl.to_pandas()
        df = df.rename(columns=rename_columns)
    else:
        # No lines in range: return an empty frame that still has the
        # expected RADIS columns.
        df = pd.DataFrame(columns=list(rename_columns.values()))

    # Cast type to float64
    cast_type = {
        "wav": np.float64,
        "int": np.float64,
        "A": np.float64,
        "airbrd": np.float64,
        "selbrd": np.float64,
        "El": np.float64,
        "Tdpair": np.float64,
        "Pshft": np.float64,
    }
    for c, typ in cast_type.items():
        df[c] = df[c].astype(typ)

    # cached file mode but cached file doesn't exist yet (else we had returned)
    if cache:
        new_metadata = {
            "molecule": molecule,
            "isotope": isotope,
            "wmin": wmin,
            "wmax": wmax,
        }
        if verbose:
            print(
                "Generating cache file {0} with metadata :\n{1}".format(
                    fcache, new_metadata
                )
            )
        try:
            save_to_hdf(
                df,
                fcache,
                metadata=new_metadata,
                version=radis.__version__,
                key="df",
                overwrite=True,
                verbose=verbose,
            )
        except PermissionError:
            # Caching is best-effort: a read-only cache location must not
            # prevent returning the downloaded data.
            if verbose:
                print(sys.exc_info())
                print("An error occured in cache file generation. Lookup access rights")

    return df

Example #3

Show file

File: hitran.py Project: radis/radis

def hit2df(
    fname,
    cache=True,
    verbose=True,
    drop_non_numeric=True,
    load_wavenum_min=None,
    load_wavenum_max=None,
):
    """Parse a HITRAN/HITEMP [1]_ line file into a Pandas dataframe.

    Parameters
    ----------
    fname: str
        HITRAN-HITEMP file name
    cache: boolean, or ``'regen'`` or ``'force'``
        if ``True``, a pandas-readable HDF5 file is written on first access
        and read on later calls. This skips the datatype cast and conversion
        and is much faster (but changes in the database are not taken into
        account). If ``False``, no cache file is used. If ``'regen'``, the
        cache file is rebuilt. Default ``True``.

    Other Parameters
    ----------------
    verbose: boolean or int
        if ``>= 2``, also print the file being opened and its last
        modification time. Default ``True``.
    drop_non_numeric: boolean
        if ``True``, non numeric columns are dropped. This improves
        performances, but make sure all the columns you need are converted
        to numeric formats beforehand. Default ``True``. A dataframe loaded
        from a cache file is returned untouched.
    load_wavenum_min, load_wavenum_max: float
        if not ``None``, only load the cached file if it contains data for
        wavenumbers above/below the specified value. See
        :py:func:`~radis.io.cache_files.load_h5_cache_file`.
        Default ``None``.

    Returns
    -------
    df: pandas Dataframe
        dataframe containing all lines and parameters

    Raises
    ------
    ValueError
        if the file cannot be decoded as text, if the databank is empty, or
        if it contains more than one molecule.

    References
    ----------

    .. [1] `HITRAN 1996, Rothman et al., 1998 <https://www.sciencedirect.com/science/article/pii/S0022407398000788>`__

    Notes
    -----

    Performances: see CDSD-HITEMP parser

    See Also
    --------

    :func:`~radis.io.cdsd.cdsd2df`
    """
    # Last modification time of the original file; compared with the cache
    source_mtime = time.ctime(getmtime(fname))
    metadata = {"last_modification": source_mtime}
    if verbose >= 2:
        print("Opening file {0} (cache={1})".format(fname, cache))
        print("Last modification time: {0}".format(source_mtime))
    if load_wavenum_min and load_wavenum_max:
        assert load_wavenum_min < load_wavenum_max

    columns = columns_2004

    # Try the cache file first
    fcache = cache_file_name(fname)
    if cache and exists(fcache):
        # cache is not relevant if its wavenum_max is < wavenum min required
        only_if_above = {}
        if load_wavenum_min:
            only_if_above["wavenum_max"] = load_wavenum_min
        # cache is not relevant if its wavenum_min is > wavenum max required
        only_if_below = {}
        if load_wavenum_max:
            only_if_below["wavenum_min"] = load_wavenum_max
        cached = load_h5_cache_file(
            fcache,
            cache,
            valid_if_metadata_is=metadata,
            relevant_if_metadata_above=only_if_above,
            relevant_if_metadata_below=only_if_below,
            current_version=radis.__version__,
            last_compatible_version=OLDEST_COMPATIBLE_VERSION,
            verbose=verbose,
        )
        if cached is not None:
            return cached

    # The molecule id is given by the first two characters of the file
    try:
        with open(fname) as stream:
            molecule_id = int(stream.read(2))
            mol = get_molecule(molecule_id)
    except UnicodeDecodeError as err:
        raise ValueError(
            "You're trying to read a binary file {0} instead of an HITRAN file".format(
                fname
            )
        ) from err

    # %% Read the full file

    df = parse_hitran_file(fname, columns)

    # %% Post processing

    # Exactly one molecule is expected per database, else the parsing of the
    # quanta below does not make sense
    nmol = len(set(df["id"]))
    if nmol == 0:
        raise ValueError("Databank looks empty")
    if nmol != 1:
        # Fail with an explicit message showing the first parsed rows
        try:
            secondline = df.iloc[1]
        except IndexError:
            secondline = ""
        message = (
            "Multiple molecules in database ({0}). Current "
            "spectral code only computes 1 species at the time. Use MergeSlabs. "
            "Verify the parsing was correct by looking at the first row below: "
            "\n{1}"
            "\n----------------\nand the second row "
            "below: \n{2}"
        ).format(nmol, df.iloc[0], secondline)
        raise ValueError(message)

    # Local quanta depend on the HITRAN group, global quanta on the HITRAN class
    df = parse_local_quanta(df, mol)
    df = parse_global_quanta(df, mol)

    # Keep numeric columns only
    if drop_non_numeric:
        if "branch" in df:
            replace_PQR_with_m101(df)
        df = drop_object_format_columns(df, verbose=verbose)

    # TODO : get only wavenum above/below 'load_wavenum_min', 'load_wavenum_max'
    # by parsing df.wav.   Completely irrelevant files are discarded in 'load_h5_cache_file'
    # but files that have partly relevant lines are fully loaded.
    # Note : cache file is generated with the full line list.

    if not cache:
        return df

    # Cache mode, but no valid cache file was loaded (else we had returned)
    new_metadata = {
        "last_modification": time.ctime(getmtime(fname)),
        "wavenum_min": df.wav.min(),
        "wavenum_max": df.wav.max(),
    }
    if verbose:
        print(
            "Generating cache file {0} with metadata :\n{1}".format(
                fcache, new_metadata
            )
        )
    try:
        save_to_hdf(
            df,
            fcache,
            metadata=new_metadata,
            version=radis.__version__,
            key="df",
            overwrite=True,
            verbose=verbose,
        )
    except PermissionError:
        # Could not write the cache: the parsed dataframe is still returned
        if verbose:
            print(sys.exc_info())
            print("An error occured in cache file generation. Lookup access rights")

    return df

Example #4

Show file

File: cdsd.py Project: BlehMaks/radis-1

def cdsd2df(
    fname,
    version="hitemp",
    cache=True,
    verbose=True,
    drop_non_numeric=True,
    load_wavenum_min=None,
    load_wavenum_max=None,
):
    """Parse a CDSD-HITEMP [1]_ or CDSD-4000 [2]_ line file into a Pandas dataframe.

    Parameters
    ----------
    fname: str
        CDSD file name
    version: str ('4000', 'hitemp')
        CDSD version
    cache: boolean, or 'regen'
        if ``True``, a pandas-readable HDF5 file is written on first access
        and read on later calls. This skips the datatype cast and conversion
        and is much faster (but changes in the database are not taken into
        account). If ``False``, no cache file is used. If 'regen', the cache
        file is rebuilt. Default ``True``.

    Other Parameters
    ----------------
    verbose: boolean or int
        if ``>= 2``, also print the file being opened and its last
        modification time. Default ``True``.
    drop_non_numeric: boolean
        if ``True``, non numeric columns are dropped. This improves
        performances, but make sure all the columns you need are converted
        to numeric formats beforehand. Default ``True``. A dataframe loaded
        from a cache file is returned untouched.
    load_wavenum_min, load_wavenum_max: float
        if not ``None``, only load the cached file if it contains data for
        wavenumbers above/below the specified value. See
        :py:func:`~radis.io.cache_files.load_h5_cache_file`.
        Default ``None``.

    Returns
    -------
    df: pandas Dataframe
        dataframe containing all lines and parameters

    Raises
    ------
    ValueError
        if ``version`` is neither ``'hitemp'`` nor ``'4000'``.

    Notes
    -----

    CDSD-4000 Database can be downloaded from [3]_

    Performances: the files are huge (500k lines) and the format is too
    special (no space between numbers...) for optimized readers such as
    pandas's. Reading line by line and parsing each line with struct is not
    so bad, but time is wasted determining what every line is. The retained
    approach uses numpy's fromfile functions, without treating the line
    return as a special character, followed by a second numpy call to cast
    to the correct format. That ended up being twice as fast.

        - initial:                      20s / loop
        - with mmap:                    worse
        - w/o readline().rstrip():      still 20s
        - numpy fromfiles:              17s
        - no more readline, 2x fromfile 9s

    Think about using cache mode too:

        - no cache mode                 9s
        - cache mode, first time        22s
        - cache mode, then              2s

    Moving to HDF5:

    On cdsd_02069_02070 (56 Mb)

    Reading::

        cdsd2df(): 9.29 s
        cdsd2df(cache=True [old .txt version]): 2.3s
        cdsd2df(cache=True [new h5 version, table]): 910ms
        cdsd2df(cache=True [new h5 version, fixed]): 125ms

    Storage::

        %timeit df.to_hdf("cdsd_02069_02070.h5", "df", format="fixed")  337ms
        %timeit df.to_hdf("cdsd_02069_02070.h5", "df", format="table")  1.03s

    References
    ----------

    Note that CDSD-HITEMP is used as the line database for CO2 in HITEMP 2010

    .. [1] `HITEMP 2010, Rothman et al., 2010 <https://www.sciencedirect.com/science/article/pii/S002240731000169X>`_

    .. [2] `CDSD-4000 article, Tashkun et al., 2011 <https://www.sciencedirect.com/science/article/pii/S0022407311001154>`_

    .. [3] `CDSD-4000 database <ftp://ftp.iao.ru/pub/CDSD-4000/>`_

    See Also
    --------

    :func:`~radis.io.hitran.hit2df`
    """
    # Last modification time of the original file; compared with the cache
    source_mtime = time.ctime(getmtime(fname))
    metadata = {"last_modification": source_mtime}
    if load_wavenum_min and load_wavenum_max:
        assert load_wavenum_min < load_wavenum_max

    if verbose >= 2:
        print(
            "Opening file {0} (format=CDSD {1}, cache={2})".format(
                fname, version, cache
            )
        )
        print("Last Modification time: {0}".format(source_mtime))

    # Column layout depends on the CDSD flavour
    if version not in ("hitemp", "4000"):
        raise ValueError("Unknown CDSD version: {0}".format(version))
    columns = columns_hitemp if version == "hitemp" else columns_4000

    # Try the cache file first
    fcache = cache_file_name(fname)
    if cache and exists(fcache):
        # cache is not relevant if its wavenum_max is < wavenum min required
        only_if_above = {}
        if load_wavenum_min:
            only_if_above["wavenum_max"] = load_wavenum_min
        # cache is not relevant if its wavenum_min is > wavenum max required
        only_if_below = {}
        if load_wavenum_max:
            only_if_below["wavenum_min"] = load_wavenum_max
        cached = load_h5_cache_file(
            fcache,
            cache,
            valid_if_metadata_is=metadata,
            relevant_if_metadata_above=only_if_above,
            relevant_if_metadata_below=only_if_below,
            current_version=radis.__version__,
            last_compatible_version=OLDEST_COMPATIBLE_VERSION,
            verbose=verbose,
        )
        if cached is not None:
            return cached

    # %% Read the full file

    df = parse_hitran_file(fname, columns)

    # Keep numeric columns only
    if drop_non_numeric:
        replace_PQR_with_m101(df)
        df = drop_object_format_columns(df, verbose=verbose)

    # TODO : get only wavenum above/below 'load_only_wavenum_above', 'load_only_wavenum_below'
    # by parsing df.wav.   Completely irrelevant files are discarded in 'load_h5_cache_file'
    # but files that have partly relevant lines are fully loaded.
    # Note : cache file is generated with the full line list.

    if not cache:
        return df

    # Cache mode, but no valid cache file was loaded (else we had returned)
    new_metadata = {
        "last_modification": time.ctime(getmtime(fname)),
        "wavenum_min": df.wav.min(),
        "wavenum_max": df.wav.max(),
    }
    if verbose:
        print(
            "Generating cache file {0} with metadata :\n{1}".format(
                fcache, new_metadata
            )
        )
    try:
        save_to_hdf(
            df,
            fcache,
            metadata=new_metadata,
            version=radis.__version__,
            key="df",
            overwrite=True,
            verbose=verbose,
        )
    except PermissionError:
        # Could not write the cache: the parsed dataframe is still returned
        if verbose:
            print("An error occured in cache file generation. Lookup access rights")

    return df