Exemple #1
0
def hit2df(
    fname,
    cache=True,
    verbose=True,
    drop_non_numeric=True,
    load_wavenum_min=None,
    load_wavenum_max=None,
):
    """Convert a HITRAN/HITEMP [1]_ file to a Pandas dataframe

    Parameters
    ----------
    fname: str
        HITRAN-HITEMP file name
    cache: boolean, or ``'regen'`` or ``'force'``
        if ``True``, a pandas-readable HDF5 file is generated on first access,
        and later used. This saves on the datatype cast and conversion and
        improves performances a lot (but changes in the database are not
        taken into account). If False, no database is used. If ``'regen'``, temp
        file are reconstructed. Default ``True``.

    Other Parameters
    ----------------
    drop_non_numeric: boolean
        if ``True``, non numeric columns are dropped. This improves performances,
        but make sure all the columns you need are converted to numeric formats
        before hand. Default ``True``. Note that if a cache file is loaded it
        will be left untouched.
    load_wavenum_min, load_wavenum_max: float
        if not ``'None'``, only load the cached file if it contains data for
        wavenumbers above/below the specified value. See :py:func`~radis.io.cache_files.load_h5_cache_file`.
        Default ``'None'``.

    Returns
    -------
    df: pandas Dataframe
        dataframe containing all lines and parameters


    References
    ----------

    .. [1] `HITRAN 1996, Rothman et al., 1998 <https://www.sciencedirect.com/science/article/pii/S0022407398000788>`__



    Notes
    -----

    Performances: see CDSD-HITEMP parser


    See Also
    --------

    :func:`~radis.io.cdsd.cdsd2df`
    """
    metadata = {}
    # Last modification time of the original file :
    metadata["last_modification"] = time.ctime(getmtime(fname))
    if verbose >= 2:
        print("Opening file {0} (cache={1})".format(fname, cache))
        print("Last modification time: {0}".format(metadata["last_modification"]))
    if load_wavenum_min and load_wavenum_max:
        assert load_wavenum_min < load_wavenum_max

    columns = columns_2004

    # Use cache file if possible
    fcache = cache_file_name(fname)
    if cache and exists(fcache):
        relevant_if_metadata_above = (
            {"wavenum_max": load_wavenum_min} if load_wavenum_min else {}
        )  # not relevant if wavenum_max of file is < wavenum min required
        relevant_if_metadata_below = (
            {"wavenum_min": load_wavenum_max} if load_wavenum_max else {}
        )  # not relevant if wavenum_min of file is > wavenum max required
        df = load_h5_cache_file(
            fcache,
            cache,
            valid_if_metadata_is=metadata,
            relevant_if_metadata_above=relevant_if_metadata_above,
            relevant_if_metadata_below=relevant_if_metadata_below,
            current_version=radis.__version__,
            last_compatible_version=OLDEST_COMPATIBLE_VERSION,
            verbose=verbose,
        )
        if df is not None:
            return df

    # Detect the molecule by reading the start of the file
    try:
        with open(fname) as f:
            mol = get_molecule(int(f.read(2)))
    except UnicodeDecodeError as err:
        raise ValueError(
            "You're trying to read a binary file {0} ".format(fname)
            + "instead of an HITRAN file"
        ) from err

    # %% Start reading the full file

    df = parse_hitran_file(fname, columns)

    # %% Post processing

    # assert one molecule per database only. Else the groupbase data reading
    # above doesnt make sense
    nmol = len(set(df["id"]))
    if nmol == 0:
        raise ValueError("Databank looks empty")
    elif nmol != 1:
        # Crash, give explicity error messages
        try:
            secondline = df.iloc[1]
        except IndexError:
            secondline = ""
        raise ValueError(
            "Multiple molecules in database ({0}). Current ".format(nmol)
            + "spectral code only computes 1 species at the time. Use MergeSlabs. "
            + "Verify the parsing was correct by looking at the first row below: "
            + "\n{0}".format(df.iloc[0])
            + "\n----------------\nand the second row "
            + "below: \n{0}".format(secondline)
        )

    # Add local quanta attributes, based on the HITRAN group
    df = parse_local_quanta(df, mol)

    # Add global quanta attributes, based on the HITRAN class
    df = parse_global_quanta(df, mol)

    # Remove non numerical attributes
    if drop_non_numeric:
        if "branch" in df:
            replace_PQR_with_m101(df)
        df = drop_object_format_columns(df, verbose=verbose)

    # cached file mode but cached file doesn't exist yet (else we had returned)
    if cache:
        new_metadata = {
            # Last modification time of the original file :
            "last_modification": time.ctime(getmtime(fname)),
            "wavenum_min": df.wav.min(),
            "wavenum_max": df.wav.max(),
        }
        if verbose:
            print(
                "Generating cache file {0} with metadata :\n{1}".format(
                    fcache, new_metadata
                )
            )
        try:
            save_to_hdf(
                df,
                fcache,
                metadata=new_metadata,
                version=radis.__version__,
                key="df",
                overwrite=True,
                verbose=verbose,
            )
        except PermissionError:
            if verbose:
                print(sys.exc_info())
                print("An error occured in cache file generation. Lookup access rights")
            pass

    # TODO : get only wavenum above/below 'load_wavenum_min', 'load_wavenum_max'
    # by parsing df.wav.   Completely irrelevant files are discarded in 'load_h5_cache_file'
    # but files that have partly relevant lines are fully loaded.
    # Note : cache file is generated with the full line list.

    return df
Exemple #2
0
    def get_label_hitran(row, details):
        """
        Todo
        -------

        replace with simple astype(str) statements and str operations

        ex:
        > '['+df[locl].astype(str)+']('+df[globl].astype(str)+'->'+
        >     df[globu].astype(str)'+)'

        will be much faster!
        """

        molecule = get_molecule(row.id)

        # Get global labels
        if molecule in HITRAN_CLASS1:
            label = (
                "{molec}[iso{iso:.0f}] [{branch}{jl:.0f}]({vl:.0f})->({vu:.0f})"
                .format(
                    **dict([(k, row[k]) for k in ["vu", "vl", "jl", "iso"]] + [
                        ("molec", molecule),
                        ("branch", _fix_branch_format[row["branch"]]),
                    ])))
        elif molecule in HITRAN_CLASS4:
            label = "{molec}[iso{iso:.0f}] [{branch}{jl:.0f}]({v1l:.0f}{v2l:.0f}`{l2l:.0f}`{v3l:.0f})->({v1u:.0f}{v2u:.0f}`{l2u:.0f}`{v3u:.0f})".format(
                **dict([(k, row[k]) for k in [
                    "v1u",
                    "v2u",
                    "l2u",
                    "v3u",
                    "v1l",
                    "v2l",
                    "l2l",
                    "v3l",
                    "jl",
                    "iso",
                ]] + [
                    ("molec", molecule),
                    ("branch", _fix_branch_format[row["branch"]]),
                ]))
        elif molecule in HITRAN_CLASS5:
            label = "{molec}[iso{iso:.0f}] [{branch}{jl:.0f}]({v1l:.0f}{v2l:.0f}`{l2l:.0f}`{v3l:.0f} {rl:.0f})->({v1u:.0f}{v2u:.0f}`{l2u:.0f}`{v3u:.0f} {ru:.0f})".format(
                **dict([(k, row[k]) for k in [
                    "v1u",
                    "v2u",
                    "l2u",
                    "v3u",
                    "v1l",
                    "v2l",
                    "l2l",
                    "v3l",
                    "rl",
                    "ru",
                    "jl",
                    "iso",
                ]] + [
                    ("molec", molecule),
                    ("branch", _fix_branch_format[row["branch"]]),
                ]))
        else:
            raise NotImplementedError(
                "No label for {0}. Please add it!".format(molecule))

        # Add details about some line properties
        for k in details:
            name, _, unit = details[k]
            if is_float(row[k]):
                label += "<br>{0} {1}: {2:.3g} {3}".format(
                    k, name, row[k], unit)
            else:
                label += "<br>{0} {1}: {2} {3}".format(k, name, row[k], unit)

        return label
Exemple #3
0
def fetch_astroquery(
    molecule, isotope, wmin, wmax, verbose=True, cache=True, expected_metadata={}
):
    """Download a HITRAN line database to a Pandas DataFrame.

    Wrapper to Astroquery [1]_ fetch function

    Parameters
    ----------
    molecule: str, or int
        molecule name or identifier
    isotope: int
        isotope number
    wmin, wmax: float  (cm-1)
        wavenumber min and max

    Other Parameters
    ----------------
    verbose: boolean
        Default ``True``
    cache: boolean or ``'regen'``
        if ``True``, tries to find a ``.h5`` cache file in the Astroquery
        :py:attr:`~astroquery.query.BaseQuery.cache_location`, that would match
        the requirements. If not found, downloads it and saves the line dataframe
        as a ``.h5`` file in the Astroquery.
        If ``'regen'``, delete existing cache file to regerenate it.
    expected_metadata: dict
        if ``cache=True``, check that the metadata in the cache file correspond
        to these attributes. Arguments ``molecule``, ``isotope``, ``wmin``, ``wmax``
        are already added by default.

    Notes
    -----
    The HITRAN module in Astroquery [1]_ is itself based on [HAPI]_

    References
    ----------
    .. [1] `Astroquery <https://astroquery.readthedocs.io>`_

    See Also
    --------
    :py:func:`astroquery.hitran.reader.download_hitran`,
    :py:func:`astroquery.hitran.reader.read_hitran_file`,
    :py:attr:`~astroquery.query.BaseQuery.cache_location`

    """
    # Check input
    if not is_float(molecule):
        mol_id = get_molecule_identifier(molecule)
    else:
        mol_id = molecule
        molecule = get_molecule(mol_id)
    assert is_float(isotope)

    empty_range = False

    if cache:
        # Cache file location in Astroquery cache
        # TODO: move full HITRAN databases in ~/radisdb cache like io/hitemp/fetch_hitemp ?
        fcache = join(
            Hitran.cache_location,
            CACHE_FILE_NAME.format(
                **{"molecule": molecule, "isotope": isotope, "wmin": wmin, "wmax": wmax}
            ),
        )
        # ... Update metadata with physical properties from the database.
        expected_metadata.update(
            {"molecule": molecule, "isotope": isotope, "wmin": wmin, "wmax": wmax}
        )
        if cache == "regen":
            if exists(fcache):
                if verbose:
                    print(f"Cache file {fcache} deleted to be regenerated")
                os.remove(fcache)
        else:
            # Load cache file if valid
            check_cache_file(
                fcache=fcache,
                use_cached=cache,
                expected_metadata=expected_metadata,
                verbose=verbose,
            )
            if exists(fcache):
                try:
                    return get_cache_file(fcache, verbose=verbose)
                except Exception as err:
                    if verbose:
                        printr(
                            "Problem reading cache file {0}:\n{1}\nDeleting it!".format(
                                fcache, str(err)
                            )
                        )
                    os.remove(fcache)

    # Download using the astroquery library
    try:
        response = Hitran.query_lines_async(
            molecule_number=mol_id,
            isotopologue_number=isotope,
            min_frequency=wmin / u.cm,
            max_frequency=wmax / u.cm,
        )
    except KeyError as err:
        raise KeyError(
            str(err)
            + " <<w this error occured in Astroquery. Maybe these molecule "
            + "({0}) and isotope ({1}) are not supported".format(molecule, isotope)
        ) from err

    # Deal with usual errors
    if response.status_code == 404:
        # Maybe there are just no lines for this species in this range
        # In that case we usually end up with errors like:

        # (<class 'Exception'>, Exception('Query failed: 404 Client Error:
        # Not Found for url: http://hitran.org/lbl/api?numax=25000&numin=19000&iso_ids_list=69\n',),
        # <traceback object at 0x7f0967c91708>)

        if response.reason == "Not Found":
            # Let's bet it's just that there are no lines in this range
            empty_range = True
            if verbose:
                print(
                    (
                        "No lines for {0} (id={1}), iso={2} in range {3:.2f}-{4:.2f}cm-1. ".format(
                            molecule, mol_id, isotope, wmin, wmax
                        )
                    )
                )
        else:
            raise ValueError(
                "An error occured during the download of HITRAN files "
                + "for {0} (id={1}), iso={2} between {3:.2f}-{4:.2f}cm-1. ".format(
                    molecule, mol_id, isotope, wmin, wmax
                )
                + "Are you online?\n"
                + "See details of the error below:\n\n {0}".format(response.reason)
            )
    elif response.status_code == 500:

        raise ValueError(
            "{0} while querying the HITRAN server: ".format(response.status_code)
            + "\n\n{0}".format(response.text)
        )

    # Process response

    # Rename columns from Astroquery to RADIS format
    rename_columns = {
        "molec_id": "id",
        "local_iso_id": "iso",
        "nu": "wav",
        "sw": "int",
        "a": "A",
        "gamma_air": "airbrd",
        "gamma_self": "selbrd",
        "elower": "El",
        "n_air": "Tdpair",
        "delta_air": "Pshft",
        "global_upper_quanta": "globu",
        "global_lower_quanta": "globl",
        "local_upper_quanta": "locu",
        "local_lower_quanta": "locl",
        "line_mixing_flag": "lmix",
        "gp": "gp",
        "gpp": "gpp",
    }

    if not empty_range:
        tbl = Hitran._parse_result(response)
        df = tbl.to_pandas()
        df = df.rename(columns=rename_columns)
    else:
        df = pd.DataFrame(columns=list(rename_columns.values()))

    # Cast type to float64
    cast_type = {
        "wav": np.float64,
        "int": np.float64,
        "A": np.float64,
        "airbrd": np.float64,
        "selbrd": np.float64,
        "El": np.float64,
        "Tdpair": np.float64,
        "Pshft": np.float64,
    }
    for c, typ in cast_type.items():
        df[c] = df[c].astype(typ)

    # cached file mode but cached file doesn't exist yet (else we had returned)
    if cache:
        new_metadata = {
            "molecule": molecule,
            "isotope": isotope,
            "wmin": wmin,
            "wmax": wmax,
        }
        if verbose:
            print(
                "Generating cache file {0} with metadata :\n{1}".format(
                    fcache, new_metadata
                )
            )
        try:
            save_to_hdf(
                df,
                fcache,
                metadata=new_metadata,
                version=radis.__version__,
                key="df",
                overwrite=True,
                verbose=verbose,
            )
        except PermissionError:
            if verbose:
                print(sys.exc_info())
                print("An error occured in cache file generation. Lookup access rights")
            pass

    return df