Esempio n. 1
0
def cdsd2df(
    fname, version="hitemp", count=-1, cache=False, verbose=True, drop_non_numeric=True
):
    """Convert a CDSD-HITEMP [1]_ or CDSD-4000 [2]_ file to a Pandas dataframe.

    Parameters
    ----------

    fname: str
        CDSD file name

    version: str ('4000', 'hitemp')
        CDSD version. Selects the column layout used to parse the file.

    count: int
        number of items to read (-1 means all file)

    cache: boolean, or 'regen'
        if ``True``, a pandas-readable HDF5 file is generated on first access,
        and later used. This saves on the datatype cast and conversion and
        improves performances a lot (but changes in the database are not
        taken into account). If ``False``, no database is used. If 'regen', temp
        files are reconstructed. Default ``False``. The cache file is written
        next to ``fname``, with the extension replaced by ``.h5``.

    Other Parameters
    ----------------

    verbose: boolean, or int
        if truthy, print progress messages. If ``>= 2``, also print the file
        being opened.

    drop_non_numeric: boolean
        if ``True``, non numeric columns are dropped. This improves performances,
        but make sure all the columns you need are converted to numeric formats
        before hand. Default ``True``. Note that if a cache file is loaded it
        will be left untouched.

    Returns
    -------

    df: pandas Dataframe
        dataframe containing all lines and parameters

    Raises
    ------

    ValueError
        if ``version`` is neither ``'hitemp'`` nor ``'4000'``.

    Notes
    -----

    CDSD-4000 Database can be downloaded from [3]_

    A failure while writing the cache file is reported (if ``verbose``) but
    does not prevent the parsed dataframe from being returned.

    Performances: I had huge performance trouble with this function, because the files are
    huge (500k lines) and the format is too special (no space between numbers...)
    to apply optimized methods such as pandas's. A line by line reading isn't
    so bad, using struct to parse each line. However, we waste typing determining
    what every line is. I ended up using the fromfiles functions from numpy,
    not considering *\\n* (line return) as a special character anymore, and a second call
    to numpy to cast the correct format. That ended up being twice as fast.

        - initial:                      20s / loop
        - with mmap:                    worse
        - w/o readline().rstrip('\\n'):  still 20s
        - numpy fromfiles:              17s
        - no more readline, 2x fromfile 9s

    Think about using cache mode too:

        - no cache mode                 9s
        - cache mode, first time        22s
        - cache mode, then              2s

    Moving to HDF5:

    On cdsd_02069_02070 (56 Mb)

    Reading::

        cdsd2df(): 9.29 s
        cdsd2df(cache=True [old .txt version]): 2.3s
        cdsd2df(cache=True [new h5 version, table]): 910ms
        cdsd2df(cache=True [new h5 version, fixed]): 125ms

    Storage::

        %timeit df.to_hdf("cdsd_02069_02070.h5", "df", format="fixed")  337ms
        %timeit df.to_hdf("cdsd_02069_02070.h5", "df", format="table")  1.03s

    References
    ----------

    Note that CDSD-HITEMP is used as the line database for CO2 in HITEMP 2010

    .. [1] `HITEMP 2010, Rothman et al., 2010 <https://www.sciencedirect.com/science/article/pii/S002240731000169X>`_

    .. [2] `CDSD-4000 article, Tashkun et al., 2011 <https://www.sciencedirect.com/science/article/pii/S0022407311001154>`_

    .. [3] `CDSD-4000 database <ftp://ftp.iao.ru/pub/CDSD-4000/>`_

    See Also
    --------

    :func:`~radis.io.hitran.hit2df`

    """

    if verbose >= 2:
        print(
            "Opening file {0} (format=CDSD {1}, cache={2})".format(
                fname, version, cache
            )
        )

    # Pick the column layout matching the requested CDSD flavour
    if version == "hitemp":
        columns = columns_hitemp
    elif version == "4000":
        columns = columns_4000
    else:
        raise ValueError("Unknown CDSD version: {0}".format(version))

    # Use cache file if possible
    fcache = splitext(fname)[0] + ".h5"
    # NOTE(review): check_cache_file presumably validates (or removes, for
    # cache='regen') an existing cache file -- confirm against its definition.
    check_cache_file(fcache=fcache, use_cached=cache, verbose=verbose)
    if cache and exists(fcache):
        return get_cache_file(fcache, verbose=verbose)

    # %% Start reading the full file

    df = parse_binary_file(fname, columns, count)

    # Remove non numerical attributes
    if drop_non_numeric:
        replace_PQR_with_m101(df)
        df = drop_object_format_columns(df, verbose=verbose)

    # cached file mode but cached file doesn't exist yet (else we had returned)
    if cache:
        if verbose:
            print("Generating cached file: {0}".format(fcache))
        try:
            save_to_hdf(
                df,
                fcache,
                metadata={},
                version=radis.__version__,
                key="df",
                overwrite=True,
                verbose=verbose,
            )
        except Exception as err:
            # Writing the cache is best effort: the parsed dataframe is still
            # valid, so report the problem and carry on. ``Exception`` (rather
            # than a bare ``except``) lets KeyboardInterrupt / SystemExit
            # propagate instead of being silently swallowed.
            if verbose:
                print("An error occured in cache file generation. Lookup access rights")
                print("Error was: {0!r}".format(err))

    return df
Esempio n. 2
0
def hit2df(fname, count=-1, cache=False, verbose=True, drop_non_numeric=True):
    """Convert a HITRAN/HITEMP [1]_ file to a Pandas dataframe.

    Parameters
    ----------

    fname: str
        HITRAN-HITEMP file name

    count: int
        number of items to read (-1 means all file)

    cache: boolean, or ``'regen'`` or ``'force'``
        if ``True``, a pandas-readable HDF5 file is generated on first access,
        and later used. This saves on the datatype cast and conversion and
        improves performances a lot (but changes in the database are not
        taken into account). If False, no database is used. If ``'regen'``, temp
        files are reconstructed. Default ``False``. The cache file is written
        next to ``fname``, with the extension replaced by ``.h5``.

    Other Parameters
    ----------------

    verbose: boolean, or int
        if truthy, print progress messages. If ``>= 2``, also print the file
        being opened.

    drop_non_numeric: boolean
        if ``True``, non numeric columns are dropped. This improves performances,
        but make sure all the columns you need are converted to numeric formats
        before hand. Default ``True``. Note that if a cache file is loaded it
        will be left untouched.

    Returns
    -------

    df: pandas Dataframe
        dataframe containing all lines and parameters

    Raises
    ------

    ValueError
        if the file cannot be decoded as text (binary file), if the parsed
        databank is empty, or if it contains more than one molecule id.

    References
    ----------

    .. [1] `HITRAN 1996, Rothman et al., 1998 <https://www.sciencedirect.com/science/article/pii/S0022407398000788>`__

    Notes
    -----

    Performances: see CDSD-HITEMP parser

    A failure while writing the cache file is reported (if ``verbose``) but
    does not prevent the parsed dataframe from being returned.

    See Also
    --------

    :func:`~radis.io.cdsd.cdsd2df`

    """

    if verbose >= 2:
        print("Opening file {0} (cache={1})".format(fname, cache))

    columns = columns_2004

    # Use cache file if possible
    fcache = splitext(fname)[0] + ".h5"
    # NOTE(review): check_cache_file presumably validates (or removes, for
    # cache='regen') an existing cache file -- confirm against its definition.
    check_cache_file(fcache=fcache, use_cached=cache, verbose=verbose)
    if cache and exists(fcache):
        return get_cache_file(fcache, verbose=verbose)

    # Detect the molecule by reading the start of the file: the first two
    # characters are parsed as an integer and handed to get_molecule
    try:
        with open(fname) as f:
            mol = get_molecule(int(f.read(2)))
    except UnicodeDecodeError as err:
        raise ValueError(
            "You're trying to read a binary file {0} ".format(fname) +
            "instead of an HITRAN file") from err

    # %% Start reading the full file

    df = parse_hitran_file(fname, columns, count)

    # %% Post processing

    # assert one molecule per database only. Else the groupbase data reading
    # above doesnt make sense
    nmol = len(set(df["id"]))
    if nmol == 0:
        raise ValueError("Databank looks empty")
    elif nmol != 1:
        # Crash, give explicit error messages
        try:
            secondline = df.iloc[1]
        except IndexError:
            secondline = ""
        raise ValueError(
            "Multiple molecules in database ({0}). Current ".format(nmol) +
            "spectral code only computes 1 species at the time. Use MergeSlabs. "
            +
            "Verify the parsing was correct by looking at the first row below: "
            + "\n{0}".format(df.iloc[0]) +
            "\n----------------\nand the second row " +
            "below: \n{0}".format(secondline))

    # Add local quanta attributes, based on the HITRAN group
    df = parse_local_quanta(df, mol)

    # Add global quanta attributes, based on the HITRAN class
    df = parse_global_quanta(df, mol)

    # Remove non numerical attributes
    if drop_non_numeric:
        if "branch" in df:
            replace_PQR_with_m101(df)
        df = drop_object_format_columns(df, verbose=verbose)

    # cached file mode but cached file doesn't exist yet (else we had returned)
    if cache:
        if verbose:
            print("Generating cached file: {0}".format(fcache))
        try:
            save_to_hdf(
                df,
                fcache,
                metadata={},
                version=radis.__version__,
                key="df",
                overwrite=True,
                verbose=verbose,
            )
        except Exception:
            # Writing the cache is best effort: the parsed dataframe is still
            # valid, so report the problem and carry on. ``Exception`` (rather
            # than a bare ``except``) lets KeyboardInterrupt / SystemExit
            # propagate instead of being silently swallowed.
            if verbose:
                print(sys.exc_info())
                print(
                    "An error occured in cache file generation. Lookup access rights"
                )

    return df
Esempio n. 3
0
def hit2df(
    fname,
    cache=True,
    verbose=True,
    drop_non_numeric=True,
    load_wavenum_min=None,
    load_wavenum_max=None,
):
    """Parse a HITRAN/HITEMP [1]_ line-list file into a Pandas dataframe.

    Parameters
    ----------
    fname: str
        path of the HITRAN-HITEMP file
    cache: boolean, or ``'regen'`` or ``'force'``
        if ``True``, a pandas-readable HDF5 file is generated on first access
        and reused afterwards, which skips the datatype cast and conversion
        and is much faster. The cache stores the last modification time of
        ``fname`` in its metadata and is loaded with that value as a validity
        criterion. If ``False``, no cache file is used. If ``'regen'``, temp
        files are reconstructed. Default ``True``.

    Other Parameters
    ----------------
    verbose: boolean, or int
        if truthy, print progress messages; if ``>= 2`` also print the file
        name and its last modification time.
    drop_non_numeric: boolean
        if ``True``, non numeric columns are dropped. This improves
        performances, but make sure all the columns you need are converted to
        numeric formats beforehand. Default ``True``. A dataframe loaded from
        a cache file is returned untouched.
    load_wavenum_min, load_wavenum_max: float
        if not ``None``, only load the cached file if it contains data for
        wavenumbers above/below the specified value. See
        :py:func:`~radis.io.cache_files.load_h5_cache_file`. Default ``None``.

    Returns
    -------
    df: pandas Dataframe
        dataframe containing all lines and parameters

    References
    ----------

    .. [1] `HITRAN 1996, Rothman et al., 1998 <https://www.sciencedirect.com/science/article/pii/S0022407398000788>`__

    Notes
    -----

    Performances: see CDSD-HITEMP parser

    The wavenumber bounds only decide whether a cache file is relevant; the
    returned dataframe is not cropped to them.

    See Also
    --------

    :func:`~radis.io.cdsd.cdsd2df`
    """
    # The source file's modification time is the cache validity criterion
    metadata = {"last_modification": time.ctime(getmtime(fname))}
    if verbose >= 2:
        print("Opening file {0} (cache={1})".format(fname, cache))
        print("Last modification time: {0}".format(metadata["last_modification"]))
    if load_wavenum_min and load_wavenum_max:
        assert load_wavenum_min < load_wavenum_max

    # ---- Try the cache file first
    cache_path = cache_file_name(fname)
    if cache and exists(cache_path):
        # cache is irrelevant if its wavenum_max is < requested minimum ...
        must_be_above = {}
        if load_wavenum_min:
            must_be_above["wavenum_max"] = load_wavenum_min
        # ... or if its wavenum_min is > requested maximum
        must_be_below = {}
        if load_wavenum_max:
            must_be_below["wavenum_min"] = load_wavenum_max

        cached_df = load_h5_cache_file(
            cache_path,
            cache,
            valid_if_metadata_is=metadata,
            relevant_if_metadata_above=must_be_above,
            relevant_if_metadata_below=must_be_below,
            current_version=radis.__version__,
            last_compatible_version=OLDEST_COMPATIBLE_VERSION,
            verbose=verbose,
        )
        # None means the cache could not be used: fall through and reparse
        if cached_df is not None:
            return cached_df

    # ---- Identify the molecule from the first two characters of the file
    try:
        with open(fname) as par_file:
            molecule_id = int(par_file.read(2))
            mol = get_molecule(molecule_id)
    except UnicodeDecodeError as err:
        raise ValueError(
            "You're trying to read a binary file {0} ".format(fname)
            + "instead of an HITRAN file"
        ) from err

    # ---- Parse the whole file
    df = parse_hitran_file(fname, columns_2004)

    # ---- Post processing
    # Exactly one molecule id is allowed per databank, otherwise the quanta
    # parsing below would not make sense
    species_count = len(set(df["id"]))
    if species_count == 0:
        raise ValueError("Databank looks empty")
    if species_count != 1:
        # Fail with the first two rows shown to help diagnose a parsing issue
        try:
            second_row = df.iloc[1]
        except IndexError:
            second_row = ""
        details = [
            "Multiple molecules in database ({0}). Current ".format(species_count),
            "spectral code only computes 1 species at the time. Use MergeSlabs. ",
            "Verify the parsing was correct by looking at the first row below: ",
            "\n{0}".format(df.iloc[0]),
            "\n----------------\nand the second row ",
            "below: \n{0}".format(second_row),
        ]
        raise ValueError("".join(details))

    # Local quanta (HITRAN group), then global quanta (HITRAN class)
    df = parse_local_quanta(df, mol)
    df = parse_global_quanta(df, mol)

    # Keep numeric columns only, converting the branch letters first
    if drop_non_numeric:
        if "branch" in df:
            replace_PQR_with_m101(df)
        df = drop_object_format_columns(df, verbose=verbose)

    # TODO : get only wavenum above/below 'load_wavenum_min', 'load_wavenum_max'
    # by parsing df.wav.   Completely irrelevant files are discarded in 'load_h5_cache_file'
    # but files that have partly relevant lines are fully loaded.
    # Note : cache file is generated with the full line list.

    if not cache:
        return df

    # ---- Cache requested but not usable above: (re)generate it
    new_metadata = dict(
        # Last modification time of the original file :
        last_modification=time.ctime(getmtime(fname)),
        wavenum_min=df.wav.min(),
        wavenum_max=df.wav.max(),
    )
    if verbose:
        print(
            "Generating cache file {0} with metadata :\n{1}".format(
                cache_path, new_metadata
            )
        )
    try:
        save_to_hdf(
            df,
            cache_path,
            metadata=new_metadata,
            version=radis.__version__,
            key="df",
            overwrite=True,
            verbose=verbose,
        )
    except PermissionError:
        # Not being able to write the cache is not fatal
        if verbose:
            print(sys.exc_info())
            print("An error occured in cache file generation. Lookup access rights")

    return df
Esempio n. 4
0
def fetch_hitemp(
    molecule,
    local_databases="~/.radisdb/",
    databank_name="HITEMP-{molecule}",
    isotope=None,
    load_wavenum_min=None,
    load_wavenum_max=None,
    cache=True,
    verbose=True,
    chunksize=100000,
    clean_cache_files=True,
    return_local_path=False,
):
    """Stream HITEMP file from HITRAN website. Unzip and build a HDF5 file directly.

    Returns a Pandas DataFrame containing all lines.

    Parameters
    ----------
    molecule: `"CO2", "N2O", "CO", "CH4", "NO", "NO2", "OH"`
        HITEMP molecule. See :py:attr:`~radis.io.hitemp.HITEMP_SOURCE_FILES`.
        Note that ``"H2O"`` and ``"CO2"`` currently raise
        ``NotImplementedError`` (multiple source files).
    local_databases: str
        where to create the RADIS HDF5 files. Default ``"~/.radisdb/"``.
        A leading ``~`` is expanded to the user home directory.
    databank_name: str
        name of the databank in RADIS :ref:`Configuration file <label_lbl_config_file>`
        Default ``"HITEMP-{molecule}"``
    isotope: str
        load only certain isotopes : ``'2'``, ``'1,2'``, etc. If ``None``, loads
        everything. Default ``None``.
    load_wavenum_min, load_wavenum_max: float (cm-1)
        load only specific wavenumbers.

    Other Parameters
    ----------------
    cache: bool, or ``'regen'``
        if ``True``, use existing HDF5 file. If ``False`` or ``'regen'``, rebuild it.
    verbose: bool
    chunksize: int
        number of lines to process at a same time. Higher is usually faster
        but can create Memory problems and keep the user uninformed of the progress.
    clean_cache_files: bool
        if ``True`` clean downloaded cache files after HDF5 are created.
    return_local_path: bool
        if ``True``, also returns the path of the local database file.

    Returns
    -------
    df: pd.DataFrame
        Line list
        A HDF5 file is also created in ``local_databases`` and referenced
        in the :ref:`RADIS config file <label_lbl_config_file>` with name
        ``databank_name``
    local_path: str
        path of local database file if ``return_local_path``

    Raises
    ------
    NotImplementedError
        if ``molecule`` is ``"H2O"`` or ``"CO2"``.
    KeyError
        if ``molecule`` is not a key of ``HITEMP_SOURCE_FILES``.
    ValueError
        if a local HDF5 file exists but is not registered and looks
        inconsistent (unexpected number of rows or missing metadata keys).
    AssertionError
        if the number of lines of the freshly built database differs from
        the expected one.

    Notes
    -----
    if using ``load_wavenum_min/max`` or ``isotope``, the whole
    database is anyway downloaded and uncompressed to ``local_databases``
    fast access .HDF5 files (which will take a long time on first call). Only
    the expected wavenumber range & isotopes are returned. The .HDF5 parsing uses
    :py:func:`~radis.io.hdf5.hdf2df`

    See Also
    --------
    :py:func:`~radis.io.hdf5.hdf2df`

    """
    # TODO ? : unzip only parts of the database
    # see https://github.com/radis/radis/pull/194

    # Only the default name is a template; a user-supplied name is kept as is
    if databank_name == "HITEMP-{molecule}":
        databank_name = databank_name.format(molecule=molecule)
    # expanduser only touches a leading "~"; a blind str.replace would also
    # rewrite a "~" found elsewhere in the path (e.g. Windows 8.3 short names)
    local_databases = abspath(expanduser(local_databases))

    if molecule in ["H2O", "CO2"]:
        raise NotImplementedError(
            "Automatic HITEMP download not implemented for {0} : multiple files. Download manually on https://hitran.org/hitemp/ "
            .format(molecule))

    try:
        inputf = HITEMP_SOURCE_FILES[molecule]
    except KeyError as err:
        raise KeyError(
            f"Please choose one of HITEMP molecules : {list(HITEMP_SOURCE_FILES.keys())}. Got '{molecule}'"
        ) from err
    urlname = BASE_URL + inputf

    # Create the database folder; an OSError (e.g. folder already exists) is ignored
    try:
        os.mkdir(local_databases)
    except OSError:
        pass
    else:
        if verbose:
            print("Created folder :", local_databases)

    local_file = abspath(
        join(local_databases,
             molecule + "-" + inputf.replace(".par.bz2", ".h5")))

    if not cache or cache == "regen":
        # Delete existing HDF5 file
        # TODO: also clean the getDatabankList? Todo once it is in JSON format. https://github.com/radis/radis/issues/167
        if exists(local_file):
            if verbose:
                print("Removing existing file ", local_file)
            os.remove(local_file)

    if exists(local_file):
        # Read and return from local file

        # check metadata :
        check_not_deprecated(
            local_file,
            metadata_is={},
            metadata_keys_contain=["wavenumber_min", "wavenumber_max"],
        )
        # check database is registered in ~/.radis
        if databank_name not in getDatabankList():
            # if not, check number of rows is correct :
            error_msg = ""
            with pd.HDFStore(local_file, "r") as store:
                nrows = store.get_storer("df").nrows
                if nrows != INFO_HITEMP_LINE_COUNT[molecule]:
                    error_msg += (
                        f"\nNumber of lines in local database ({nrows:,}) " +
                        "differ from the expected number of lines for " +
                        f"HITEMP {molecule}: {INFO_HITEMP_LINE_COUNT[molecule]}"
                    )
                # ... and that all metadata needed for registration are there
                file_metadata = store.get_storer("df").attrs.metadata
                for k in [
                        "wavenumber_min",
                        "wavenumber_max",
                        "download_url",
                        "download_date",
                ]:
                    if k not in file_metadata:
                        error_msg += (
                            "\nMissing key in file metadata to register the database "
                            + f"automatically : {k}")

            if error_msg:
                # Every fragment with a placeholder must be an f-string,
                # otherwise the braces are printed literally
                raise ValueError(
                    f"{databank_name} not declared in your RADIS ~/.config file although "
                    + f"{local_file} exists. {error_msg}\n"
                    + "If you know this file, add it to ~/.radisdb manually. "
                    + "Else regenerate the database with:\n\t"
                    + ">>> radis.SpectrumFactory().fetch_databank(..., use_cached='regen')"
                    + "\nor\n\t"
                    + f">>> radis.io.hitemp.fetch_hitemp({molecule!r}, cache='regen')"
                    + "\n\n⚠️ It will re-download & uncompress the whole database "
                    + f"from HITEMP.\n\nList of declared databanks: {getDatabankList()}.\n"
                    + f"{local_file} metadata: {file_metadata}")

            # Else database looks ok : register it
            if verbose:
                print(
                    f"{databank_name} not declared in your RADIS ~/.config file although "
                    +
                    f"{local_file} exists. Registering the database automatically."
                )

            register_database(
                databank_name,
                [local_file],
                molecule=molecule,
                wmin=file_metadata["wavenumber_min"],
                wmax=file_metadata["wavenumber_max"],
                download_date=file_metadata["download_date"],
                urlname=file_metadata["download_url"],
                verbose=verbose,
            )

        if verbose:
            print(f"Using existing database {databank_name}")
        df = hdf2df(
            local_file,
            isotope=isotope,
            load_wavenum_min=load_wavenum_min,
            load_wavenum_max=load_wavenum_max,
            verbose=verbose,
        )
        return (df, local_file) if return_local_path else df

    # Doesnt exist : download
    ds = DataSource(join(local_databases, "downloads"))

    if verbose:
        print(f"Downloading {inputf} for {molecule}.")
    download_date = date.today().strftime("%d %b %Y")

    columns = columns_2004

    # Get linereturn (depends on OS, but file may also have been generated
    # on a different OS. Here we simply read the file to find out)
    with ds.open(urlname) as gfile:  # locally downloaded file

        dt = _create_dtype(
            columns, "a2"
        )  # 'a2' allocates space to get \n or \n\r for linereturn character
        b = np.zeros(1, dtype=dt)
        gfile.readinto(b)
        linereturnformat = _get_linereturnformat(b, columns)

    # Second pass: parse the file chunk by chunk and append to the HDF5 store
    with ds.open(urlname) as gfile:  # locally downloaded file

        dt = _create_dtype(columns, linereturnformat)
        b = np.zeros(chunksize,
                     dtype=dt)  # receives the HITRAN 160-character data.
        wmin = np.inf
        wmax = 0
        if verbose:
            print(
                f"Download complete. Building {molecule} database to {local_file}"
            )

        with pd.HDFStore(local_file, mode="a", complib="blosc",
                         complevel=9) as f:
            Nlines = 0
            Ntotal_lines_expected = INFO_HITEMP_LINE_COUNT[molecule]
            pb = ProgressBar(N=Ntotal_lines_expected, active=verbose)
            # readinto() fills the current buffer ``b`` (looked up at call
            # time, so the fresh buffer of each iteration is used) and
            # returns 0 once the file is exhausted, which stops the loop
            for nbytes in iter(lambda: gfile.readinto(b), 0):

                if not b[-1]:
                    # End of file flag within the chunk (but does not start
                    # with End of file flag) so nbytes != 0
                    b = get_last(b)

                df = _ndarray2df(b, columns, linereturnformat)

                # Post-processing :
                # ... Add local quanta attributes, based on the HITRAN group
                df = parse_local_quanta(df, molecule)

                # ... Add global quanta attributes, based on the HITRAN class
                df = parse_global_quanta(df, molecule)

                # Switch 'P', 'Q', 'R' to -1, 0, 1
                if "branch" in df:
                    replace_PQR_with_m101(df)

                f.put(
                    key="df",
                    value=df,
                    append=True,
                    format="table",
                    data_columns=DATA_COLUMNS,
                )

                # Track the wavenumber range and line count seen so far
                wmin = np.min((wmin, df.wav.min()))
                wmax = np.max((wmax, df.wav.max()))
                Nlines += len(df)
                pb.update(
                    Nlines,
                    message=
                    f"Parsed {Nlines:,} / {Ntotal_lines_expected:,} lines. Wavenumber range {wmin:.2f}-{wmax:.2f} cm-1 is complete.",
                )

                # Reinitialize for next read
                b = np.zeros(
                    chunksize,
                    dtype=dt)  # receives the HITRAN 160-character data.

            # Store what is needed to re-register the database later on
            f.get_storer("df").attrs.metadata = {
                "wavenumber_min": wmin,
                "wavenumber_max": wmax,
                "download_date": download_date,
                "download_url": urlname,
                "version": radis.__version__,
            }
            pb.done()

    # Done: add final checks
    # ... check on the created file that all lines are there :
    with pd.HDFStore(local_file, "r") as store:
        nrows = store.get_storer("df").nrows
        assert nrows == Nlines
        if nrows != INFO_HITEMP_LINE_COUNT[molecule]:
            raise AssertionError(
                f"Number of lines in local database ({nrows:,}) " +
                "differ from the expected number of lines for " +
                f"HITEMP {molecule}: {INFO_HITEMP_LINE_COUNT[molecule]}" +
                ". Check that there was no recent update on HITEMP. " +
                "Else it may be a download error ?")

    # Add database to  ~/.radis
    register_database(
        databank_name,
        [local_file],
        molecule,
        wmin,
        wmax,
        download_date,
        urlname,
        verbose,
    )

    # Reload from the HDF5 file so that isotope / wavenumber selection applies
    df = hdf2df(
        local_file,
        isotope=isotope,
        load_wavenum_min=load_wavenum_min,
        load_wavenum_max=load_wavenum_max,
        verbose=verbose,
    )

    # Fully unzipped (and working, as it was reloaded): clean
    if clean_cache_files:
        os.remove(ds._findfile(urlname))
        if verbose >= 3:
            from radis.misc.printer import printg

            printg("... removed downloaded cache file")

    return (df, local_file) if return_local_path else df
Esempio n. 5
0
def cdsd2df(
    fname,
    version="hitemp",
    cache=True,
    verbose=True,
    drop_non_numeric=True,
    load_wavenum_min=None,
    load_wavenum_max=None,
):
    """Parse a CDSD-HITEMP [1]_ or CDSD-4000 [2]_ file into a Pandas dataframe.

    Parameters
    ----------
    fname: str
        CDSD file name
    version: str ('4000', 'hitemp')
        CDSD version; selects the column layout used for parsing
    cache: boolean, or 'regen'
        if ``True``, a pandas-readable HDF5 file is generated on first access
        and reused afterwards, which skips the datatype cast and conversion
        and is much faster. The cache stores the last modification time of
        ``fname`` in its metadata and is loaded with that value as a validity
        criterion. If ``False``, no cache file is used. If 'regen', temp
        files are reconstructed. Default ``True``.

    Other Parameters
    ----------------
    verbose: boolean, or int
        if truthy, print progress messages; if ``>= 2`` also print the file
        name and its last modification time.
    drop_non_numeric: boolean
        if ``True``, non numeric columns are dropped. This improves
        performances, but make sure all the columns you need are converted to
        numeric formats beforehand. Default ``True``. A dataframe loaded from
        a cache file is returned untouched.
    load_wavenum_min, load_wavenum_max: float
        if not ``None``, only load the cached file if it contains data for
        wavenumbers above/below the specified value. See
        :py:func:`~radis.io.cache_files.load_h5_cache_file`. Default ``None``.

    Returns
    -------
    df: pandas Dataframe
        dataframe containing all lines and parameters

    Notes
    -----

    CDSD-4000 Database can be downloaded from [3]_

    The wavenumber bounds only decide whether a cache file is relevant; the
    returned dataframe is not cropped to them.

    Performances: I had huge performance trouble with this function, because the files are
    huge (500k lines) and the format is too special (no space between numbers...)
    to apply optimized methods such as pandas's. A line by line reading isn't
    so bad, using struct to parse each line. However, we waste typing determining
    what every line is. I ended up using the fromfiles functions from numpy,
    not considering *\\n* (line return) as a special character anymore, and a second call
    to numpy to cast the correct format. That ended up being twice as fast.

        - initial:                      20s / loop
        - with mmap:                    worse
        - w/o readline().rstrip('\\n'):  still 20s
        - numpy fromfiles:              17s
        - no more readline, 2x fromfile 9s

    Think about using cache mode too:

        - no cache mode                 9s
        - cache mode, first time        22s
        - cache mode, then              2s

    Moving to HDF5:

    On cdsd_02069_02070 (56 Mb)

    Reading::

        cdsd2df(): 9.29 s
        cdsd2df(cache=True [old .txt version]): 2.3s
        cdsd2df(cache=True [new h5 version, table]): 910ms
        cdsd2df(cache=True [new h5 version, fixed]): 125ms

    Storage::

        %timeit df.to_hdf("cdsd_02069_02070.h5", "df", format="fixed")  337ms
        %timeit df.to_hdf("cdsd_02069_02070.h5", "df", format="table")  1.03s

    References
    ----------

    Note that CDSD-HITEMP is used as the line database for CO2 in HITEMP 2010

    .. [1] `HITEMP 2010, Rothman et al., 2010 <https://www.sciencedirect.com/science/article/pii/S002240731000169X>`_

    .. [2] `CDSD-4000 article, Tashkun et al., 2011 <https://www.sciencedirect.com/science/article/pii/S0022407311001154>`_

    .. [3] `CDSD-4000 database <ftp://ftp.iao.ru/pub/CDSD-4000/>`_

    See Also
    --------

    :func:`~radis.io.hitran.hit2df`
    """
    # The source file's modification time is the cache validity criterion
    source_mtime = time.ctime(getmtime(fname))
    metadata = {"last_modification": source_mtime}
    if load_wavenum_min and load_wavenum_max:
        assert load_wavenum_min < load_wavenum_max

    if verbose >= 2:
        print("Opening file {0} (format=CDSD {1}, cache={2})".format(
            fname, version, cache))
        print("Last Modification time: {0}".format(source_mtime))

    # Column layout depends on the CDSD flavour
    if version not in ("hitemp", "4000"):
        raise ValueError("Unknown CDSD version: {0}".format(version))
    columns = columns_hitemp if version == "hitemp" else columns_4000

    # ---- Try the cache file first
    cache_path = cache_file_name(fname)
    if cache and exists(cache_path):
        # cache is irrelevant if its wavenum_max is < requested minimum ...
        must_be_above = {}
        if load_wavenum_min:
            must_be_above["wavenum_max"] = load_wavenum_min
        # ... or if its wavenum_min is > requested maximum
        must_be_below = {}
        if load_wavenum_max:
            must_be_below["wavenum_min"] = load_wavenum_max

        cached_df = load_h5_cache_file(
            cache_path,
            cache,
            valid_if_metadata_is=metadata,
            relevant_if_metadata_above=must_be_above,
            relevant_if_metadata_below=must_be_below,
            current_version=radis.__version__,
            last_compatible_version=OLDEST_COMPATIBLE_VERSION,
            verbose=verbose,
        )
        # None means the cache could not be used: fall through and reparse
        if cached_df is not None:
            return cached_df

    # ---- Parse the whole file
    df = parse_hitran_file(fname, columns)

    # Keep numeric columns only, converting the branch letters first
    if drop_non_numeric:
        replace_PQR_with_m101(df)
        df = drop_object_format_columns(df, verbose=verbose)

    # TODO : get only wavenum above/below 'load_only_wavenum_above', 'load_only_wavenum_below'
    # by parsing df.wav.   Completely irrelevant files are discarded in 'load_h5_cache_file'
    # but files that have partly relevant lines are fully loaded.
    # Note : cache file is generated with the full line list.

    if not cache:
        return df

    # ---- Cache requested but not usable above: (re)generate it
    new_metadata = dict(
        # Last modification time of the original file :
        last_modification=time.ctime(getmtime(fname)),
        wavenum_min=df.wav.min(),
        wavenum_max=df.wav.max(),
    )
    if verbose:
        print("Generating cache file {0} with metadata :\n{1}".format(
            cache_path, new_metadata))
    try:
        save_to_hdf(
            df,
            cache_path,
            metadata=new_metadata,
            version=radis.__version__,
            key="df",
            overwrite=True,
            verbose=verbose,
        )
    except PermissionError:
        # Not being able to write the cache is not fatal
        if verbose:
            print(
                "An error occured in cache file generation. Lookup access rights"
            )

    return df