Exemple #1
0
def _get_linereturnformat(data, columns, fname=""):
    """
    Get line return character & format (size).

    Notes
    -----

    We cannot simply infer it from the OS :
    problem arise when file was written in an OS and read in another OS (for instance,
    line return characters are not converted when read from .egg files). Here
    we read the first line and infer the line return character for it
    """
    # fname just for the error message

    # get format (size) of line return
    from radis.misc.basics import to_str

    linereturn = to_str(data[0][-1])
    if to_str("\r\n") in linereturn:
        linereturnformat = "a2"
    elif to_str("\n") in linereturn or to_str("\r") in linereturn:
        linereturnformat = "a1"
    else:
        raise ValueError(
            "Unknown Line return format: {0}. Check that your file {1} has the HITRAN format. First line : {2}"
            .format(linereturn, fname, data[0]))

    return linereturnformat
Exemple #2
0
def hit2df(fname, count=-1, cache=False, verbose=True):
    """Convert a HITRAN/HITEMP [1]_ file to a Pandas dataframe


    Parameters
    ----------

    fname: str
        HITRAN-HITEMP file name

    count: int
        number of items to read (-1 means all file)

    cache: boolean
        if True, a pandas-readable HDF5 file is generated on first access,
        and later used. This saves on the datatype cast and conversion and
        improves performances a lot (but changes in the database are not
        taken into account). If False, no database is used. If 'regen', temp
        file are reconstructed. Default False.

    verbose: boolean
        if True, print a message when a cache file is used, deleted or
        generated, and when the cache file generation fails. Default True.


    Returns
    -------

    df: pandas Dataframe
        dataframe containing all lines and parameters


    Raises
    ------

    ValueError
        if no line could be read from the file, if the line return
        character of the first line is not recognized, if no molecule id
        was read, or if more than one molecule id is found in the file.


    References
    ----------


    .. [1] `HITRAN 1996, Rothman et al., 1998 <https://www.sciencedirect.com/science/article/pii/S0022407398000788>`__



    Notes
    -----

    Performances: see CDSD-HITEMP parser

    """

    columns = columns_2004

    if cache:  # lookup if cached file exist.
        fcache = splitext(fname)[0] + '.h5'
        if exists(fcache):
            if cache == 'regen':
                # Drop the stale cache; it is rebuilt at the end of this call
                os.remove(fcache)
                if verbose:
                    print('Deleted h5 cache file : {0}'.format(fcache))
            else:
                if verbose:
                    print('Using h5 file: {0}'.format(fcache))
                return pd.read_hdf(fcache, 'df')

    # Detect the molecule by reading the start of the file
    with open(fname) as f:
        mol = get_molecule(int(f.read(2)))

    # %% Start reading the full file

    # To be faster, we read file totally in bytes mode with fromfiles. But that
    # requires to properly decode the line return character:

    # problem arise when file was written in an OS and read in another OS (for instance,
    # line return characters are not converted when read from .egg files). Here
    # we read the first line and infer the line return character for it

    # ... Raw (binary) format of every column, in file order
    raw_fields = [(k, c[0]) for (k, c) in columns.items()]

    # ... _linereturn is to capture the line return symbol. We delete it afterwards
    dt = _format_dtype(raw_fields + [('_linereturn', 'a2')])
    data = np.fromfile(fname, dtype=dt, count=1)  # just read the first line

    # Without this guard a file shorter than one record fails below with a
    # bare IndexError on ``data[0]``
    if len(data) == 0:
        raise ValueError(
            'No line could be read from file {0}. Check that it is not empty '
            'and has the HITRAN format.'.format(fname))

    # get format of line return
    from radis.misc.basics import to_str
    linereturn = to_str(data[0][-1])
    if to_str('\r\n') in linereturn:
        linereturnformat = 'a2'
    elif to_str('\n') in linereturn or to_str('\r') in linereturn:
        linereturnformat = 'a1'
    else:
        raise ValueError(
            'Line return format unknown: {0}. Please update RADIS'.format(
                linereturn))

    # Now re-read with correct line return character
    linereturn_field = [('_linereturn', linereturnformat)]
    dt = _format_dtype(raw_fields + linereturn_field)
    data = np.fromfile(fname, dtype=dt, count=count)

    # ... Cast to new type
    # This requires to recast all the data already read, but is still the fastest
    # method I found to read a file directly (for performance benchmark see
    # CDSD-HITEMP parser)
    # (str columns keep their raw format here; they are decoded further below)
    newtype = [c[0] if (c[1] == str) else c[1] for c in columns.values()]
    dtype = list(zip(list(columns.keys()), newtype)) + linereturn_field
    data = _cast_to_dtype(data, dtype)

    # %% Create dataframe
    df = pd.DataFrame(data.tolist(),
                      columns=list(columns.keys()) + ['_linereturn'])

    # assert one molecule per database only. Else the groupbase data reading
    # above doesnt make sense
    nmol = len(set(df['id']))
    if nmol == 0:
        raise ValueError('Databank looks empty')
    elif nmol != 1:
        # Crash, give explicit error messages.
        # At least 2 distinct ids implies at least 2 rows, so row 1 exists.
        secondline = df.iloc[1]
        raise ValueError(
            ('Multiple molecules in database ({0}). Current '
             'spectral code only computes 1 species at the time. Use MergeSlabs. '
             'Verify the parsing was correct by looking at the first row below: '
             '\n{1}\n----------------\nand the second row '
             'below: \n{2}').format(nmol, df.iloc[0], secondline))

    # Columns declared as str were read as bytes: decode them
    for k, c in columns.items():
        if c[1] == str:
            df[k] = df[k].str.decode("utf-8")

    # %% Add local quanta attributes, based on the HITRAN group
    df = parse_local_quanta(df, mol)

    # %% Add global quanta attributes, based on the HITRAN class
    df = parse_global_quanta(df, mol)

    # Strip whitespaces around PQR columns (due to 2 columns jumped)
    if 'branch' in df:
        df['branch'] = df.branch.str.strip()

    # Delete dummy column than handled the line return character
    del df['_linereturn']

    if cache:  # cached file mode but cached file doesn't exist yet (else we had returned)
        if verbose:
            print('Generating cached file: {0}'.format(fcache))
        try:
            _generate_cache_file(fcache, df)
        except Exception:
            # Caching is best effort: the dataframe is still returned. Only
            # Exception is caught so that KeyboardInterrupt and SystemExit
            # are not swallowed.
            if verbose:
                print(sys.exc_info())
                print(
                    'An error occured in cache file generation. Lookup access rights'
                )

    return df
Exemple #3
0
def parse_hitran_file(fname, columns, count):
    """Parse a file under HITRAN ``par`` format. Parsing is done in binary
    format with :py:func:`numpy.fromfile` so it's as fast as possible.

    Parameters
    ----------

    fname: str
        filename

    columns: dict
        maps each column name to a sequence whose first item is the raw
        (binary) format of the field and whose second item is the type the
        column is cast to. Columns whose second item is ``str`` keep their
        raw format and are decoded as UTF-8. Columns are read in the
        iteration order of the dict.

    count: int
        number of lines to read (passed as ``count`` to
        :py:func:`numpy.fromfile`)

    Returns
    -------

    df: pandas DataFrame
        dataframe with lines

    Raises
    ------

    ValueError
        if no line could be read from the file, or if the line return
        character of the first line is not recognized.

    Notes
    -----

    Part common to hit2df and cdsd2df

    """

    # To be faster, we read file totally in bytes mode with fromfiles. But that
    # requires to properly decode the line return character:

    # problem arise when file was written in an OS and read in another OS (for instance,
    # line return characters are not converted when read from .egg files). Here
    # we read the first line and infer the line return character for it

    # ... Raw (binary) format of every column, in file order
    raw_fields = [(k, c[0]) for (k, c) in columns.items()]

    # ... _linereturn is to capture the line return symbol. We delete it afterwards
    dt = _format_dtype(raw_fields + [("_linereturn", "a2")])
    data = np.fromfile(fname, dtype=dt, count=1)  # just read the first line

    # Without this guard an empty (or too short) file fails below with a bare
    # IndexError on ``data[0]``
    if len(data) == 0:
        raise ValueError(
            "No line could be read from file {0}. Check that it is not empty "
            "and has the HITRAN format.".format(fname))

    # get format of line return
    from radis.misc.basics import to_str

    linereturn = to_str(data[0][-1])
    if to_str("\r\n") in linereturn:
        linereturnformat = "a2"
    elif to_str("\n") in linereturn or to_str("\r") in linereturn:
        linereturnformat = "a1"
    else:
        raise ValueError(
            "Unknown `Line return` format: {0}. Check that your file {1} has the HITRAN format."
            .format(linereturn, fname))

    # Now re-read with correct line return character
    linereturn_field = [("_linereturn", linereturnformat)]
    dt = _format_dtype(raw_fields + linereturn_field)
    data = np.fromfile(fname, dtype=dt, count=count)

    # ... Cast to new type
    # This requires to recast all the data already read, but is still the fastest
    # method I found to read a file directly (for performance benchmark see
    # CDSD-HITEMP parser)
    # (str columns keep their raw format here; they are decoded further below)
    newtype = [c[0] if (c[1] == str) else c[1] for c in columns.values()]
    dtype = list(zip(list(columns.keys()), newtype)) + linereturn_field
    data = _cast_to_dtype(data, dtype)

    # %% Create dataframe
    df = pd.DataFrame(data.tolist(),
                      columns=list(columns.keys()) + ["_linereturn"])

    # Delete dummy column than handled the line return character
    del df["_linereturn"]

    # Update format: columns declared as str were read as bytes
    for k, c in columns.items():
        if c[1] == str:
            df[k] = df[k].str.decode("utf-8")

    # Strip whitespaces around PQR columns (due to 2 columns jumped)
    if "branch" in df:  # (only in CDSD)
        df["branch"] = df.branch.str.strip()

    return df