Example #1
def read_flux_files(
    file_dir,
    file_pre,
    tracer_fp=None,
    diag_fp=None
):
    """
    Since scale factors and results are examined on a monthly time-scale, raw
    3hr flux files need to be processed to produce a monthly flux for each grid
    point.

    Array objects within have 72x46 dimensions

    Assumptions -
    1. flux files are bpch files

    Parameters:
        file_dir  (str) : directory where files are stored
        file_pre  (str) : prefix for flux files, e.g. nep.geos.4x5.2010
        tracer_fp (str) : path to relevant tracer file
                          (if none, will look in file_dir)
        diag_fp   (str) : path to relevant diag file
                          (if none, will look in file_dir)

    Returns:
        xbpch object which will contain a flux of interest in addition to
        dimension parameters (e.g. lon/lat/lev)
    """
    if tracer_fp:
        tracer_fp_1 = tracer_fp
    else:
        tracer_fp_1 = file_dir + '/tracerinfo.dat'

    if diag_fp:
        diag_fp_1 = diag_fp
    else:
        diag_fp_1 = file_dir + '/diaginfo.dat'

    # find the flux file names
    file_names = sorted(glob(file_dir + '/%s*' % file_pre))

    assert len(file_names) > 0

    # read in all the prior fluxes
    fluxes = xbpch.open_mfbpchdataset(
        file_names,
        dask=True,
        tracerinfo_file=tracer_fp_1,
        diaginfo_file=diag_fp_1
    )

    return fluxes
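
A minimal usage sketch (not part of the original example); the directory and prefix below are hypothetical, and tracerinfo.dat/diaginfo.dat are assumed to sit alongside the flux files:

from glob import glob  # the function above relies on glob and xbpch being imported
import xbpch

# Hypothetical paths, for illustration only
fluxes = read_flux_files(
    file_dir='/data/fluxes/2010',
    file_pre='nep.geos.4x5.2010'
)
print(fluxes.dims)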
Example #2
def get_mean_fluxes(directory_path, file_prefix, month, flux_field=FLUX_FIELD):
    """
    Obtain mean fluxes for month of interest
        -- can be used for truth and prior.

    Parameters:
        directory_path (str) : see generate_flux_filenames docstring
        file_prefix    (str) : see generate_flux_filenames docstring
        month          (int) : integer representation of month of interest
        flux_field     (str) : name of flux field in flux files

    Returns:
        dictionary with following keys (all numpy arrays)
        - flux
        - latitude
        - longitude
        - time
    """
    assert isinstance(month, int)
    assert month > 0
    assert month < 13

    # get the flux files of interest
    flux_files = generate_flux_filenames(directory_path=directory_path,
                                         file_prefix=file_prefix)

    # read in the fluxes
    tracer_path = directory_path + 'tracerinfo.dat'
    diag_path = directory_path + 'diaginfo.dat'
    fluxes = xbpch.open_mfbpchdataset(flux_files,
                                      dask=True,
                                      tracerinfo_file=tracer_path,
                                      diaginfo_file=diag_path)

    # find the time indices of interest: all timestamps before the first day
    # of the following month (December rolls over into the next year)
    if month == 12:
        month_end = np.datetime64('1986-01-01')
    else:
        month_end = np.datetime64('1985-%02d-01' % (month + 1))
    time_idxs = np.where(fluxes.time.values < month_end)[0]

    # filter the fluxes and find the mean
    month_fluxes = fluxes[flux_field].values[time_idxs, :, :].mean(axis=0)

    return {
        'flux': month_fluxes,
        'latitude': fluxes.lat.values,
        'longitude': fluxes.lon.values,
        'time': fluxes.time.values[time_idxs]
    }
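
A hedged usage sketch (hypothetical directory and prefix; FLUX_FIELD and the generate_flux_filenames helper are assumed to be defined elsewhere in the module, as in the snippet above):

import numpy as np
import xbpch

# Mean prior flux for July over a hypothetical flux directory
july = get_mean_fluxes(directory_path='/data/prior/',
                       file_prefix='nep.geos.4x5.2010',
                       month=7)
print(july['flux'].shape, july['latitude'].shape, july['longitude'].shape)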
Example #3
def read_gc(fname,varname,cat='IJ-AVG-$',
            gc_dir = '/short/m19/jaf574/GC.v11-01/runs.v11-02e/geosfp_025x03125_tropchem_au.base/',
            **kwargs):

    # Some species involve multiple GEOS-Chem species...
    varname_gc = gcname_to_names(varname)

    # Expand wildcard if necessary and prepend the run directory
    if isinstance(fname, str):
        if '*' in fname:
            fname = glob(gc_dir + fname)
        else:
            fname = [gc_dir + fname]
    else:
        fname = [gc_dir + f for f in fname]
    # Put files in order!
    fname.sort()

    # Read using xbpch
    # one file
    if len(fname) == 1:
        ds = open_bpchdataset(fname[0], categories=[cat, ], fields=varname_gc,
                              diaginfo_file=gc_dir + 'diaginfo.dat',
                              tracerinfo_file=gc_dir + 'tracerinfo.dat', **kwargs)

    # multiple files
    else:
        ds = open_mfbpchdataset(fname, dask=True, categories=[cat, ], fields=varname_gc,
                                diaginfo_file=gc_dir + 'diaginfo.dat',
                                tracerinfo_file=gc_dir + 'tracerinfo.dat', **kwargs)

    # load dataset
    ds.load()

    # extract variables
    cat=cat.replace('$','S').replace('-','_')
    dfg = ds[[cat+'_'+v for v in varname_gc]]

    # If needed, sum GEOS-Chem variables
    if len(varname_gc) > 1:
        dfg = sum_gc_vars(dfg, [cat+'_'+v for v in varname_gc],
                                varname=cat+'_'+varname)

    return dfg
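
A usage sketch under the assumption that gcname_to_names and sum_gc_vars are available in the same module; the wildcard filename is hypothetical:

from glob import glob
from xbpch import open_bpchdataset, open_mfbpchdataset

# Read ozone from every matching trac_avg file in the default run directory
ds_o3 = read_gc('trac_avg.geosfp_025x03125_tropchem.2017*', 'O3')
print(ds_o3)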
Example #4
def read_daily_flux(flux_fp, flux_prefix, lb, ub, tracerfile_path,
                    diagfile_path):
    """
    Reads in a sequence of daily fluxes.

    Parameters:
        flux_fp         (str) : file path to fluxes
        flux_prefix     (str) : prefix to each flux file of interest
        lb              (int) : inclusive lower bound of flux file number
        ub              (int) : inclusive upper bound of flux file number
        tracerfile_path (str) : path to tracerinfo.dat
        diagfile_path   (str) : path to diaginfo.dat

    Returns:
        xarray core dataset containing fluxes
    """
    assert isinstance(flux_fp, str)
    assert isinstance(flux_prefix, str)

    # create a list of files to read in
    file_suffs = [f'{i:03}' for i in range(lb, ub + 1)]

    # create a list of flux filepaths
    flux_fps = [
        '%s/%s%s' % (flux_fp, flux_prefix, suff) for suff in file_suffs
    ]

    # check that the above files exist
    for flux_file in flux_fps:
        assert os.path.exists(flux_file)

    # read in the files
    fluxes = xbpch.open_mfbpchdataset(flux_fps,
                                      dask=True,
                                      tracerinfo_file=tracerfile_path,
                                      diaginfo_file=diagfile_path)

    return fluxes
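
A minimal usage sketch with hypothetical arguments (daily files 001 through 031 for a single month):

import os
import xbpch

fluxes = read_daily_flux(flux_fp='/data/fluxes/2010',
                         flux_prefix='nep.geos.4x5.2010.',
                         lb=1,
                         ub=31,
                         tracerfile_path='/data/fluxes/2010/tracerinfo.dat',
                         diagfile_path='/data/fluxes/2010/diaginfo.dat')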
Example #5
            print("   " + fn)
        sys.exit(1)

    # Else, we should be good to read in and concatenate
    open_kws = {
        "tracerinfo_file": args.tracerinfo,
        "diaginfo_file": args.diaginfo,
        "memmap": True,
        "dask": True
    }

    print("\nReading in file(s)...")
    if len(args.bpch_files) == 1:
        ds = open_bpchdataset(args.bpch_files[0], **open_kws)
    else:
        ds = open_mfbpchdataset(args.bpch_files, **open_kws)

    # This block of code is a hack to fix the encoding of attributes
    # on the DataArrays in this Dataset. They are being
    # set at a very low level when we read in the data, and manually
    # specifying the encoding doesn't work.
    # However, deleting them from the attributes dict
    # doesn't end up removing them from the final output file - they get
    # written just fine.
    print("\nDecoding variables...")
    for v in ds.data_vars:
        da = ds[v]
        da = _maybe_del_attr(da, 'scale_factor')
        da = _maybe_del_attr(da, 'units')
        da = _maybe_decode_attr(da, 'hydrocarbon')
        da = _maybe_decode_attr(da, 'chemical')
Example #6
def generate_seaonal_data(dir_path, start_year, file_base, parameter):
    """
    Creates a dictionary for each of the seasons --
        - DJF
        - MAM
        - JJA
        - SON

    Parameters:
        dir_path   (str) : path to directory containing files
        start_year (int) : starting year for the fluxes
        file_base  (str) : form of files of interest
        parameter  (str) : parameter of interest in the underlying binary
                           punch files

    Returns:
        dictionary with keys corresponding to each of the seasons above. Each
        value is a list of all the days in that season.
        Also, has keys for
            - time
            - latitude
            - longitude

    NOTE:
    - files are assumed to be of the form file_base + .###, e.g.
      nep.geos.4x5.2010.001
    - the tracerinfo and diaginfo files are assumed to be in dir_path
    - files are assumed to be split into 3-hour increments
    """
    START = datetime.now()

    # get all file names
    file_names = glob.glob(dir_path + file_base + '*')

    # read in the files
    fluxes = xbpch.open_mfbpchdataset(paths=file_names,
                                      dask=True,
                                      tracerinfo_file=dir_path +
                                      'tracerinfo.dat',
                                      diaginfo_file=dir_path + 'diaginfo.dat')
    print('Read in fluxes from %s' % (dir_path + file_base))
    print('Elapsed time: %i seconds' % (datetime.now() - START).seconds)

    # generate flux dates from the given starting year
    end_date = np.datetime64('%s-01-01' % (start_year + 1))
    flux_dates = []
    start_date = np.datetime64('%s-01-01' % start_year)
    current_date = start_date

    while current_date < end_date:
        flux_dates.append(current_date)
        current_date += np.timedelta64(3, 'h')

    flux_dates = np.array(flux_dates)

    # create seasonal indices
    djf_indx, mam_indx, jja_indx, son_indx = create_season_indices(flux_dates)

    # create seasonal arrays
    djf_arr = fluxes[parameter].values[djf_indx, :, :]
    mam_arr = fluxes[parameter].values[mam_indx, :, :]
    jja_arr = fluxes[parameter].values[jja_indx, :, :]
    son_arr = fluxes[parameter].values[son_indx, :, :]

    # get latitude and longitude
    lat_arr = fluxes.lat.values
    lon_arr = fluxes.lon.values

    return {
        'time': flux_dates,
        'lat': lat_arr,
        'lon': lon_arr,
        'djf': djf_arr,
        'mam': mam_arr,
        'jja': jja_arr,
        'son': son_arr
    }
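
The create_season_indices helper is not shown in this example. A minimal sketch of what such a helper might look like, assuming it buckets a numpy datetime64 array by meteorological season (an illustration, not the original implementation):

import numpy as np

def create_season_indices(dates):
    """Return index arrays (DJF, MAM, JJA, SON) for a datetime64 array."""
    # months since 1970-01, modulo 12, gives the calendar month 1..12
    months = dates.astype('datetime64[M]').astype(int) % 12 + 1
    djf = np.where(np.isin(months, (12, 1, 2)))[0]
    mam = np.where(np.isin(months, (3, 4, 5)))[0]
    jja = np.where(np.isin(months, (6, 7, 8)))[0]
    son = np.where(np.isin(months, (9, 10, 11)))[0]
    return djf, mam, jja, son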
Example #7
def bpch_to_netCDF(folder=None, filename='ctm.nc', bpch_file_list=None,
                   remake=False, filetype="*ctm.bpch*",
                   check4_trac_avg_if_no_ctm_bpch=True, backend='PyGChem',
                   verbose=False, **kwargs):
    """
    Converts GEOS-Chem ctm.bpch output file(s) to NetCDF

    Parameters
    ----------
    folder (str): working directory for data files
    filename (str): name to give created NetCDF
    bpch_file_list (list): list of files to convert
    remake (bool): overwrite existing NetCDF file
    filetype (str): string with wildcards to match filenames
    ( e.g. *ctm.bpch*, trac_avg.*, or *ts*bpch* )
    verbose (bool): print (minor) logging to screen

    Returns
    -------
    (None) saves a NetCDF file to disk
    """
    import os
    # Resolve the working folder and build the output NetCDF path
    from .bpch2netCDF import get_folder
    folder = get_folder(folder)
    output_file = os.path.join(folder, filename)

    # If the netCDF file already exists, don't overwrite it unless remake=True.
    if not remake:
        if os.path.exists(output_file):
            logging.warning(output_file + ' already exists. Not recreating.')
            return

    # Look for files if file list is not provided.
    if isinstance(bpch_file_list, type(None)):
        logging.debug("Searching for the following bpch filetype: {filetype}"
                      .format(filetype=filetype))
        bpch_files = glob.glob(folder + '/' + filetype)
        # Also check if directory contains *trac_avg* files, if no ctm.bpch
        if (len(bpch_files) == 0) and check4_trac_avg_if_no_ctm_bpch:
            filetype = '*trac_avg*'
            logging.info('WARNING! - now trying filetype={}'.format(filetype))
            bpch_files = glob.glob(folder + '/' + filetype)
        # Raise error if no files matching filetype
        if len(bpch_files) == 0:
            logging.error("No bpch files ({}) found in {}".format(filetype,
                                                                  folder))
            raise IOError("{} contains no bpch files.".format(folder))

    # Use the specified files.
    else:
        file_list = []
        for bpch_file in bpch_file_list:
            full_path = folder + '/' + bpch_file
            if not os.path.exists(full_path):
                logging.error(full_path + " could not be found")
                raise IOError("Full path could not be found")
            file_list.append(full_path)
        bpch_files = file_list

    # Open the bpch files
    logging.debug("The following bpch files were found (n={}):"
                  .format(len(bpch_files)))
    logging.debug(str(bpch_files))
    if verbose:
        print(("Creating a netCDF from {} file(s).".format(len(bpch_files)) +
               " This can take some time..."))
    if backend == 'PyGChem':
        # Load all the files into memory
        bpch_data = datasets.load(bpch_files)
        # Save the netCDF file
        datasets.save(bpch_data, output_file)
    elif backend == 'xbpch':
        import xbpch
        # Load all the files into memory (as xarray dataset object)
        ds = xbpch.open_mfbpchdataset(bpch_files)
        # save through xarray dataset object
        ds.to_netcdf(output_file, unlimited_dims={'time_counter': True})
    elif backend == 'iris':
        #    iris.fileformats.netcdf.save(data, output_file)
        print('WARNING NetCDF made by iris is non CF-compliant')
    elif backend == 'PNC':
        import PseudoNetCDF as pnc
        import xarray as xr
        if len(bpch_files) == 1:
            bpch_to_netCDF_via_PNC(filename=filename,
                                   output_file=output_file, bpch_file=bpch_files[0])
        # Individually convert bpch files if more than one file
        if len(bpch_files) > 1:
            for n_bpch_file, bpch_file in enumerate(bpch_files):
                bpch_to_netCDF_via_PNC(filename=filename,
                                       output_file='TEMP_{}_'.format(
                                           n_bpch_file)+filename,
                                       bpch_file=bpch_file)
            # - Combine the NetCDF files with xarray
            TEMP_ncfiles = glob.glob(folder+'TEMP_*_'+filename)
            # Open files with xarray
            ds_l = [xr.open_dataset(i) for i in TEMP_ncfiles]
            # Concatenate the individual datasets along the time dimension
            ds = xr.concat(ds_l, dim='time')
            # Now save the combined file
            ds.to_netcdf(folder+filename,
                         unlimited_dims={'time_counter': True})
            # Remove the temporary files
            for TEMP_ncfile in TEMP_ncfiles:
                os.remove(TEMP_ncfile)

    logging.info("A netCDF file has been created with the name {ctm}"
                 .format(ctm=output_file))
    return
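
A hedged usage sketch; the run directory is hypothetical, and the xbpch backend is chosen to avoid the PyGChem dependency:

# Convert all *ctm.bpch* files in a (hypothetical) run directory to ctm.nc
bpch_to_netCDF(folder='/scratch/geos_run', filename='ctm.nc',
               backend='xbpch', remake=True, verbose=True)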
Example #8
import xbpch
from dask.diagnostics import ProgressBar

from os.path import join

# First we need to read in some data. We'll read a multi-file ND49 BPCH
# dataset using the xbpch package.
dates = ["200601{:02d}".format(d) for d in range(1, 22)]
ROOT = "/Users/daniel/workspace/bpch/test_data/"
fns = [
    join(ROOT, "ND49_{}_ref_e2006_m2010.bpch".format(date)) for date in dates
]
nd49_data = xbpch.open_mfbpchdataset(
    fns,
    diaginfo_file="/Users/daniel/Desktop/sample_nd49/diaginfo.dat",
    tracerinfo_file="/Users/daniel/Desktop/sample_nd49/tracerinfo.dat",
    dask=True,
    memmap=True,
)
o3_data = nd49_data['IJ_AVG_S_O3']
with ProgressBar():
    print("Loading data into memory")
    o3_data.load()

# Second, we compute the 8-hour rolling averages for the ozone.
avg_8hr_o3 = (o3_data.rolling(time=8, min_periods=6).mean())

# By default, rolling labels each window with its last timestamp, i.e. each
# timestamp corresponds to the preceding 8 hours. We want the labels to refer
# to the following 8 hours instead, so we can adjust them with datetime
# arithmetic.
times_np = avg_8hr_o3.time.values
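
The snippet cuts off before the datetime arithmetic it mentions. One way to relabel the rolling windows so each timestamp marks the start of its 8-hour interval (a sketch only, assuming hourly ND49 output; the original script's exact offset is not shown here):

import numpy as np

# Shift each label back by 7 hours so it names the first hour of its window
shifted_times = times_np - np.timedelta64(7, 'h')
avg_8hr_o3 = avg_8hr_o3.assign_coords(time=shifted_times)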
Example #9
def generate_txt_files(
    bpch_files, output_dir, tracer_path, diag_path,
    co2_var_nm='CO2_SRCE_CO2bf',
    dims=(8, 72, 46)
):
    """
    Creates one txt file for each binary punch file path provided in
    bpch_files. The expected dimension of each day's flux file is shown in
    the "dims" variable.

    When flattening arrays, the indices move fastest on the right side, so
    latitude moves the fastest, followed by longitude, followed by time.

    e.g.
     input  - [nep.geos.4x5.001, nep.geos.4x5.002] <- bpch files
     output = [nep.geos.4x5.001, nep.geos.4x5.002] <- txt files

    Parameters:
        bpch_files  (list)  : an ordered sequential collection of daily
                              bpch file paths
        output_dir  (str)   : output directory for txt files
        tracer_path (str)   : path to tracer file
        diag_path   (str)   : path to diag file
        co2_var_nm  (str)   : name of co2 variable of interest
        dims        (tuple) : time/lon/lat array size tuple

    Returns:
        None - writes one txt file per input bpch file into output_dir
    """
    # read in the binary punch files
    bpch_data = xbpch.open_mfbpchdataset(
        bpch_files,
        dask=True,
        tracerinfo_file=tracer_path,
        diaginfo_file=diag_path
    )

    # extract the array from the above
    bpch_arr = bpch_data[co2_var_nm].values

    # create new output file names
    output_file_nms = [
        output_dir + '/' + fp.split('/')[-1] for fp in bpch_files
    ]

    # create time indices to extract each day
    time_idxs = np.arange(
        0, dims[0] * len(output_file_nms)
    ).reshape(len(output_file_nms), dims[0])

    # for each output file name, generate a new text file
    for time_count, output_file_nm in enumerate(output_file_nms):

        # find the time indices for this file
        time_idx = time_idxs[time_count, :]

        # create a flattened version of the above data with the time filter
        data_arr = bpch_arr[time_idx, :, :]
        assert data_arr.shape == dims

        data_flat = data_arr.flatten()

        # write to file
        np.savetxt(fname=output_file_nm, X=data_flat)
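
A small self-contained check of the flattening order described in the docstring (C order: the rightmost axis, latitude, varies fastest):

import numpy as np

arr = np.arange(8 * 72 * 46).reshape(8, 72, 46)  # (time, lon, lat)
flat = arr.flatten()
assert flat[1] == arr[0, 0, 1]        # latitude index advances first
assert flat[46] == arr[0, 1, 0]       # then longitude, every 46 elements
assert flat[72 * 46] == arr[1, 0, 0]  # then time, every 72*46 elements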
Example #10
def generate_nc_files(
    bpch_files, output_dir, tracer_path, diag_path,
    co2_var_nm='CO2_SRCE_CO2bf',
    dims=(72, 46, 8)
):
    """
    Creates one netcdf file for each binary punch file path provided in
    bpch_files.

    e.g.
     input  - [nep.geos.4x5.001, nep.geos.4x5.002] <- bpch files
     output = [nep.geos.4x5.001, nep.geos.4x5.002] <- netcdf files

    Parameters:
        bpch_files  (list)  : an ordered sequential collection of daily
                              bpch file paths
        output_dir  (str)   : output directory for netcdf files
        tracer_path (str)   : path to tracer file
        diag_path   (str)   : path to diag file
        co2_var_nm  (str)   : name of co2 variable of interest
        dims        (tuple) : lon/lat/time array size tuple

    Returns:
        None - writes one netcdf file per input bpch file into output_dir
    """
    # read in the binary punch files
    bpch_data = xbpch.open_mfbpchdataset(
        bpch_files,
        dask=True,
        tracerinfo_file=tracer_path,
        diaginfo_file=diag_path
    )

    # create new output file names
    output_file_nms = [
        output_dir + '/' + fp.split('/')[-1] for fp in bpch_files
    ]

    # extract coordinate and flux arrays from the combined dataset
    lon = bpch_data.variables['lon'].values
    lat = bpch_data.variables['lat'].values
    time = bpch_data.variables['time'].values
    co2_arr = bpch_data.variables[co2_var_nm].values

    # create time indices to extract each day
    time_idxs = np.arange(
        0, dims[2] * len(output_file_nms)
    ).reshape(len(output_file_nms), dims[2])

    # create netcdf files
    for time_count, file_nm in enumerate(output_file_nms):

        # find the time indices for this file
        time_idx = time_idxs[time_count, :]

        # create netcdf file with time_count index co2 values
        create_netcdf_flux_file(
            write_loc=file_nm,
            lon=lon,
            lat=lat,
            time=time[time_idx],
            co2_vals=co2_arr[time_idx, :, :],
            co2_field_nm=co2_var_nm
        )
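
A usage sketch with hypothetical paths; create_netcdf_flux_file is assumed to be defined elsewhere in the module:

import numpy as np
import xbpch

daily_files = ['/data/fluxes/nep.geos.4x5.001', '/data/fluxes/nep.geos.4x5.002']
generate_nc_files(bpch_files=daily_files,
                  output_dir='/data/netcdf',
                  tracer_path='/data/fluxes/tracerinfo.dat',
                  diag_path='/data/fluxes/diaginfo.dat')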
Example #11
def run(bpch_use,
        true_flux_dir,
        prior_flux_dir,
        true_flux_prefix,
        prior_flux_prefix,
        lat_lon_dir,
        tracerinfo_path,
        diaginfo_path,
        output_dir,
        varname_oi='CO2_SRCE_CO2bf',
        TOL=0.000001):
    """
    Run the steps required to make the scaled fluxes and write them to disk.

    At the moment, this function scales the prior down to have the same global
    flux as the truth.

    Parameters:
        bpch_use          (bool)  : switch to indicate that bpch files are
                                    flux input
        true_flux_dir     (str)   : directory location of true fluxes
        prior_flux_dir    (str)   : directory location of prior fluxes
        true_flux_prefix  (str)   : e.g. 'nep.geos.4x5.2010.'
        prior_flux_prefix (str)   : e.g. 'nep.geos.4x5.'
        lat_lon_dir       (str)   : directory where lat/lon arrays can be found
                                    (used when bpch_use==False)
        tracerinfo_path   (str)   : location of tracerinfo file for
                                    reading bpch
        diaginfo_path     (str)   : location of diaginfo file for reading bpch
        output_dir        (str)   : directory of txt output files
        varname_oi        (str)   : variable to extract from the bpch objects
        TOL               (float) : tolerance of new integrated flux

    Returns:
        writes daily flux txt files to output_dir
    """
    # find the sorted file names
    prior_files = sorted(glob(prior_flux_dir + '/' + prior_flux_prefix + '*'),
                         key=lambda x: int(x[-3:]))
    true_files = sorted(glob(true_flux_dir + '/' + true_flux_prefix + '*'),
                        key=lambda x: int(x[-3:]))

    if bpch_use:

        # read in the fluxes
        prior_data = xbpch.open_mfbpchdataset(prior_files,
                                              dask=True,
                                              tracerinfo_file=tracerinfo_path,
                                              diaginfo_file=diaginfo_path)
        print('Prior fluxes acquired')
        true_data = xbpch.open_mfbpchdataset(true_files,
                                             dask=True,
                                             tracerinfo_file=tracerinfo_path,
                                             diaginfo_file=diaginfo_path)
        print('True fluxes acquired')

        # extract flux arrays from the xbpch objects
        prior_arr = prior_data.variables[varname_oi].values
        true_arr = true_data.variables[varname_oi].values

        # get longitude/latitude arrays
        lons = prior_data.variables['lon'].values
        lats = prior_data.variables['lat'].values

    else:

        # read in the fluxes
        prior_arr = cio.read_flux_txt_files(flux_files=prior_files)
        true_arr = cio.read_flux_txt_files(flux_files=true_files)

        # read in lat/lon
        lons = np.load(lat_lon_dir + '/lon.npy')
        lats = np.load(lat_lon_dir + '/lat.npy')

    print('=== Flux array dimensions ===')
    print('Prior : %s' % str(prior_arr.shape))
    print('Truth : %s' % str(true_arr.shape))
    print('Lon   : %s' % str(lons.shape))
    print('Lat   : %s' % str(lats.shape))

    # determine global integral of prior and posterior
    prior_global_flux = ccomp.compute_global_flux(flux_arr=prior_arr,
                                                  lons=lons,
                                                  lats=lats)
    true_global_flux = ccomp.compute_global_flux(flux_arr=true_arr,
                                                 lons=lons,
                                                 lats=lats)

    # find the scalar multiplier
    scl_mult = true_global_flux / prior_global_flux
    print('Scalar multiplier : %.5f' % scl_mult)

    # scale the prior
    prior_arr_scl = scl_mult * prior_arr

    # compute new integrated flux
    prior_global_flux_scl = ccomp.compute_global_flux(flux_arr=prior_arr_scl,
                                                      lons=lons,
                                                      lats=lats)
    scl_mult_updated = true_global_flux / prior_global_flux_scl

    if np.abs(scl_mult_updated - 1) > TOL:
        print('Normalized flux deviation exceeds tolerance: %.10f' %
              np.abs(scl_mult_updated - 1))
        exit()

    # write the new flux array to directory
    cio.generate_txt_files_np(flux_arr=prior_arr_scl,
                              bpch_files=prior_files,
                              output_dir=output_dir)
Example #12
def read_bpch(path,keys):
    '''
        Read a generic bpch file (or files) into a dictionary.
        keys = the keys you want to read
    '''
    paths=path
    if __VERBOSE__:
        print('GC_fio.read_bpch called on paths:')
        print(path)
    multi=False
    if isinstance(path,list):
        if len(path) > 1:
            multi=True
        path=path[0]
    if '*' in path:
        multi=True

    # make sure coordinates are in keys list
    keys = list(set(keys + GC_coords)) # set removes any duplicates

    # assume tracerinfo and diaginfo in same folder:
    # otherwise use my generic one with coalesced info
    splt=path.split('/')
    splt[-1]='tracerinfo.dat'
    tracinf='/'.join(splt)
    if not os.path.isfile(tracinf):
        tracinf='Data/GC_Output/tracerinfo.dat'

    splt[-1]='diaginfo.dat'
    diaginf='/'.join(splt)
    if not os.path.isfile(diaginf):
        diaginf='Data/GC_Output/diaginfo.dat'


    # Improve read performance by only reading requested fields:
    fields=set(); categories=set()
    for key in keys:
        if '_' in key:
            # Split key on the underscores: Category_Field
            c,_,f = key.rpartition('_')
            categories.add(c)
            fields.add(f)
        else:
            fields.add(key)
    if __VERBOSE__:
        print("categories: ",categories)
        print("fields: ",fields)

    # get bpch file:
    data={}
    attrs={}
    bpchargs={'fields':list(fields), 'categories':list(categories),
              'tracerinfo_file':tracinf,'diaginfo_file':diaginf,
              'decode_cf':False,'dask':True}
    mod_times=[]
    if multi:
        ds=open_mfbpchdataset(paths,**bpchargs)
        for p in paths:
            mod_times.append(time.ctime(os.path.getmtime(p)))
    else:
        ds=open_bpchdataset(path,**bpchargs)
        mod_times=[time.ctime(os.path.getmtime(path))]

    data,attrs=dataset_to_dicts(ds,keys)
    data['modification_times']=np.array(mod_times)
    attrs['modification_times']={'desc':'When was file last modified'}
    return data,attrs
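
A usage sketch; the file paths are hypothetical, and the keys follow the Category_Field convention that the function splits on (GC_coords, __VERBOSE__ and dataset_to_dicts are assumed to be available as in the snippet):

# Read O3 and NO2 averages from two (hypothetical) trac_avg files
data, attrs = read_bpch(['Data/GC_Output/trac_avg.20150101.bpch',
                         'Data/GC_Output/trac_avg.20150102.bpch'],
                        keys=['IJ-AVG-$_O3', 'IJ-AVG-$_NO2'])
print(sorted(data.keys()))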