def read_flux_files(file_dir, file_pre, tracer_fp=None, diag_fp=None):
    """
    Since scale factors and results are examined on a monthly time-scale,
    raw 3-hourly flux files need to be processed to produce a monthly flux
    for each grid point. Array objects within have 72x46 (lon x lat) dimensions.

    Assumptions:
        1. flux files are bpch files

    Parameters:
        file_dir  (str) : directory where files are stored
        file_pre  (str) : prefix for flux files, e.g. nep.geos.4x5.2010
        tracer_fp (str) : path to relevant tracer file (if None, will look in file_dir)
        diag_fp   (str) : path to relevant diag file (if None, will look in file_dir)

    Returns:
        xbpch object which will contain a flux of interest in addition
        to dimension parameters (e.g. lon/lat/lev)
    """
    if tracer_fp:
        tracer_fp_1 = tracer_fp
    else:
        tracer_fp_1 = file_dir + '/tracerinfo.dat'

    if diag_fp:
        diag_fp_1 = diag_fp
    else:
        diag_fp_1 = file_dir + '/diaginfo.dat'

    # find the flux file names
    file_names = sorted(glob(file_dir + '/%s*' % file_pre))
    assert len(file_names) > 0

    # read in all the prior fluxes
    fluxes = xbpch.open_mfbpchdataset(
        file_names,
        dask=True,
        tracerinfo_file=tracer_fp_1,
        diaginfo_file=diag_fp_1
    )

    return fluxes
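# --- Usage sketch (illustrative only): the directory, prefix, and field name
# below are hypothetical; it assumes 3-hourly bpch flux files plus
# tracerinfo.dat/diaginfo.dat all live in file_dir.
fluxes = read_flux_files(file_dir='/data/fluxes/2010',
                         file_pre='nep.geos.4x5.2010')
print(fluxes.data_vars)                                        # inspect available flux fields
monthly = fluxes['CO2_SRCE_CO2bf'].resample(time='1M').mean()  # monthly-mean flux per grid point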
def get_mean_fluxes(directory_path, file_prefix, month, flux_field=FLUX_FIELD):
    """
    Obtain mean fluxes for month of interest -- can be used for truth and prior.

    Parameters:
        directory_path (str) : see generate_flux_filenames docstring
        file_prefix    (str) : see generate_flux_filenames docstring
        month          (int) : integer representation of month of interest
        flux_field     (str) : name of flux field in flux files

    Returns:
        dictionary with following keys (all numpy arrays) -
            - flux
            - latitude
            - longitude
            - time
    """
    assert isinstance(month, int)
    assert 0 < month < 13

    # get the flux files of interest
    flux_files = generate_flux_filenames(directory_path=directory_path,
                                         file_prefix=file_prefix)

    # read in the fluxes
    tracer_path = directory_path + 'tracerinfo.dat'
    diag_path = directory_path + 'diaginfo.dat'
    fluxes = xbpch.open_mfbpchdataset(flux_files,
                                      dask=True,
                                      tracerinfo_file=tracer_path,
                                      diaginfo_file=diag_path)

    # find the time indices of interest -- everything within the month
    # (timestamps in these files are decoded relative to the 1985 base year;
    # month 12 rolls over into the following year)
    month_start = np.datetime64('1985-%02i-01' % month)
    if month == 12:
        month_end = np.datetime64('1986-01-01')
    else:
        month_end = np.datetime64('1985-%02i-01' % (month + 1))
    time_idxs = np.where((fluxes.time.values >= month_start) &
                         (fluxes.time.values < month_end))[0]

    # filter the fluxes and find the mean
    month_fluxes = fluxes[flux_field].values[time_idxs, :, :].mean(axis=0)

    return {
        'flux': month_fluxes,
        'latitude': fluxes.lat.values,
        'longitude': fluxes.lon.values,
        'time': fluxes.time.values[time_idxs]
    }
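# --- Usage sketch (illustrative only): directory and prefix are hypothetical,
# and FLUX_FIELD / generate_flux_filenames are assumed to be defined at module level.
july = get_mean_fluxes(directory_path='/data/fluxes/2010/',
                       file_prefix='nep.geos.4x5.2010',
                       month=7)
print(july['flux'].shape)   # one mean value per grid cell, e.g. (72, 46) on the 4x5 grid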
def read_gc(fname, varname, cat='IJ-AVG-$',
            gc_dir='/short/m19/jaf574/GC.v11-01/runs.v11-02e/geosfp_025x03125_tropchem_au.base/',
            **kwargs):

    # Some species involve multiple GEOS-Chem species...
    varname_gc = gcname_to_names(varname)

    # Expand wildcard if necessary and link to directory
    if isinstance(fname, str):
        if '*' in fname:
            fname = glob(gc_dir + fname)
        else:
            fname = [gc_dir + fname]
    else:
        fname = [gc_dir + f for f in fname]

    # Put files in order!
    fname.sort()

    # Read using xbpch
    # one file
    if len(fname) == 1:
        ds = open_bpchdataset(fname[0], categories=[cat, ], fields=varname_gc,
                              diaginfo_file=gc_dir + 'diaginfo.dat',
                              tracerinfo_file=gc_dir + 'tracerinfo.dat', **kwargs)
    # multiple files
    else:
        ds = open_mfbpchdataset(fname, dask=True, categories=[cat, ], fields=varname_gc,
                                diaginfo_file=gc_dir + 'diaginfo.dat',
                                tracerinfo_file=gc_dir + 'tracerinfo.dat', **kwargs)

    # load dataset
    ds.load()

    # extract variables
    cat = cat.replace('$', 'S').replace('-', '_')
    dfg = ds[[cat + '_' + v for v in varname_gc]]

    # If needed, sum GEOS-Chem variables
    if len(varname_gc) > 1:
        dfg = sum_gc_vars(dfg, [cat + '_' + v for v in varname_gc],
                          varname=cat + '_' + varname)

    return dfg
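# --- Usage sketch (illustrative only): the wildcard below is hypothetical, and
# gcname_to_names / sum_gc_vars are assumed to be defined elsewhere in this module.
isop = read_gc('trac_avg.geosfp_025x03125_tropchem.*', 'ISOP')
print(isop)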
def read_daily_flux(flux_fp, flux_prefix, lb, ub, tracerfile_path, diagfile_path):
    """
    Reads in a sequence of daily fluxes.

    Parameters:
        flux_fp         (str) : file path to fluxes
        flux_prefix     (str) : prefix to each flux file of interest
        lb              (int) : inclusive lower bound of flux file number
        ub              (int) : inclusive upper bound of flux file number
        tracerfile_path (str) : path to tracerinfo.dat
        diagfile_path   (str) : path to diaginfo.dat

    Returns:
        xarray core dataset containing fluxes
    """
    assert isinstance(flux_fp, str)
    assert isinstance(flux_prefix, str)

    # create a list of zero-padded file suffixes, e.g. 001, 002, ...
    file_suffs = [f'{i:03}' for i in range(lb, ub + 1)]

    # create a list of flux filepaths
    flux_fps = [
        '%s/%s%s' % (flux_fp, flux_prefix, suff) for suff in file_suffs
    ]

    # check that the above files exist
    for flux_file in flux_fps:
        assert os.path.exists(flux_file)

    # read in the files
    fluxes = xbpch.open_mfbpchdataset(flux_fps,
                                      dask=True,
                                      tracerinfo_file=tracerfile_path,
                                      diaginfo_file=diagfile_path)

    return fluxes
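# --- Usage sketch (illustrative only): paths are hypothetical; reads the daily
# files nep.geos.4x5.001 through nep.geos.4x5.031.
jan_fluxes = read_daily_flux(flux_fp='/data/fluxes/2010',
                             flux_prefix='nep.geos.4x5.',
                             lb=1, ub=31,
                             tracerfile_path='/data/fluxes/2010/tracerinfo.dat',
                             diagfile_path='/data/fluxes/2010/diaginfo.dat')
print(jan_fluxes.time.size)   # for 3-hourly daily files, 31 days x 8 steps = 248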
# (fragment continues from a preceding check that each requested bpch file exists)
        print(" " + fn)
    sys.exit(1)

# Else, we should be good to read in and concatenate
open_kws = {
    "tracerinfo_file": args.tracerinfo,
    "diaginfo_file": args.diaginfo,
    "memmap": True,
    "dask": True
}

print("\nReading in file(s)...")
if len(args.bpch_files) == 1:
    ds = open_bpchdataset(args.bpch_files[0], **open_kws)
else:
    ds = open_mfbpchdataset(args.bpch_files, **open_kws)

# This block of code is a hack to fix the encoding of attributes
# on the DataArrays in this Dataset. They are being
# set at a very low level when we read in the data, and manually
# specifying the encoding doesn't work.
# However, deleting them from the attributes dict
# doesn't end up removing them from the final output file - they get
# written just fine.
print("\nDecoding variables...")
for v in ds.data_vars:
    da = ds[v]
    da = _maybe_del_attr(da, 'scale_factor')
    da = _maybe_del_attr(da, 'units')
    da = _maybe_decode_attr(da, 'hydrocarbon')
    da = _maybe_decode_attr(da, 'chemical')
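# The _maybe_del_attr / _maybe_decode_attr helpers are not shown in this
# fragment. A minimal sketch of what they might look like, assuming the goal is
# simply to drop or coerce problematic DataArray attributes before writing to netCDF:
def _maybe_del_attr(da, attr):
    """Delete an attribute from a DataArray if it is present."""
    if attr in da.attrs:
        del da.attrs[attr]
    return da


def _maybe_decode_attr(da, attr):
    """Coerce a boolean attribute to an int so it can be stored in netCDF."""
    if attr in da.attrs and isinstance(da.attrs[attr], bool):
        da.attrs[attr] = int(da.attrs[attr])
    return da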
def generate_seasonal_data(dir_path, start_year, file_base, parameter):
    """
    Creates a dictionary for each of the seasons --
        - DJF
        - MAM
        - JJA
        - SON

    Parameters:
        dir_path   (str) : path to directory containing files
        start_year (int) : starting year for the fluxes
        file_base  (str) : form of files of interest
        parameter  (str) : parameter of interest in the underlying binary
                           punch files

    Returns:
        dictionary with keys corresponding to each of the seasons above.
        Each value is an array of all the 3-hourly fluxes in that season.
        Also has keys for
            - time
            - latitude
            - longitude

    NOTE:
        - files are assumed to be of the form filebase + .###,
          e.g. nep.geos.4x5.2010.001
        - the tracerinfo and diaginfo files are assumed to be in dir_path
        - files are assumed to be split into 3-hour increments
    """
    START = datetime.now()

    # get all file names
    file_names = glob.glob(dir_path + file_base + '*')

    # read in the files
    fluxes = xbpch.open_mfbpchdataset(paths=file_names,
                                      dask=True,
                                      tracerinfo_file=dir_path + 'tracerinfo.dat',
                                      diaginfo_file=dir_path + 'diaginfo.dat')
    print('Read in fluxes from %s' % (dir_path + file_base))
    print('Elapsed time: %i seconds' % (datetime.now() - START).seconds)

    # generate 3-hourly flux dates spanning the given starting year
    start_date = np.datetime64('%s-01-01' % start_year)
    end_date = np.datetime64('%s-01-01' % (start_year + 1))
    flux_dates = []
    current_date = start_date

    while current_date < end_date:
        flux_dates.append(current_date)
        current_date += np.timedelta64(3, 'h')

    flux_dates = np.array(flux_dates)

    # create seasonal indices
    djf_indx, mam_indx, jja_indx, son_indx = create_season_indices(flux_dates)

    # create seasonal arrays
    djf_arr = fluxes[parameter].values[djf_indx, :, :]
    mam_arr = fluxes[parameter].values[mam_indx, :, :]
    jja_arr = fluxes[parameter].values[jja_indx, :, :]
    son_arr = fluxes[parameter].values[son_indx, :, :]

    # get latitude and longitude
    lat_arr = fluxes.lat.values
    lon_arr = fluxes.lon.values

    return {
        'time': flux_dates,
        'lat': lat_arr,
        'lon': lon_arr,
        'djf': djf_arr,
        'mam': mam_arr,
        'jja': jja_arr,
        'son': son_arr
    }
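# --- Usage sketch (illustrative only): the path, file base, and field name are
# hypothetical; create_season_indices is assumed to be defined elsewhere in this module.
seasons = generate_seasonal_data(dir_path='/data/fluxes/2010/',
                                 start_year=2010,
                                 file_base='nep.geos.4x5.2010',
                                 parameter='CO2_SRCE_CO2bf')
print(seasons['djf'].shape, seasons['jja'].shape)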
def bpch_to_netCDF(folder=None, filename='ctm.nc', bpch_file_list=None,
                   remake=False, filetype="*ctm.bpch*",
                   check4_trac_avg_if_no_ctm_bpch=True, backend='PyGChem',
                   verbose=False, **kwargs):
    """
    Converts GEOS-Chem ctm.bpch output file(s) to NetCDF

    Parameters
    ----------
    folder (str): working directory for data files
    filename (str): name to give created NetCDF
    bpch_file_list (list): list of files to convert
    remake (bool): overwrite existing NetCDF file
    filetype (str): string with wildcards to match filenames
    ( e.g. *ctm.bpch*, trac_avg.*, or *ts*bpch* )
    check4_trac_avg_if_no_ctm_bpch (bool): fall back to searching for
    *trac_avg* files if no files match filetype
    backend (str): library used to read the bpch files
    ('PyGChem', 'xbpch', 'iris', or 'PNC')
    verbose (bool): print (minor) logging to screen

    Returns
    -------
    (None) saves a NetCDF file to disk
    """
    import os
    from .bpch2netCDF import get_folder
    folder = get_folder(folder)
    output_file = os.path.join(folder, filename)

    # If the NetCDF file already exists, don't overwrite it without remake=True.
    if not remake:
        if os.path.exists(output_file):
            logging.warning(output_file + ' already exists. Not recreating.')
            return

    # Look for files if file list is not provided.
    if isinstance(bpch_file_list, type(None)):
        logging.debug("Searching for the following bpch filetype: {filetype}"
                      .format(filetype=filetype))
        bpch_files = glob.glob(folder + '/' + filetype)
        # Also check if directory contains *trac_avg* files, if no ctm.bpch
        if (len(bpch_files) == 0) and check4_trac_avg_if_no_ctm_bpch:
            filetype = '*trac_avg*'
            logging.info('WARNING! - now trying filetype={}'.format(filetype))
            bpch_files = glob.glob(folder + '/' + filetype)
        # Raise error if no files matching filetype
        if len(bpch_files) == 0:
            logging.error("No bpch files ({}) found in {}".format(filetype,
                                                                  folder))
            raise IOError("{} contains no bpch files.".format(folder))

    # Use the specified files.
    else:
        file_list = []
        for bpch_file in bpch_file_list:
            full_path = folder + '/' + bpch_file
            if not os.path.exists(full_path):
                logging.error(full_path + " could not be found")
                raise IOError("Full path could not be found")
            file_list.append(full_path)
        bpch_files = file_list

    # Open the bpch files
    logging.debug("The following bpch files were found (n={}):"
                  .format(len(bpch_files)))
    logging.debug(str(bpch_files))
    if verbose:
        print(("Creating a netCDF from {} file(s).".format(len(bpch_files)) +
               " This can take some time..."))

    if backend == 'PyGChem':
        # Load all the files into memory
        bpch_data = datasets.load(bpch_files)
        # Save the netCDF file
        datasets.save(bpch_data, output_file)
    elif backend == 'xbpch':
        import xbpch
        # Load all the files into memory (as xarray dataset object)
        ds = xbpch.open_mfbpchdataset(bpch_files)
        # save through xarray dataset object
        ds.to_netcdf(output_file, unlimited_dims={'time_counter': True})
    elif backend == 'iris':
        # iris.fileformats.netcdf.save(data, output_file)
        print('WARNING NetCDF made by iris is non CF-compliant')
    elif backend == 'PNC':
        import PseudoNetCDF as pnc
        import xarray as xr
        if len(bpch_files) == 1:
            bpch_to_netCDF_via_PNC(filename=filename,
                                   output_file=output_file,
                                   bpch_file=bpch_files[0])
        # Individually convert bpch files if more than one file
        if len(bpch_files) > 1:
            for n_bpch_file, bpch_file in enumerate(bpch_files):
                bpch_to_netCDF_via_PNC(filename=filename,
                                       output_file='TEMP_{}_'.format(
                                           n_bpch_file) + filename,
                                       bpch_file=bpch_file)
            # - Combine the NetCDF files with xarray
            TEMP_ncfiles = glob.glob(folder + 'TEMP_*_' + filename)
            # Open files with xarray
            ds_l = [xr.open_dataset(i) for i in TEMP_ncfiles]
            # Make sure the time dimension is unlimited
            ds = xr.concat(ds_l, dim='time')
            # Now save the combined file
            ds.to_netcdf(folder + filename,
                         unlimited_dims={'time_counter': True})
            # Remove the temporary files
            for TEMP_ncfile in TEMP_ncfiles:
                os.remove(TEMP_ncfile)

    logging.info("A netCDF file has been created with the name {ctm}"
                 .format(ctm=output_file))
    return
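# --- Usage sketch (illustrative only): the run directory is hypothetical;
# converts every *ctm.bpch* file found there into a single ctm.nc using the
# xbpch backend.
bpch_to_netCDF(folder='/scratch/geos_runs/geosfp_4x5_standard',
               filename='ctm.nc',
               backend='xbpch',
               verbose=True)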
import xbpch
from dask.diagnostics import ProgressBar
from os.path import join

# First we need to read in some data. We'll read a multi-file ND49 BPCH
# dataset using the xbpch package.
dates = ["200601{:02d}".format(d) for d in range(1, 22)]
ROOT = "/Users/daniel/workspace/bpch/test_data/"
fns = [
    join(ROOT, "ND49_{}_ref_e2006_m2010.bpch".format(date))
    for date in dates
]

nd49_data = xbpch.open_mfbpchdataset(
    fns,
    diaginfo_file="/Users/daniel/Desktop/sample_nd49/diaginfo.dat",
    tracerinfo_file="/Users/daniel/Desktop/sample_nd49/tracerinfo.dat",
    dask=True, memmap=True,
)

o3_data = nd49_data['IJ_AVG_S_O3']

with ProgressBar():
    print("Loading data into memory")
    o3_data.load()

# Second, we compute the 8-hour rolling averages for the ozone.
avg_8hr_o3 = (o3_data.rolling(time=8, min_periods=6).mean())

# By default, this takes the last timestamp in a rolling interval; i.e. the
# timestamps correspond to the preceding 8 hours. We want them to refer to
# the proceeding 8 hours, so we can adjust them using datetime arithmetic
times_np = avg_8hr_o3.time.values
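# A plausible continuation (not part of the original snippet): shift each
# rolling-average label back 8 hours so it marks the start of its window, then
# take the daily maximum to obtain MDA8 ozone. Assumes pandas is available.
import pandas as pd

times_pd = pd.to_datetime(times_np) - pd.Timedelta('8h')
avg_8hr_o3 = avg_8hr_o3.assign_coords(time=times_pd)
mda8_o3 = avg_8hr_o3.resample(time='D').max()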
def generate_txt_files(
        bpch_files, output_dir, tracer_path, diag_path,
        co2_var_nm='CO2_SRCE_CO2bf', dims=(8, 72, 46)
):
    """
    Creates one txt file for each binary punch file path provided in
    bpch_files. The expected dimension of each day's flux file is shown in
    the "dims" variable. When flattening arrays, the indices move fastest on
    the right side, so latitude moves the fastest, followed by longitude,
    followed by time.

    e.g.
        input  - [nep.geos.4x5.001, nep.geos.4x5.002] <- bpch files
        output - [nep.geos.4x5.001, nep.geos.4x5.002] <- txt files

    Parameters:
        bpch_files  (list)  : an ordered sequential collection of daily bpch files
        output_dir  (str)   : output directory for txt files
        tracer_path (str)   : path to tracer file
        diag_path   (str)   : path to diag file
        co2_var_nm  (str)   : name of co2 variable of interest
        dims        (tuple) : time/lon/lat array size tuple

    Returns:
        None - writes one txt file per input file to output_dir
    """
    # read in the binary punch files
    bpch_data = xbpch.open_mfbpchdataset(
        bpch_files,
        dask=True,
        tracerinfo_file=tracer_path,
        diaginfo_file=diag_path
    )

    # extract the array from the above
    bpch_arr = bpch_data[co2_var_nm].values

    # create new output file names
    output_file_nms = [
        output_dir + '/' + fp.split('/')[-1] for fp in bpch_files
    ]

    # create time indices to extract each day
    time_idxs = np.arange(
        0, dims[0] * len(output_file_nms)
    ).reshape(len(output_file_nms), dims[0])

    # for each output file name, generate a new text file
    for time_count, output_file_nm in enumerate(output_file_nms):

        # find the time indices for this file
        time_idx = time_idxs[time_count, :]

        # create a flattened version of the above data with the time filter
        data_arr = bpch_arr[time_idx, :, :]
        assert data_arr.shape == dims
        data_flat = data_arr.flatten()

        # write to file
        np.savetxt(fname=output_file_nm, X=data_flat)
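# --- Usage sketch (illustrative only): file paths are hypothetical; writes one
# flattened-flux text file per input bpch file into output_dir.
generate_txt_files(
    bpch_files=['/data/fluxes/nep.geos.4x5.001', '/data/fluxes/nep.geos.4x5.002'],
    output_dir='/data/fluxes_txt',
    tracer_path='/data/fluxes/tracerinfo.dat',
    diag_path='/data/fluxes/diaginfo.dat'
)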
def generate_nc_files(
        bpch_files, output_dir, tracer_path, diag_path,
        co2_var_nm='CO2_SRCE_CO2bf', dims=(72, 46, 8)
):
    """
    Creates one netcdf file for each binary punch file path provided in
    bpch_files.

    e.g.
        input  - [nep.geos.4x5.001, nep.geos.4x5.002] <- bpch files
        output - [nep.geos.4x5.001, nep.geos.4x5.002] <- netcdf files

    Parameters:
        bpch_files  (list)  : an ordered sequential collection of daily bpch files
        output_dir  (str)   : output directory for netcdf files
        tracer_path (str)   : path to tracer file
        diag_path   (str)   : path to diag file
        co2_var_nm  (str)   : name of co2 variable of interest
        dims        (tuple) : lon/lat/time array size tuple

    Returns:
        None - writes one netcdf file per input file to output_dir
    """
    # read in the binary punch files
    bpch_data = xbpch.open_mfbpchdataset(
        bpch_files,
        dask=True,
        tracerinfo_file=tracer_path,
        diaginfo_file=diag_path
    )

    # create new output file names
    output_file_nms = [
        output_dir + '/' + fp.split('/')[-1] for fp in bpch_files
    ]

    # extract non-time dependent info from first bpch file
    lon = bpch_data.variables['lon'].values
    lat = bpch_data.variables['lat'].values
    time = bpch_data.variables['time'].values
    co2_arr = bpch_data.variables[co2_var_nm].values

    # create time indices to extract each day
    time_idxs = np.arange(
        0, dims[2] * len(output_file_nms)
    ).reshape(len(output_file_nms), dims[2])

    # create netcdf files
    for time_count, file_nm in enumerate(output_file_nms):

        # find the time indices for this file
        time_idx = time_idxs[time_count, :]

        # create netcdf file with time_count index co2 values
        create_netcdf_flux_file(
            write_loc=file_nm,
            lon=lon,
            lat=lat,
            time=time[time_idx],
            co2_vals=co2_arr[time_idx, :, :],
            co2_field_nm=co2_var_nm
        )
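# --- Usage sketch (illustrative only): file paths are hypothetical;
# create_netcdf_flux_file is assumed to be defined elsewhere in this module.
generate_nc_files(
    bpch_files=['/data/fluxes/nep.geos.4x5.001', '/data/fluxes/nep.geos.4x5.002'],
    output_dir='/data/fluxes_nc',
    tracer_path='/data/fluxes/tracerinfo.dat',
    diag_path='/data/fluxes/diaginfo.dat'
)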
def run(bpch_use, true_flux_dir, prior_flux_dir, true_flux_prefix,
        prior_flux_prefix, lat_lon_dir, tracerinfo_path, diaginfo_path,
        output_dir, varname_oi='CO2_SRCE_CO2bf', TOL=0.000001):
    """
    Run the steps required to make the scaled fluxes and write them to disk.
    At the moment, this function scales the prior down to have the same
    global flux as the truth.

    Parameters:
        bpch_use          (bool)  : switch to indicate that bpch files are flux input
        true_flux_dir     (str)   : directory location of true fluxes
        prior_flux_dir    (str)   : directory location of prior fluxes
        true_flux_prefix  (str)   : e.g. 'nep.geos.4x5.2010.'
        prior_flux_prefix (str)   : e.g. 'nep.geos.4x5.'
        lat_lon_dir       (str)   : directory where lat/lon arrays can be found;
                                    used when bpch_use == False
        tracerinfo_path   (str)   : location of tracerinfo file for reading bpch
        diaginfo_path     (str)   : location of diaginfo file for reading bpch
        output_dir        (str)   : directory of txt output files
        varname_oi        (str)   : variable to extract from the bpch objects
        TOL               (float) : tolerance of new integrated flux

    Returns:
        writes daily flux txt files to output_dir
    """
    # find the sorted file names
    prior_files = sorted(glob(prior_flux_dir + '/' + prior_flux_prefix + '*'),
                         key=lambda x: int(x[-3:]))
    true_files = sorted(glob(true_flux_dir + '/' + true_flux_prefix + '*'),
                        key=lambda x: int(x[-3:]))

    if bpch_use:

        # read in the fluxes
        prior_data = xbpch.open_mfbpchdataset(prior_files,
                                              dask=True,
                                              tracerinfo_file=tracerinfo_path,
                                              diaginfo_file=diaginfo_path)
        print('Prior fluxes acquired')

        true_data = xbpch.open_mfbpchdataset(true_files,
                                             dask=True,
                                             tracerinfo_file=tracerinfo_path,
                                             diaginfo_file=diaginfo_path)
        print('True fluxes acquired')

        # extract flux arrays from the xbpch objects
        prior_arr = prior_data.variables[varname_oi].values
        true_arr = true_data.variables[varname_oi].values

        # get longitude/latitude arrays
        lons = prior_data.variables['lon'].values
        lats = prior_data.variables['lat'].values

    else:

        # read in the fluxes
        prior_arr = cio.read_flux_txt_files(flux_files=prior_files)
        true_arr = cio.read_flux_txt_files(flux_files=true_files)

        # read in lat/lon
        lons = np.load(lat_lon_dir + '/lon.npy')
        lats = np.load(lat_lon_dir + '/lat.npy')

    print('=== Flux array dimensions ===')
    print('Prior : %s' % str(prior_arr.shape))
    print('Truth : %s' % str(true_arr.shape))
    print('Lon   : %s' % str(lons.shape))
    print('Lat   : %s' % str(lats.shape))

    # determine global integral of prior and truth
    prior_global_flux = ccomp.compute_global_flux(flux_arr=prior_arr,
                                                  lons=lons,
                                                  lats=lats)
    true_global_flux = ccomp.compute_global_flux(flux_arr=true_arr,
                                                 lons=lons,
                                                 lats=lats)

    # find the scalar multiplier
    scl_mult = true_global_flux / prior_global_flux
    print('Scalar multiplier : %.5f' % scl_mult)

    # scale the prior
    prior_arr_scl = scl_mult * prior_arr

    # compute new integrated flux
    prior_global_flux_scl = ccomp.compute_global_flux(flux_arr=prior_arr_scl,
                                                      lons=lons,
                                                      lats=lats)
    scl_mult_updated = true_global_flux / prior_global_flux_scl

    if np.abs(scl_mult_updated - 1) > TOL:
        print('Normalized flux exceeds tolerance: TOL = %.10f'
              % np.abs(scl_mult_updated - 1))
        exit()

    # write the new flux array to directory
    cio.generate_txt_files_np(flux_arr=prior_arr_scl,
                              bpch_files=prior_files,
                              output_dir=output_dir)
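# --- Usage sketch (illustrative only): every path below is hypothetical; the
# cio/ccomp helper modules are assumed to be imported elsewhere in this script.
run(bpch_use=True,
    true_flux_dir='/data/truth', prior_flux_dir='/data/prior',
    true_flux_prefix='nep.geos.4x5.2010.', prior_flux_prefix='nep.geos.4x5.',
    lat_lon_dir='/data/latlon',
    tracerinfo_path='/data/tracerinfo.dat',
    diaginfo_path='/data/diaginfo.dat',
    output_dir='/data/scaled_fluxes')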
def read_bpch(path, keys):
    '''
    Read generic bpch file into dictionary
    keys = keys you want to read
    '''
    paths = path
    if __VERBOSE__:
        print('GC_fio.read_bpch called on paths:')
        print(path)
    multi = False
    if isinstance(path, list):
        path = path[0]
        if len(paths) > 1:
            multi = True
    if '*' in path:
        multi = True

    # make sure coordinates are in keys list
    keys = list(set(keys + GC_coords))  # set removes any duplicates

    # assume tracerinfo and diaginfo in same folder:
    # otherwise use my generic one with coalesced info
    splt = path.split('/')
    splt[-1] = 'tracerinfo.dat'
    tracinf = '/'.join(splt)
    if not os.path.isfile(tracinf):
        tracinf = 'Data/GC_Output/tracerinfo.dat'

    splt[-1] = 'diaginfo.dat'
    diaginf = '/'.join(splt)
    if not os.path.isfile(diaginf):
        diaginf = 'Data/GC_Output/diaginfo.dat'

    # Improve read performance by only reading requested fields:
    fields = set()
    categories = set()
    for key in keys:
        if '_' in key:
            # Split key on the last underscore: Category_Field
            c, _, f = key.rpartition('_')
            categories.add(c)
            fields.add(f)
        else:
            fields.add(key)
    if __VERBOSE__:
        print("categories: ", categories)
        print("fields: ", fields)

    # get bpch file:
    data = {}
    attrs = {}
    bpchargs = {'fields': list(fields), 'categories': list(categories),
                'tracerinfo_file': tracinf, 'diaginfo_file': diaginf,
                'decode_cf': False, 'dask': True}
    mod_times = []
    if multi:
        ds = open_mfbpchdataset(paths, **bpchargs)
        for p in paths:
            mod_times.append(time.ctime(os.path.getmtime(p)))
    else:
        ds = open_bpchdataset(path, **bpchargs)
        mod_times = [time.ctime(os.path.getmtime(path))]

    data, attrs = dataset_to_dicts(ds, keys)
    data['modification_times'] = np.array(mod_times)
    attrs['modification_times'] = {'desc': 'When was file last modified'}

    return data, attrs
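# --- Usage sketch (illustrative only): the path and key are hypothetical, and
# the exact key format (category_field) depends on GC_coords / dataset_to_dicts,
# which are defined elsewhere in this module.
data, attrs = read_bpch('Data/GC_Output/trac_avg.20050101.bpch',
                        keys=['IJ-AVG-$_O3'])
print(data.keys())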