def update_time_in_NetCDF2save(ds, convert_time2dt=False):
    """
    Update time of monthly output to be in a NetCDF-saveable format

    Parameters
    -------
    ds (xr.Dataset): Dataset with a monthly 'time' coordinate to update
    convert_time2dt (bool): convert the time into a datetime.datetime format

    Returns
    -------
    (xr.Dataset)
    """
    # Climate model time
    sdate = datetime.datetime(1985, 1, 1)
    # Convert / setup time dim?
    if convert_time2dt:
        months = np.arange(1, 13)
        ds['time'] = [AC.add_months(sdate, i - 1) for i in months]
    # Update to hours since X
    hours = [(AC.dt64_2_dt([i])[0] - sdate).days * 24.
             for i in ds['time'].values]
    ds['time'] = hours
    attrs_dict = {'units': 'hours since 1985-01-01 00:00:00'}
    ds['time'].attrs = attrs_dict
    return ds
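
# Example usage (an illustrative sketch, not part of the pipeline): build a
# minimal monthly dataset and convert its time coordinate to the CF-style
# 'hours since 1985-01-01' values used when saving to NetCDF. Assumes the
# module-level imports (np, xr) and AC (AC_tools) used elsewhere in this file.
def _example_update_time_in_NetCDF2save():
    times = np.array(['1985-{:02d}-01'.format(i) for i in range(1, 13)],
                     dtype='datetime64[ns]')
    ds = xr.Dataset(data_vars={'var': (['time'], np.zeros(12))},
                    coords={'time': times})
    ds = update_time_in_NetCDF2save(ds, convert_time2dt=False)
    print(ds['time'].values[:3])  # [0., 744., 1416.] (hours since 1985-01-01)
    print(ds['time'].attrs)       # {'units': 'hours since 1985-01-01 00:00:00'}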
def plot_up_df_data_by_yr(df=None, Datetime_var='datetime', TimeWindow=5,
                          start_from_last_obs=False,
                          drop_bins_without_data=True, target='Iodide',
                          dpi=320, verbose=False):
    """
    Plot up # of obs. data (Y) binned by region against year (X)

    Parameters
    -------
    df (pd.DataFrame): DataFrame of data including a datetime variable
    Datetime_var (str): name of the datetime variable in df
    target (str): name of the target variable (e.g. iodide)
    TimeWindow (int): number of years to bin observations over
    start_from_last_obs (bool): start from the last observational date
    drop_bins_without_data (bool): exclude bins with no data from plotting
    dpi (int): resolution of figure (dots per square inch)
    verbose (bool): print out verbose output?

    Returns
    -------
    (None)
    """
    # Sort the dataframe by date
    df.sort_values(by=Datetime_var, inplace=True)
    # Get the minimum and maximum dates
    min_date = df[Datetime_var].min()
    max_date = df[Datetime_var].max()
    # How many years of data are there?
    yrs_of_data = (max_date - min_date).total_seconds() / 60 / 60 / 24 / 365
    nbins = AC.myround(yrs_of_data / TimeWindow, base=1)
    # Start from the last observation or from the last block of time
    sdate_block = AC.myround(max_date.year, 5)
    sdate_block = datetime.datetime(sdate_block, 1, 1)
    # Make sure the dates used are datetimes
    min_date, max_date = pd.to_datetime([min_date, max_date]).values
    min_date, max_date = AC.dt64_2_dt([min_date, max_date])
    # Calculate the number of points for each bin by region
    dfs = {}
    for nbin in range(nbins + 2):
        # Start from the last observation or from the last block of time?
        days2rm = int(nbin * 365 * TimeWindow)
        if start_from_last_obs:
            bin_start = AC.add_days(max_date,
                                    -int(days2rm + (365 * TimeWindow)))
            bin_end = AC.add_days(max_date, -days2rm)
        else:
            bin_start = AC.add_days(sdate_block,
                                    -int(days2rm + (365 * TimeWindow)))
            bin_end = AC.add_days(sdate_block, -days2rm)
        # Select the data within the observational dates
        bool1 = df[Datetime_var] > bin_start
        bool2 = df[Datetime_var] <= bin_end
        df_tmp = df.loc[bool1 & bool2, :]
        # Print the number of values in regions for bin
        if verbose:
            print(bin_start, bin_end, df_tmp.shape)
        # String to save data with
        if start_from_last_obs:
            bin_start_str = bin_start.strftime('%Y/%m/%d')
            bin_end_str = bin_end.strftime('%Y/%m/%d')
        else:
            bin_start_str = bin_start.strftime('%Y')
            bin_end_str = bin_end.strftime('%Y')
        str2use = '{}-{}'.format(bin_start_str, bin_end_str)
        # Sum up the number of values by region
        dfs[str2use] = df_tmp['ocean'].value_counts(dropna=False)
    # Combine to a single dataframe and sort by date
    dfA = pd.DataFrame(dfs)
    dfA = dfA[list(sorted(dfA.columns))]
    # Drop the years without any data
    if drop_bins_without_data:
        dfA = dfA.T.dropna(how='all').T
    # Update index names
    dfA = dfA.T
    rename_cols = {
        np.nan: 'Other',
        'INDIAN OCEAN': 'Indian Ocean',
        'SOUTHERN OCEAN': 'Southern Ocean',
    }
    dfA = dfA.rename(columns=rename_cols)
    dfA = dfA.T
    # Plot up as a stacked bar plot
    import seaborn as sns
    sns.set()
    dfA.T.plot(kind='bar', stacked=True)
    # Add title etc
    plt.ylabel('# of observations')
    plt.title('{} obs. data by region'.format(target))
    # Save plotted figure
    savename = 's2s_{}_data_by_year_region.png'.format(target)
    plt.savefig(savename, dpi=dpi, bbox_inches='tight', pad_inches=0.05)
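
# Example usage (a hypothetical sketch): plot binned observation counts by
# region for a small synthetic set of observations. The 'ocean' column is
# assumed to hold the region names that the function counts by; the dates
# and regions below are placeholders.
def _example_plot_up_df_data_by_yr():
    dates = pd.date_range('1990-01-01', '2015-12-31', freq='180D')
    regions = ['INDIAN OCEAN' if i % 2 else 'SOUTHERN OCEAN'
               for i in range(len(dates))]
    df = pd.DataFrame({'datetime': dates, 'ocean': regions})
    plot_up_df_data_by_yr(df=df, target='Iodide', TimeWindow=5, verbose=True)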
def mk_NetCDF_from_productivity_data():
    """
    Convert productivity .csv file (Behrenfeld and Falkowski, 1997) into a
    NetCDF file
    """
    # Location of data (update to use public facing host)
    folder = utils.get_file_locations('data_root') + '/Productivity/'
    # Which file to use?
    filename = 'productivity_behrenfeld_and_falkowski_1997_extrapolated.csv'
    # Setup coordinates
    lon = np.arange(-180, 180, 1/6.)
    lat = np.arange(-90, 90, 1/6.)
    lat = np.append(lat, [90])
    # Setup time
    varname = 'vgpm'
    months = np.arange(1, 13)
    # Extract data
    df = pd.read_csv(folder+filename, header=None)
    print(df.shape)
    # Extract data by month
    da_l = []
    for n in range(12):
        # Assume the data is in blocks by longitude?
        arr = df.values[:, n*1081: (n+1)*1081].T[None, ...]
        print(arr.shape)
        da_l += [xr.Dataset(
            data_vars={varname: (['time', 'lat', 'lon', ], arr)},
            coords={'lat': lat, 'lon': lon, 'time': [n]})]
    # Concatenate into a single xr.Dataset
    ds = xr.concat(da_l, dim='time')
    # Update time to the climate model time
    sdate = datetime.datetime(1985, 1, 1)
    ds['time'] = [AC.add_months(sdate, i-1) for i in months]
    # Update to hours since X
    hours = [(AC.dt64_2_dt([i])[0] - sdate).days * 24.
             for i in ds['time'].values]
    ds['time'] = hours
    # Add units
    attrs_dict = {'units': 'hours since 1985-01-01 00:00:00'}
    ds['time'].attrs = attrs_dict
    # Add attributes for variable
    attrs_dict = {
        'long_name': "net primary production",
        'units': "mg C / m**2 / day",
    }
    ds[varname].attrs = attrs_dict
    # For latitude...
    attrs_dict = {
        'long_name': "latitude",
        'units': "degrees_north",
        "standard_name": "latitude",
        "axis": "Y",
    }
    ds['lat'].attrs = attrs_dict
    # And longitude...
    attrs_dict = {
        'long_name': "longitude",
        'units': "degrees_east",
        "standard_name": "longitude",
        "axis": "X",
    }
    ds['lon'].attrs = attrs_dict
    # Add extra global attributes
    global_attribute_dictionary = {
        'Title': 'Sea-surface productivity (Behrenfeld and Falkowski, 1997)',
        'Author': 'Tomas Sherwen ([email protected])',
        'Notes': "Data extracted from OCRA and extrapolated to the poles by Martin Wadley. NetCDF constructed using xarray (xarray.pydata.org) by Tomas Sherwen. \n NOTES from the original site (http://orca.science.oregonstate.edu/): 'based on the standard vgpm algorithm. npp is based on the standard vgpm, using modis chl, sst4, and par as input; clouds have been filled in the input data using our own gap-filling software. For citation, please reference the original vgpm paper by Behrenfeld and Falkowski, 1997a as well as the Ocean Productivity site for the data.'",
        'History': 'Last Modified on:' + strftime("%B %d %Y", gmtime()),
        'Conventions': "COARDS",
    }
    ds.attrs = global_attribute_dictionary
    # Save to NetCDF
    filename = 'productivity_behrenfeld_and_falkowski_1997_extrapolated.nc'
    ds.to_netcdf(filename, unlimited_dims={'time': True})
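
# Quick sanity check (an illustrative sketch): re-open the NetCDF produced by
# mk_NetCDF_from_productivity_data and confirm the grid shape and time units.
# The filename matches the one hard-coded in the function above; the expected
# shape follows from the 1/6-degree grid set up there (1081 lats, 2160 lons).
def _example_check_productivity_NetCDF():
    filename = 'productivity_behrenfeld_and_falkowski_1997_extrapolated.nc'
    ds = xr.open_dataset(filename, decode_times=False)
    assert ds['vgpm'].shape == (12, 1081, 2160)  # (time, lat, lon)
    print(ds['time'].attrs['units'])  # 'hours since 1985-01-01 00:00:00'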
def combine_output2_1_file_per_day(runs2use, run_dict=None, res='4x5',
                                   version="0.1.1"):
    """
    Combine the PREFIA African AQ files into daily files for the PREFIA
    inter-comparison
    """
    # Format of the outputted file
    FileStr = 'PREFIA_York_GEOSChem_{}_{}_{}_v{}'
    # File prefixes to use
    prefixes = [
        'HEMCO_diagnostics',
    ]
    GCprefixs = [
        'StateMet', 'LevelEdgeDiags', 'SpeciesConc', 'WetLossLS', 'Aerosols',
        'ConcAfterChem', 'DryDep', 'WetLossConv',
    ]
    prefixes += ['GEOSChem.' + i for i in GCprefixs]
    REFprefix = 'GEOSChem.StateMet'
    # Set ordering of prefixes
    # NOTE: use StateMet as the base as it has all of the extra dimensions
    prefixes = [REFprefix] + [i for i in prefixes if i != REFprefix]
    # Loop by runs
    if isinstance(runs2use, type(None)):
        runs2use = run_dict.keys()
    for run in runs2use:
        folder = run_dict[run]
        print(run, folder)
        # Loop by prefix
        files4prefix = {}
        for prefix in prefixes:
            print(prefix)
            files4prefix[prefix] = get_files_in_folder4prefix(folder=folder,
                                                              prefix=prefix)
        # Check the number of files for each prefix
        N_files = [len(files4prefix[i]) for i in files4prefix.keys()]
        if len(set(N_files)) != 1:
            print('WARNING: Different numbers of files for prefixes')
        # Get a list of dates
        dates = get_files_in_folder4prefix(folder=folder, prefix=REFprefix,
                                           rtn_dates4files=True)
        for day in dates:
            # Check all the files are present and that there is one of them
#            day_str = day.strftime(format='%Y%m%d_%H%M')
            day_str = day.strftime(format='%Y%m%d')
            print(day_str)
            # Loop to get file names by prefix
            files2combine = []
            for prefix in prefixes:
                files = [i for i in files4prefix[prefix] if day_str in i]
                if len(files) != 1:
                    print('WARNING: did not find exactly one file for prefix')
                    print(day_str, prefix, files)
#                files2combine += [ xr.open_dataset( file[0] ) ]
                files2combine += [files]
            # Remove the doubled up data variables
            vars2rm = ['AREA', 'P0', 'hybm', 'hyam', 'hyai', 'hybi']
            # Open the first fileset as a single xarray dataset
            ds = xr.open_mfdataset(files2combine[0])
            # Now add the other filesets to this
            for files in files2combine[1:]:
                print(files)
                dsNew = xr.open_mfdataset(files)
                # Make sure indices for levels are coordinate variables
                var2add = 'ilev'
                if var2add not in dsNew.coords:
                    print('Adding ilev to dataset')
                    if var2add not in dsNew.data_vars:
                        dsNew[var2add] = ds[var2add].values
                        dsNew = dsNew.set_coords(var2add)
                    dsNew = dsNew.assign_coords(ilev=ds[var2add].copy())
                var2add = 'lev'
                if var2add not in dsNew.coords:
                    print('Adding lev to dataset')
                    if var2add not in dsNew.data_vars:
                        dsNew[var2add] = ds[var2add].values
                        dsNew = dsNew.set_coords(var2add)
                    dsNew = dsNew.assign_coords(lev=ds[var2add])
                # Update the timestamp for the HEMCO files
                if all(['HEMCO_diagnostics' in i for i in files]):
                    dt = [AC.add_hrs(i, 0.5)
                          for i in AC.dt64_2_dt(dsNew.time.values)]
                    dsNew['time'] = dt
                # Combine the new data into a core file
                vars2add = [i for i in dsNew.data_vars if i not in vars2rm]
                ds = xr.merge([ds, dsNew[vars2add]])
                del dsNew
            # Remove OH and HO2 from the SpeciesConc diagnostic
            ds = remove_OH_HO2_from_SpeciesConc(ds=ds)
            # Remove any unneeded spatial dimensions
            ds = remove_unneeded_vertical_dimensions(ds=ds)
            # Update any units to those requested for PREFIA...
            ds = convert_IC_ds_units(ds=ds)  # NOTE: units are lost here
            # Add combined variables
            ds = add_combined_vars(ds=ds)
            # Only include requested parameters
            ds = only_inc_requested_vars(ds=ds)
            # Change any names
            ds = update_names_in_ds(ds=ds)
            # Update extents to be over Africa
            # (the south-western cell centre is placed at (19.9W, 39.9S) and
            # the north-eastern cell centre at (54.9E, 39.9N))
            ds = only_consider_domain_over_Africa(ds)
            # Update the global attributes
            ds = add_global_attributes4run(ds, run=run)
            # Remove all the unneeded coordinates
            ds = ds.squeeze()
            # Now save the file for the given day
            filename = FileStr.format(res, run, day_str, version)
            ds.to_netcdf('{}/{}.nc'.format(folder, filename))
            # Remove the used files from memory
            del ds
            gc.collect()
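
# Example usage (a hypothetical sketch): combine the per-diagnostic GEOS-Chem
# output held in each run's output directory into a single NetCDF file per
# day. The run name and path below are placeholders for real run locations.
def _example_combine_output2_1_file_per_day():
    run_dict = {
        'BASE': '/path/to/GEOSChem/runs/BASE/OutputDir/',
    }
    combine_output2_1_file_per_day(runs2use=None, run_dict=run_dict,
                                   res='4x5', version='0.1.1')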