def get_MASTER_Chance2014_iodide_obs_file(sheetname='S>30 data set',
                                          skiprows=1,
                                          file_and_path='./sparse2spatial.rc',):
    """
    Extract the details from Chance2014's master spreadsheet, in order to
    check the correlations against the newly extracted climatological values.

    Parameters
    -------
    sheetname (str): name of the excel sheet to use
    skiprows (int): number of rows to skip when reading sheet
    file_and_path (str): folder and filename with location settings as single str

    Returns
    -------
    (pd.DataFrame)
    """
    # Location and filename?
    filename = 'Iodide_correlations_310114_MASTER_TMS_EDIT.xlsx'
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += 'Iodide/inputs/RJC_spreadsheets/'
    # Extract MASTER excel spreadsheet from Chance2014
    df = pd.read_excel(folder + filename, sheet_name=sheetname,
                       skiprows=skiprows)
    return df
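
# Example usage (a sketch; assumes a local 'sparse2spatial.rc' file that
# points 'data_root' at a directory containing Iodide/inputs/RJC_spreadsheets/):
# df = get_MASTER_Chance2014_iodide_obs_file()
# print(df.head())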
def Convert_DOC_prod_file_into_Standard_NetCDF():
    """
    Convert Saeed Roshan's DOC file into a CF-compliant format
    """
    # - Convert the surface DOC file into a monthly average file
    # Directory?
    folder = utils.get_file_locations('data_root') + '/DOC/'
    # Filename as a string
    file_str = 'DOC_Accum_rate_SR.nc'
    # Open dataset
    ds = xr.open_dataset(folder + file_str)
    # - Force use of coordinate variables in netCDF
    ds['latitude'] = ds['lat'][0, :].values
    ds['latitude'].attrs = ds['lat'].attrs
    ds['longitude'] = ds['lon'][:, 0].values
    ds['longitude'].attrs = ds['lon'].attrs
    # - Rename dimensions
    dims_dict = {'latitude': 'lat', 'longitude': 'lon'}
    # - Only keep the variables of interest
    var2keep = [u'DOCaccum_avg', u'DOCaccum_std', ]
    var2keep += list(dims_dict.keys())
    ds = ds.drop(labels=[i for i in ds.variables if i not in var2keep])
    ds = ds.rename(dims_dict)
    # - Add history to attributes
    d = ds.attrs
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    hst_str = 'File structure/variables updated to CF by TMS ({}) on {}'
    d['History'] = hst_str.format('University of York', date)
    d['Originating author'] = 'SR - Saeed Roshan ([email protected])'
    d['Editing author'] = 'TMS - ([email protected])'
    d['Citation'] = 'doi.org/10.1038/s41467-017-02227-3'
    ds.attrs = d
    # - Save the new NetCDF file
    newfile_str = file_str.split('.nc')[0] + '_TMS_EDIT.nc'
    ds.to_netcdf(folder + newfile_str)
def get_processed_df_obs_mod(reprocess_params=False,
                             target='CH2Br2',
                             filename='s2s_CH2Br2_obs_ancillaries.csv',
                             rm_Skagerrak_data=False,
                             file_and_path='./sparse2spatial.rc',
                             verbose=True, debug=False):
    """
    Get the processed observation and model output

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    filename (str): name of the csv file of processed observational data
    file_and_path (str): folder and filename with location settings as single str
    reprocess_params (bool): re-process the parameterisations (unused here)
    rm_Skagerrak_data (bool): remove the data from the Skagerrak region (unused here)
    verbose (bool): print verbose statements
    debug (bool): print debug statements

    Returns
    -------
    (pd.DataFrame)
    """
    # Read in processed csv file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries.csv'.format(target)
    df = pd.read_csv(folder + filename, encoding='utf-8')
    # Add SST in Kelvin too
    if 'WOA_TEMP_K' not in df.columns:
        df['WOA_TEMP_K'] = df['WOA_TEMP'].values + 273.15
    return df
def get_iodide_data_from_BODC(file_and_path='./sparse2spatial.rc',
                              filename='Global_Iodide_obs_surface.csv'):
    """
    Get the latest iodide data from the .csv file archived with BODC

    Parameters
    -------
    file_and_path (str): folder and filename with location settings as single str
    filename (str): name of the csv file of archived data from BODC

    Returns
    -------
    (pd.DataFrame)
    """
    # Print instructions on how to manually download the data
    prt_str = 'WARNING: automated download from BODC not yet setup \n'
    prt_str += 'Please manually download the latest data from BODC \n'
    prt_str += '*.csv file available from https://doi.org/10/czhx \n'
    print(prt_str)
    # Location of data
    folder = utils.get_file_locations('s2s_root', file_and_path=file_and_path)
    folder += '/Iodide/inputs/'
    # Open .csv file and return
    df = pd.read_csv(folder + filename)
    return df
def get_processed_df_obs_mod(target='example',
                             file_and_path='./sparse2spatial.rc'):
    """
    Get the processed observation and model output

    Parameters
    -------
    target (str), Name of the target variable (e.g. iodide)
    file_and_path (str), folder and filename with location settings as single str

    Returns
    -------
    (pd.DataFrame)
    """
    # Read in processed csv file of observations and ancillaries
    folder = utils.get_file_locations('s2s_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries.csv'.format(target)
    df = pd.read_csv(folder + filename, encoding='utf-8')
    # Add SST in Kelvin too
    if 'WOA_TEMP_K' not in df.columns:
        df['WOA_TEMP_K'] = df['WOA_TEMP'].values + 273.15
    return df
def get_example_obs(target='example', limit_depth_to=20):
    """
    Get the raw sparse observations from a database...

    Parameters
    -------
    target (str), Name of the target variable (e.g. iodide)
    limit_depth_to (float), depth from sea surface to include data (metres)

    Returns
    -------
    (pd.DataFrame)
    """
    # File to use (example name string...)
    filename = 'HC_seawater_concs_above_{}m.csv'.format(limit_depth_to)
    # Where is the file?
    s2s_root = utils.get_file_locations('s2s_root')
    folder = '{}/{}/inputs/'.format(s2s_root, target)
    df = pd.read_csv(folder + filename)
    # Variable name?
    Varname = 'example (pM)'
    # Assume using coord variables for now
    LatVar1 = '<native latitude name (+ve N)>'
    LonVar1 = '<native longitude name (+ve E)>'
    # Add time
    TimeVar1 = 'native Date and time (UTC)'
    month_var = 'Month'
    dt = pd.to_datetime(df[TimeVar1], format='%Y-%m-%d %H:%M:%S',
                        errors='coerce')
    df['datetime'] = dt

    def get_month(x):
        return x.month
    df[month_var] = df['datetime'].map(get_month)
    # Make sure all values are numeric
    for var in [Varname] + [LatVar1, LonVar1]:
        df.loc[:, var] = pd.to_numeric(df[var].values, errors='coerce')
    # Replace flagged values with NaN
    df.replace(999, np.NaN, inplace=True)
    df.replace(-999, np.NaN, inplace=True)
    # Update names to use
    cols2use = ['datetime', 'Month', LatVar1, LonVar1, Varname]
    name_dict = {LatVar1: 'Latitude', LonVar1: 'Longitude',
                 month_var: 'Month', Varname: target}
    df = df[cols2use].rename(columns=name_dict)
    # Add a unique identifier
    df['NEW_INDEX'] = range(1, df.shape[0] + 1)

    # Set this to a unique string instead of a number
    def get_unique_Data_Key_ID(x):
        return 'HC_{:0>6}'.format(int(x))
    df['Data_Key_ID'] = df['NEW_INDEX'].map(get_unique_Data_Key_ID)
    # Remove all the NaNs and print the change in dataset size to screen
    t0_shape = df.shape[0]
    df = df.dropna()
    if t0_shape != df.shape[0]:
        pstr = 'WARNING: Dropped obs. (#={}), now have #={} (had #={})'
        print(pstr.format(t0_shape - df.shape[0], df.shape[0], t0_shape))
    return df
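
# The .map(get_month) pattern above works, but pandas also offers a
# vectorised datetime accessor. A minimal sketch (the helper name here is
# illustrative, not part of the original module):
def add_month_column(df, dt_var='datetime', month_var='Month'):
    """
    Sketch: vectorised alternative for extracting the month, assuming
    df[dt_var] is already a datetime64 column
    """
    df[month_var] = df[dt_var].dt.month
    return df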
def get_processed_df_obs_mod(reprocess_params=False,
                             filename='Iodine_obs_WOA.csv',
                             rm_Skagerrak_data=False,
                             file_and_path='./sparse2spatial.rc',
                             verbose=True, debug=False):
    """
    Get the processed observation and model output

    Parameters
    -------
    file_and_path (str): folder and filename with location settings as single str
    rm_Skagerrak_data (bool): remove the single data from the Skagerrak region
    reprocess_params (bool): re-process the parameterisations (Chance et al etc)
    filename (str): name of the input file of processed observational data
    verbose (bool): print verbose statements
    debug (bool): print debug statements

    Returns
    -------
    (pd.DataFrame)
    """
    # Read in processed csv file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/Iodide/'
    df = pd.read_csv(folder + filename, encoding='utf-8')
    # Add ln of iodide too
    df['ln(Iodide)'] = df['Iodide'].map(np.ma.log)
    # Add SST in Kelvin too
    if 'WOA_TEMP_K' not in df.columns:
        df['WOA_TEMP_K'] = df['WOA_TEMP'].values + 273.15
    # Make sure month is numeric (if not given)
    month_var = 'Month'
    NaN_months_bool = ~np.isfinite(df[month_var].values)
    NaN_months_df = df.loc[NaN_months_bool, :]
    N_NaN_months = NaN_months_df.shape[0]
    if N_NaN_months > 0:
        print_str = 'DataFrame contains NaNs for {} months - '
        print_str += 'Replacing these with month # 3 months '
        print_str += 'before (hemispheric) summer solstice'
        if verbose:
            print(print_str.format(N_NaN_months))
        NaN_months_df[month_var] = NaN_months_df.apply(
            lambda x: set_backup_month_if_unknown(lat=x['Latitude'],
                                                  debug=False), axis=1)
        # Add back into DataFrame
        df.loc[NaN_months_bool, month_var] = NaN_months_df[month_var].values
    # Re-process the parameterisations (Chance et al etc + ensemble)?
    if reprocess_params:
        # Add predictions from literature
        df = get_literature_predicted_iodide(df=df)
        # Add ensemble prediction
        df = get_ensemble_predicted_iodide(rm_Skagerrak_data=rm_Skagerrak_data)
    return df
def process_iodide_obs_ancillaries_2_csv(rm_Skagerrak_data=False,
                                         add_ensemble=False,
                                         file_and_path='./sparse2spatial.rc',
                                         target='Iodide',
                                         verbose=True):
    """
    Create a csv file of iodide observations and ancillary observations

    Parameters
    -------
    file_and_path (str): folder and filename with location settings as single str
    add_ensemble (bool): add the ensemble prediction to input data dataframe
    rm_Skagerrak_data (bool): remove the single data from the Skagerrak region
    target (str): Name of the target variable (e.g. iodide)
    verbose (bool): print verbose statements

    Returns
    -------
    (None)

    Notes
    -----
    - Workflow assumes that this step will be run to compile the data
    """
    # Get iodide observations (as a dictionary/DataFrame)
    obs_data_df, obs_metadata_df = get_iodide_obs()
    # Add ancillary obs.
    obs_data_df = extract_ancillaries_from_compiled_file(df=obs_data_df)
    # Save the intermediate file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/{}/'.format(target)
    filename = 'Iodine_obs_WOA_v8_5_1_TEMP_TEST.csv'
    obs_data_df.to_csv(folder + filename, encoding='utf-8')
    # - Add predicted iodide from MacDonald and Chance parameterisations
    obs_data_df = get_literature_predicted_iodide(df=obs_data_df)
    # - Add ensemble prediction by averaging predictions at obs. locations?
    if add_ensemble:
        print('NOTE - models must have already been provided via RFR_dict')
        RFR_dict = build_or_get_models(rm_Skagerrak_data=rm_Skagerrak_data, )
        # Now extract the ensemble prediction for the observation locations
        obs_data_df = get_ensemble_predicted_iodide(
            df=obs_data_df, use_vals_from_NetCDF=False, RFR_dict=RFR_dict,
            rm_Skagerrak_data=rm_Skagerrak_data)
    # - Join dataframes and save as csv
    # (earlier versions used e.g. 'Iodine_obs_WOA.csv',
    #  'Iodine_obs_WOA_v8_1_PLUS_ENSEMBLE.csv',
    #  'Iodine_obs_WOA_v8_5_1_PLUS_ENSEMBLE_8_3_ENSEMBLE.csv')
    filename = 'Iodine_obs_WOA_v8_5_1_ENSEMBLE_csv__avg_nSkag_nOutliers.csv'
    if verbose:
        print(obs_data_df.shape, obs_data_df.columns)
    obs_data_df.to_csv(folder + filename, encoding='utf-8')
    if verbose:
        print('File saved to: ', folder + filename)
def get_CHBr3_obs(target='CHBr3', limit_depth_to=20,):
    """
    Get the raw observations from the HalOcAt database
    """
    # File to use
    filename = 'HC_seawater_concs_above_{}m.csv'.format(limit_depth_to)
    # Where is the file?
    data_root = utils.get_file_locations('data_root')
    folder = '{}/{}/inputs/'.format(data_root, target)
    df = pd.read_csv(folder + filename)
    # Variable name? - Just use one of the values for now
    Varname = 'CHBr3 (pM)'
    # Assume using coord variables for now
    LatVar1 = 'Sample start latitude (+ve N)'
    LonVar1 = 'Sample start longitude (+ve E)'
    # Add time
    TimeVar1 = 'Date (UTC) and time'
    TimeVar2 = 'Sampling date/time (UT)'
    month_var = 'Month'
    dt = pd.to_datetime(df[TimeVar1], format='%Y-%m-%d %H:%M:%S',
                        errors='coerce')
    df['datetime'] = dt

    def get_month(x):
        return x.month
    df[month_var] = df['datetime'].map(get_month)
    # Make sure all values are numeric
    for var in [Varname] + [LatVar1, LonVar1]:
        df.loc[:, var] = pd.to_numeric(df[var].values, errors='coerce')
    # Replace flagged values with NaN
    df.replace(999, np.NaN, inplace=True)
    df.replace(-999, np.NaN, inplace=True)
    # Update names to use
    cols2use = ['datetime', 'Month', LatVar1, LonVar1, Varname]
    name_dict = {LatVar1: 'Latitude', LonVar1: 'Longitude',
                 month_var: 'Month', Varname: target}
    df = df[cols2use].rename(columns=name_dict)
    # Add a unique identifier
    df['NEW_INDEX'] = range(1, df.shape[0] + 1)

    # Kludge for now: just use a name string plus a number
    def get_unique_Data_Key_ID(x):
        return 'HC_{:0>6}'.format(int(x))
    df['Data_Key_ID'] = df['NEW_INDEX'].map(get_unique_Data_Key_ID)
    # Remove all the NaNs
    t0_shape = df.shape[0]
    df = df.dropna()
    if t0_shape != df.shape[0]:
        pstr = 'WARNING: Dropped obs. (#={}), now have #={} (had #={})'
        print(pstr.format(t0_shape - df.shape[0], df.shape[0], t0_shape))
    return df
def get_iodide_obs_metadata(file_and_path='./sparse2spatial.rc'):
    """
    Extract and return metadata from the metadata csv
    """
    # What is the location of the iodide data?
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/Iodide/inputs/'
    # Filename?
    filename = 'Iodine_climatology_Submitted_data_list_formatted_TMS.xlsx'
    # Extract and return as a DataFrame
    df = pd.read_excel(folder + filename, sheet_name='Full')
    return df
def mk_RAD_NetCDF_monthly():
    """
    Resample shortwave radiation NetCDF from daily to monthly
    """
    # Directory?
    folder = utils.get_file_locations('data_root') + '/GFDL/'
    # Filename as a string
    file_str = 'ncar_rad.15JUNE2009.nc'
    ds = xr.open_dataset(folder + file_str)
    # Resample to monthly means
    ds = ds.resample(TIME='M').mean()
    # Save as NetCDF
    newfile_str = file_str.split('.nc')[0] + '_TMS_EDIT.nc'
    ds.to_netcdf(folder + newfile_str)
def process_obs_and_ancillaries_2_csv(target='CH2Br2',
                                      file_and_path='./sparse2spatial.rc'):
    """
    Process the observations and extract ancillary variables for their locations
    """
    # Get the base observations
    df = get_CH2Br2_obs()
    # Extract the ancillary values for these locations
    df = extract_ancillaries_from_compiled_file(df=df)
    # Save the intermediate file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries_v0_0_0.csv'.format(target)
    df.to_csv(folder + filename, encoding='utf-8')
def get_WOA18_data(automatically_download=False, target='Iodide'):
    """
    Get data from World Ocean Atlas (WOA) 2018 version 2

    Notes
    -------
    https://www.nodc.noaa.gov/OC5/woa18/woa18-preliminary-notes.html
    """
    # Use the data settings for Iodide
    file_and_path = './{}/sparse2spatial.rc'.format(target)
    data_root = utils.get_file_locations('data_root',
                                         file_and_path=file_and_path)
    folder = '{}/data/{}/'.format(data_root, 'WOA18')
    # Now loop through the list of variables to download
    vars_dict2download = store_of_values2download4WOA18()
    for n in vars_dict2download.keys():
        print(n, vars_dict2download[n].items())
        # Extract variables
        d = vars_dict2download[n]
        var = d['var']
        res = d['res']
        period = d['period']
        # Which specific subfolder to save data to?
        sfolder = '{}/{}/'.format(folder, var)
        # If decadal-averaged data, download both the monthly and the
        # seasonal files (codes '01'-'16', see the sketch below)
        if (period == 'decav') or (period == 'all'):
            seasons = ['{:0>2}'.format(i + 1) for i in np.arange(16)]
            # Download the files for each month/season
            for season in seasons:
                WOA18_data4var_period(folder=sfolder, season=season,
                                      period=period, res=res, var=var)
        # If the data is split by decade, just download the seasonal files
        else:
            seasons = ['{:0>2}'.format(i) for i in [13, 14, 15, 16]]
            # Download the files for each season
            for season in seasons:
                WOA18_data4var_period(folder=sfolder, season=season,
                                      period=period, res=res, var=var)
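
# For reference, the WOA averaging-period codes looped over above: '01'-'12'
# are calendar months and '13'-'16' are seasons. A sketch of the mapping,
# based on standard WOA conventions (check the WOA18 documentation before
# relying on it):
WOA18_SEASON_CODES = {
    '13': 'winter (Jan-Mar)',
    '14': 'spring (Apr-Jun)',
    '15': 'summer (Jul-Sep)',
    '16': 'autumn (Oct-Dec)',
}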
def process_obs_and_ancillaries_2_csv(target='example', version='v0_0_0',
                                      file_and_path='./sparse2spatial.rc'):
    """
    Process the observations and extract ancillary variables for their locations

    Parameters
    -------
    target (str), Name of the target variable (e.g. iodide)
    version (str), version name/number (e.g. semantic version - https://semver.org/)
    file_and_path (str), folder and filename with location settings as single str

    Returns
    -------
    (None)
    """
    # Get the base observations
    df = get_example_obs()
    # Extract the ancillary values for these locations
    df = ancillaries2grid.extract_ancillaries_from_compiled_file(df=df)
    # Save the intermediate file
    folder = utils.get_file_locations('s2s_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries_{}.csv'.format(target, version)
    df.to_csv(folder + filename, encoding='utf-8')
def mk_predictions_for_3D_features(dsA=None, RFR_dict=None, res='4x5',
                                   models_dict=None, features_used_dict=None,
                                   stats=None, folder=None, target='Iodide',
                                   use_updated_predictor_NetCDF=False,
                                   save2NetCDF=False, plot2check=False,
                                   models2compare=[], topmodels=None,
                                   xsave_str='', add_ensemble2ds=False,
                                   verbose=True, debug=False):
    """
    Make a NetCDF file of the predicted target from feature variables for a
    given resolution

    Parameters
    ----------
    dsA (xr.Dataset): dataset object with variables to interpolate
    RFR_dict (dict): dictionary of core variables and data
    res (str): horizontal resolution (e.g. 4x5) of Dataset
    save2NetCDF (bool): save the interpolated Dataset as a NetCDF?
    features_used_dict (dict): dictionary of feature variables in models
    models_dict (dict): dictionary of RFR models and their names
    stats (pd.DataFrame): dataframe of statistics on models in models_dict
    folder (str): location of NetCDF file of feature variables
    target (str): name of the species being predicted
    models2compare (list): list of models to make spatial predictions for
    topmodels (list): list of models to also include in the predictions
    xsave_str (str): string to include as suffix in filename used for saved NetCDF
    add_ensemble2ds (bool): calculate std. dev. and mean for list of topmodels
    verbose (bool): print out verbose output?
    debug (bool): print out debugging output?

    Returns
    -------
    (xr.Dataset)
    """
    # Make sure the core dictionary is provided
    assert (type(RFR_dict) == dict), 'Core variables must be provided as dict (RFR_dict)'
    # Make sure a full list of models was provided
    assert (len(models2compare) > 0), 'List of models must be provided!'
    # Inc. all the topmodels in the list of models to compare, if provided
    if isinstance(topmodels, list):
        models2compare += topmodels
    # Remove any double ups in the list of models to predict
    models2compare = list(set(models2compare))
    # Get the variables required here
    if isinstance(models_dict, type(None)):
        models_dict = RFR_dict['models_dict']
    if isinstance(features_used_dict, type(None)):
        features_used_dict = RFR_dict['features_used_dict']
    # Get location to save file and set filename
    if isinstance(folder, type(None)):
        folder = utils.get_file_locations('data_root') + '/data/'
    if isinstance(dsA, type(None)):
        filename = 'Oi_prj_feature_variables_{}.nc'.format(res)
        dsA = xr.open_dataset(folder + filename)
    # - Make a dataset of predictions for each model
    ds_l = []
    for modelname in models2compare:
        # Get the model
        model = models_dict[modelname]
        # Get the testing features
        features_used = utils.get_model_features_used_dict(modelname)
        # Make a Dataset of predicted values
        ds_tmp = utils.mk_da_of_predicted_values(dsA=dsA, model=model,
                                                 res=res,
                                                 modelname=modelname,
                                                 features_used=features_used)
        # Add attributes to the prediction
        ds_tmp = utils.add_attrs2target_ds(ds_tmp, add_global_attrs=False,
                                           varname=modelname)
        # Save to list
        ds_l += [ds_tmp]
    # Combine datasets
    ds = xr.merge(ds_l)
    # - Also get values for parameterisations
    # if target == 'Iodide':
    #     # Chance et al (2013)
    #     param = u'Chance2014_STTxx2_I'
    #     arr = utils.calc_I_Chance2014_STTxx2_I(dsA['WOA_TEMP'].values)
    #     ds[param] = ds[modelname]  # use existing array as dummy to fill
    #     ds[param].values = arr
    #     # MacDonald et al (2013)
    #     param = 'MacDonald2014_iodide'
    #     arr = utils.calc_I_MacDonald2014(dsA['WOA_TEMP'].values)
    #     ds[param] = ds[modelname]  # use existing array as dummy to fill
    #     ds[param].values = arr
    # Add the ensemble to ds too
    if add_ensemble2ds:
        print('WARNING: Using topmodels for ensemble as calculated here')
        var2template = list(ds.data_vars)[0]
        ds = RFRanalysis.add_ensemble_avg_std_to_dataset(
            ds=ds, res=res, target=target, RFR_dict=RFR_dict,
            topmodels=topmodels, var2template=var2template,
            save2NetCDF=False)
    # Add global attributes
    ds = utils.add_attrs2target_ds(ds, add_varname_attrs=False)
    # Save to NetCDF
    if save2NetCDF:
        filename = 'Oi_prj_predicted_{}_{}{}.nc'.format(target, res,
                                                        xsave_str)
        ds.to_netcdf(filename)
    else:
        return ds
def get_stats_on_spatial_predictions_0125x0125(use_annual_mean=True,
                                               target='Iodide',
                                               RFR_dict=None, ex_str='',
                                               just_return_df=False,
                                               folder=None, filename=None,
                                               rm_Skagerrak_data=False,
                                               debug=False):
    """
    Evaluate the spatial predictions between models at 0.125x0.125

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    debug (bool): print out debugging output?
    rm_Skagerrak_data (bool): remove specific data
        (above argument is an iodide specific option - remove this)
    just_return_df (bool): just return the data as a dataframe
    folder (str): folder where the NetCDF of predicted data is located
    filename (str): name of the NetCDF file of predicted data
    ex_str (str): extra string to include in the file name used to save data
    use_annual_mean (bool): use the annual mean of the variable for statistics

    Returns
    -------
    (pd.DataFrame)
    """
    # - Get spatial prediction data from NetCDF files saved already
    res = '0.125x0.125'
    if isinstance(filename, type(None)):
        if rm_Skagerrak_data:
            extr_file_str = '_No_Skagerrak'
        else:
            extr_file_str = ''
        filename = 'Oi_prj_predicted_{}_{}{}.nc'.format(target, res,
                                                        extr_file_str)
    if isinstance(folder, type(None)):
        data_root = utils.get_file_locations('data_root')
        folder = '{}/outputs/{}/'.format(data_root, target)
    ds = xr.open_dataset(folder + filename)
    # Variables to consider
    vars2analyse = list(ds.data_vars)
    # Add LWI and surface area to array
    ds = utils.add_LWI2array(ds=ds, res=res,
                             var2template='Chance2014_STTxx2_I')
    # Set a name for the output to be saved as
    file_save_str = 'Oi_prj_annual_stats_global_ocean_{}{}'.format(res,
                                                                   ex_str)
    # - Build an array with general statistics
    df = pd.DataFrame()
    # -- Get general annual stats
    # Take annual average over time (if using annual mean)
    if use_annual_mean:
        ds_tmp = ds.mean(dim='time')
    else:
        ds_tmp = ds
    for var_ in vars2analyse:
        # Mask to only consider (100%) water boxes
        arr = ds_tmp[var_].values
        arr = arr[(ds_tmp['IS_WATER'] == True)]
        # Save to dataframe
        df[var_] = pd.Series(arr.flatten()).describe()
    # Get area weighted mean too
    vals = []
    # Take annual average over time (if using annual mean) -
    # Q: why does this need to be done twice separately?
    if use_annual_mean:
        ds_tmp = ds.mean(dim='time')
    for var_ in vars2analyse:
        # Mask to only consider (100%) water boxes
        mask = ~(ds_tmp['IS_WATER'] == True)
        arr = np.ma.array(ds_tmp[var_].values, mask=mask)
        # Also mask surface area (s_area)
        s_area_tmp = np.ma.array(ds_tmp['AREA'].values, mask=mask)
        # Save value to list
        vals += [AC.get_2D_arr_weighted_by_X(arr, s_area=s_area_tmp)]
    # Add area weighted mean to df
    df = df.T
    df['mean (weighted)'] = vals
    df = df.T
    # Just return the dataframe of global stats?
    if just_return_df:
        return df
    # Save the values
    df.T.to_csv(file_save_str + '.csv')
    # - Print out a more formatted version as a table for the paper
    # Select just the models of interest
    topmodels = get_top_models(RFR_dict=RFR_dict,
                               vars2exclude=['DOC', 'Prod'])
    params = ['Chance2014_STTxx2_I', 'MacDonald2014_iodide',
              'Ensemble_Monthly_mean']
    df = df[topmodels + params]
    # Rename the models
    rename_titles = {u'Chance2014_STTxx2_I': 'Chance et al. (2014)',
                     u'MacDonald2014_iodide': 'MacDonald et al. (2014)',
                     'Ensemble_Monthly_mean': 'RFR(Ensemble)',
                     'Iodide': 'Obs.',
                     }
    df.rename(columns=rename_titles, inplace=True)
    # Sort the dataframe by the weighted mean values
    df = df.T
    df.sort_values(by=['mean (weighted)'], ascending=False, inplace=True)
    # Rename the columns (e.g. '50%' to 'median')
    cols2rename = {'50%': 'median', 'std': 'std. dev.', }
    df.rename(columns=cols2rename, inplace=True)
    df.rename(index=rename_titles, inplace=True)
    # Set the column order (stats to use first)
    first_columns = ['mean (weighted)', 'std. dev.', '25%', 'median',
                     '75%', 'max', ]
    if debug:
        print(df.head())
    df = df[first_columns]
    # Save as CSV
    df.round(1).to_csv(file_save_str + '_FOR_TABLE_' + '.csv')
    # - Do some further analysis and save this to a text file
    a = open(file_save_str + '_analysis.txt', 'w')
    # Set a header
    print('This file contains global analysis of {} data'.format(target),
          file=a)
    print('\n', file=a)
    # Which fields are being analysed?
    print('---- Detail on the predicted fields', file=a)
    models2compare = {
        1: u'RFR(Ensemble)',
        2: u'Chance et al. (2014)',
        3: u'MacDonald et al. (2014)',
    }
    if debug:
        print(df.head())
    df_tmp = df.T[list(models2compare.values())]
    # What are the core models?
    print('Core models being compared are:', file=a)
    for key in models2compare.keys():
        ptr_str = 'model {} - {}'
        print(ptr_str.format(key, models2compare[key]), file=a)
    print('\n', file=a)
    # Now print analysis on the predicted fields
    # Range in predicted model values
    mean_ = df_tmp.T['mean (weighted)'].values.mean()
    min_ = df_tmp.T['mean (weighted)'].values.min()
    max_ = df_tmp.T['mean (weighted)'].values.max()
    prt_str = 'avg predicted values = {:.5g} ({:.5g}-{:.5g})'
    print(prt_str.format(mean_, min_, max_), file=a)
    # Range in predicted model values
    range_ = max_ - min_
    prt_str = 'range of predicted avg values = {:.3g}'
    print(prt_str.format(range_), file=a)
    # % of range in predicted model values (as an error of model choice...)
    pcents_ = range_ / df_tmp.T['mean (weighted)'] * 100
    min_ = pcents_.min()
    max_ = pcents_.max()
    prt_str = 'As a % this is = {:.3g} ({:.5g}-{:.5g})'
    print(prt_str.format(pcents_.mean(), min_, max_), file=a)
    a.close()
def get_stats_on_spatial_predictions_4x5_2x25_by_lat(res='4x5', ex_str='',
                                                     target='Iodide',
                                                     use_annual_mean=False,
                                                     filename=None,
                                                     folder=None, ds=None,
                                                     var2template='Chance2014_STTxx2_I',
                                                     debug=False):
    """
    Evaluate the spatial predictions between models, binned by latitude

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    res (str): horizontal resolution of dataset (e.g. 4x5)
    debug (bool): print out debugging output?
    var2template (str): variable to use as a template for making new variables in ds
    use_annual_mean (bool): use the annual mean of the variable

    Returns
    -------
    (pd.DataFrame)
    """
    if isinstance(ds, type(None)):
        # If filename or folder not given, then use defaults
        if isinstance(filename, type(None)):
            filename = 'Oi_prj_predicted_{}_{}.nc'.format(target, res)
        if isinstance(folder, type(None)):
            data_root = utils.get_file_locations('data_root')
            folder = '{}/{}/outputs/'.format(data_root, target)
        ds = xr.open_dataset(folder + filename)
    # Variables to consider
    vars2analyse = list(ds.data_vars)
    # Add LWI to array
    ds = utils.add_LWI2array(ds=ds, var2template=var2template, res=res)
    # - Get general annual stats
    df = pd.DataFrame()
    # Take annual average?
    if use_annual_mean:
        ds_tmp = ds.mean(dim='time')
    else:
        ds_tmp = ds
    for var_ in vars2analyse:
        # Mask to only consider (100%) water boxes
        arr = ds_tmp[var_].values
        if debug:
            print(arr.shape, (ds_tmp['IS_WATER'] == False).shape)
        arr[(ds_tmp['IS_WATER'] == False).values] = np.NaN
        # Update values to include np.NaN
        ds_tmp[var_].values = arr
        # Setup series objects to hold stats
        s_mean = pd.Series()
        s_75 = pd.Series()
        s_50 = pd.Series()
        s_25 = pd.Series()
        # Loop by latitude and save stats to the series
        # (at the poles all values will be the same (masked) value)
        for lat_ in ds['lat'].values:
            vals = ds_tmp[var_].sel(lat=lat_).values
            stats_ = pd.Series(vals.flatten()).dropna().describe()
            # Save quartiles and mean
            s_mean[lat_] = stats_['mean']
            s_25[lat_] = stats_['25%']
            s_75[lat_] = stats_['75%']
            s_50[lat_] = stats_['50%']
        # Save variables to DataFrame
        var_str = '{} - {}'
        stats_dict = {'mean': s_mean, '75%': s_75, '25%': s_25,
                      'median': s_50}
        for stat_ in stats_dict.keys():
            df[var_str.format(var_, stat_)] = stats_dict[stat_]
    return df
def download_data4spec(lev2use=72, spec='LWI', res='0.125',
                       file_prefix='nature_run', doys_list=None,
                       verbose=True, debug=False):
    """
    Download all data for a given species at a given resolution

    Parameters
    -------
    spec (str): variable to extract from archived data
    res (str): horizontal resolution of dataset (e.g. 4x5)
    file_prefix (str): file prefix to add to saved file
    debug (bool): print out debugging output?

    Returns
    -------
    (None)

    Notes
    -----
    - Use level=71 for the lowest level (the NetCDF is ordered the opposite
    way; python 0-71. Xarray numbering makes this level=72)
    (or use a dictionary through xarray)
    """
    # - Local variables
    # Where is the remote data?
    root_url = 'https://opendap.nccs.nasa.gov/dods/OSSE/G5NR-Chem/Heracles/'
    # url_str = root_url+'12.5km/{}_deg/inst/inst1_3d_TRC{}_Nv'.format(res, spec)
    url_str = root_url + '12.5km/{}_deg/tavg/tavg1_2d_chm_Nx'.format(res)
    # Where should the data be saved?
    save_dir = utils.get_file_locations('data_root') + '/NASA/LWI/'
    # - Open dataset via URL with xarray
    # (issues were found with the NASA OpenDAP data model via PyDAP)
    ds = xr.open_dataset(url_str)
    if verbose:
        print(ds, '\n\n\n')
    # Get list of (all) doys to extract (unless provided as an argument)
    if isinstance(doys_list, type(None)):
        doys_list = list(set(ds['time.dayofyear'].values))
    # Variable to extract?
    var_name = '{}'.format(spec.lower())
    # Select the variable (all times and locations) for download
    ds = ds[var_name][:, :, :]
    # Make sure time is a local variable, not just the module
    time = ds.time

    # - Loop days of year (doy)
    # Custom mask
    def is_dayofyear(doy):
        return (doy == doy_)
    # Loop doys (NB: currently limited to the first four)
    for doy_ in doys_list[:4]:
        try:
            if verbose:
                print(doy_, spec)
            # Now select for the day of year
            ds_tmp = ds.sel(time=is_dayofyear(ds['time.dayofyear']))
            # Save as NetCDF
            year_ = list(set(ds_tmp['time.year'].values))[0]
            # What is the filename?
            fstr = '{}_lev_{}_res_{}_spec_{}_{}_{:0>3}_ctm.nc'
            file2save = fstr.format(file_prefix, lev2use, res, spec, year_,
                                    str(doy_))
            # Now save the downloaded data as a NetCDF locally...
            if verbose:
                print(save_dir + file2save)
            ds_tmp.to_netcdf(save_dir + file2save)
            # Remove from memory
            del ds_tmp
        except RuntimeError:
            err_str = 'TMS ERROR - FAIL for spec={} (doy={})'.format(spec,
                                                                     doy_)
            print(err_str)
def add_ensemble_avg_std_to_dataset(res='0.125x0.125', RFR_dict=None,
                                    target='Iodide', stats=None, ds=None,
                                    topmodels=None,
                                    var2template='Chance2014_STTxx2_I',
                                    var2use4Ensemble='Ensemble_Monthly_mean',
                                    var2use4std='Ensemble_Monthly_std',
                                    save2NetCDF=True):
    """
    Add the ensemble average and standard deviation to the dataset

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    var2use4Ensemble (str): variable name to use for the ensemble prediction
    var2use4std (str): variable name to use for the ensemble prediction's std dev.
    var2template (str): variable to use as a template to make new variables
    res (str): horizontal resolution of dataset (e.g. 4x5)
    topmodels (list): list of models to include in the ensemble prediction
    save2NetCDF (bool): save the dataset as a NetCDF file
    RFR_dict (dict): dictionary of core variables and data

    Returns
    -------
    (xr.Dataset)
    """
    # Get existing dataset from NetCDF if ds not provided
    filename = 'Oi_prj_predicted_{}_{}.nc'.format(target, res)
    if isinstance(ds, type(None)):
        data_root = utils.get_file_locations('data_root')
        folder = '{}/{}/'.format(data_root, target)
        ds = xr.open_dataset(folder + filename)
    # Make sure the top models (with derivative variables) are included
    if isinstance(topmodels, type(None)):
        # Extract the models...
        if isinstance(RFR_dict, type(None)):
            RFR_dict = build_or_get_models()
        # Get the list of top models
        topmodels = get_top_models(RFR_dict=RFR_dict,
                                   vars2exclude=['DOC', 'Prod'])
    # Now get the average concentrations and std dev. per month
    avg_ars = []
    std_ars = []
    for month in range(1, 13):
        ars = []
        for var in topmodels:
            ars += [ds[var].sel(time=(ds['time.month'] == month)).values]
        # Concatenate the models
        arr = np.concatenate(ars, axis=0)
        # Save the monthly average and standard deviation
        avg_ars += [np.ma.mean(arr, axis=0)]
        std_ars += [np.ma.std(arr, axis=0)]
    # Combine the arrays and then make the model variable
    # (1st template an existing variable, then overwrite the values)
    ds[var2use4Ensemble] = ds[var2template].copy()
    ds[var2use4Ensemble].values = np.stack(avg_ars)
    # And repeat for the standard deviation
    ds[var2use4std] = ds[var2template].copy()
    ds[var2use4std].values = np.stack(std_ars)
    # Save the list of models used to make the ensemble to the attributes
    attrs = ds.attrs.copy()
    attrs['Ensemble_members ({})'.format(var2use4Ensemble)] = ', '.join(topmodels)
    ds.attrs = attrs
    # Save to NetCDF
    if save2NetCDF:
        ds.to_netcdf(filename)
    return ds
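
# A minimal sketch of an alternative way to compute the same ensemble
# statistics with xarray itself: stack the per-model variables along a new
# 'model' dimension and reduce over it (the helper name is illustrative,
# not part of the original module):
def calc_ensemble_stats(ds, topmodels):
    """
    Sketch: return (mean, std) DataArrays over the ensemble of topmodels
    """
    stacked = xr.concat([ds[v] for v in topmodels], dim='model')
    return stacked.mean(dim='model'), stacked.std(dim='model')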
def mk_NetCDF_from_productivity_data():
    """
    Convert productivity .csv file (Behrenfeld and Falkowski, 1997) into a
    NetCDF file
    """
    # Location of data (update to use public facing host)
    folder = utils.get_file_locations('data_root') + '/Productivity/'
    # Which file to use?
    filename = 'productivity_behrenfeld_and_falkowski_1997_extrapolated.csv'
    # Setup coordinates
    lon = np.arange(-180, 180, 1/6.)
    lat = np.arange(-90, 90, 1/6.)
    lat = np.append(lat, [90])
    # Setup time
    varname = 'vgpm'
    months = np.arange(1, 13)
    # Extract data
    df = pd.read_csv(folder + filename, header=None)
    print(df.shape)
    # Extract data by month
    da_l = []
    for n in range(12):
        # Assume the data is in blocks by longitude?
        arr = df.values[:, n*1081: (n+1)*1081].T[None, ...]
        print(arr.shape)
        da_l += [xr.Dataset(
            data_vars={varname: (['time', 'lat', 'lon', ], arr)},
            coords={'lat': lat, 'lon': lon, 'time': [n]})]
    # Concatenate to a single xr.Dataset
    ds = xr.concat(da_l, dim='time')
    # Update the time coordinate (climate model time)
    sdate = datetime.datetime(1985, 1, 1)
    ds['time'] = [AC.add_months(sdate, i-1) for i in months]
    # Update to hours since the start date
    hours = [(AC.dt64_2_dt([i])[0] - sdate).days * 24.
             for i in ds['time'].values]
    ds['time'] = hours
    # Add units
    attrs_dict = {'units': 'hours since 1985-01-01 00:00:00'}
    ds['time'].attrs = attrs_dict
    # Add attributes for the variable
    attrs_dict = {
        'long_name': "net primary production",
        'units': "mg C / m**2 / day",
    }
    ds[varname].attrs = attrs_dict
    # For latitude...
    attrs_dict = {
        'long_name': "latitude",
        'units': "degrees_north",
        "standard_name": "latitude",
        "axis": "Y",
    }
    ds['lat'].attrs = attrs_dict
    # And longitude...
    attrs_dict = {
        'long_name': "longitude",
        'units': "degrees_east",
        "standard_name": "longitude",
        "axis": "X",
    }
    ds['lon'].attrs = attrs_dict
    # Add extra global attributes
    global_attribute_dictionary = {
        'Title': 'Sea-surface productivity (Behrenfeld and Falkowski, 1997)',
        'Author': 'Tomas Sherwen ([email protected])',
        'Notes': "Data extracted from OCRA and extrapolated to the poles by "
        "Martin Wadley. NetCDF constructed using xarray (xarray.pydata.org) "
        "by Tomas Sherwen. \n NOTES from the original site "
        "(http://orca.science.oregonstate.edu/): 'based on the standard "
        "vgpm algorithm. npp is based on the standard vgpm, using modis "
        "chl, sst4, and par as input; clouds have been filled in the input "
        "data using our own gap-filling software. For citation, please "
        "reference the original vgpm paper by Behrenfeld and Falkowski, "
        "1997a as well as the Ocean Productivity site for the data.'",
        'History': 'Last Modified on:' + strftime("%B %d %Y", gmtime()),
        'Conventions': "COARDS",
    }
    ds.attrs = global_attribute_dictionary
    # Save to NetCDF
    filename = 'productivity_behrenfeld_and_falkowski_1997_extrapolated.nc'
    ds.to_netcdf(filename, unlimited_dims={'time': True})
def extract_templated_excel_file(limit_depth_to=20, Data_Key=None,
                                 metadata_df=None, use_inclusive_limit=False,
                                 file_and_path='./sparse2spatial.rc',
                                 verbose=True, debug=False):
    """
    Extract an excel file in the iodide template format & return as DataFrame

    Parameters
    -------
    file_and_path (str): folder and filename with location settings as single str
    Data_Key (str): unique key for the dataset to extract
    metadata_df (pd.DataFrame): metadata for the archived datasets
    limit_depth_to (float), depth (m) to limit inclusion of data to
    use_inclusive_limit (bool), limit depth (limit_depth_to) in an inclusive way
    verbose (bool), print verbose statements to screen
    debug (bool), print debugging statements to screen

    Returns
    -------
    (pd.DataFrame)
    """
    # - Get file details
    # Load metadata file as a DataFrame
    Data_Key_meta = metadata_df[metadata_df.Data_Key == Data_Key]
    # Use TMS updated variable for filename
    filename = Data_Key_meta['File_name_UPDATED'].values[0]
    source = Data_Key_meta['source'].values[0]
    InChance2014 = Data_Key_meta['In Chance2014?'].values[0] == 'Y'
    # - Get the directory which contains the files
    folder = utils.get_file_locations('data_root',
                                      file_and_path=file_and_path)
    folder = '/{}/Iodide/'.format(folder)
    # New data, acquired since 2017
    if (not InChance2014):
        folder += '/inputs/new_data/'
    # Data submitted directly for preparation
    # (as published by Chance et al (2014))
    elif ((source == 's') or (source == 'bodc')) and (InChance2014):
        folder += '/inputs/submitted_data/'
    # Data digitised for Chance et al (2014)
    elif (source == 'd') and (InChance2014):
        folder += '/inputs/digitised_data/'
    else:
        print("Source received ('{}') unknown?!".format(source))
        sys.exit()
    # File specific reading settings?
    read_csv_settings = read_csv_settings_4_data_key_file(Data_Key=Data_Key)
    skiprows, file_extension = read_csv_settings
    # - Read the file and process
    if verbose:
        print('reading: {}'.format(filename), Data_Key)
    df = pd.read_excel(folder + filename + file_extension,
                       sheet_name='Data', skiprows=skiprows)
    # Force use of 'Index' column as index to preserve ordering.
    df.index = df['Index'].values
    # Only consider values with a depth value lower than x (e.g. 100m)
    # From Chance et al (2014): On ship-based campaigns, 'surface' water is
    # usually collected from an underway pumped seawater inlet
    # (typically at a depth of around 6 m on a 100 m length research
    # ship), and/or sampling bottles mounted on a CTD rosette and
    # closed within a few metres of the sea surface, but during some
    # field campaigns (e.g. winter samples in the Antarctic73), only
    # data from 15 m depth was available. In most cases, the water
    # column is thought to be sufficiently homogenous between 0 and
    # 20 m that this choice of depth can be assumed to be representative
    # of concentrations in the top few metres of the water
    # column (see Section 3.4 for a description of the changes in
    # iodine speciation with depth).
    if verbose:
        print(df.columns)
    if use_inclusive_limit:
        # Consider values inclusively
        df = df.loc[df['Depth'] <= limit_depth_to, :]
    else:
        # Only consider values less than X
        df = df.loc[df['Depth'] < limit_depth_to, :]

    # Add a column to be a unique identifier and column index
    def get_unique_Data_Key_label(x, Data_Key=Data_Key):
        # Use the index as the number (which now starts from 1)
        x = int(x)
        return '{}_{:0>4}'.format(Data_Key, x)
    # Map to the index, then assign to be the index
    df['Data_Key_ID'] = df['Index'].map(get_unique_Data_Key_label)
    # Also add a column for the data key
    df['Data_Key'] = Data_Key
    return df
def get_processed_df_obs_mod(reprocess_params=False,
                             target='CHBr3',
                             filename='s2s_CHBr3_obs_ancillaries.csv',
                             rm_Skagerrak_data=False,
                             file_and_path='./sparse2spatial.rc',
                             verbose=True, debug=False):
    """
    Get the processed observation and model output

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    filename (str): name of the csv file of processed observational data
    file_and_path (str): folder and filename with location settings as single str
    reprocess_params (bool): re-process the parameterisations (unused here)
    rm_Skagerrak_data (bool): remove the data from the Skagerrak region (unused here)
    verbose (bool): print verbose statements
    debug (bool): print debug statements

    Returns
    -------
    (pd.DataFrame)
    """
    # Read in processed csv file
    folder = utils.get_file_locations('data_root', file_and_path=file_and_path)
    folder += '/{}/inputs/'.format(target)
    filename = 's2s_{}_obs_ancillaries.csv'.format(target)
    df = pd.read_csv(folder + filename, encoding='utf-8')
    # Add SST in Kelvin too
    if 'WOA_TEMP_K' not in df.columns:
        df['WOA_TEMP_K'] = df['WOA_TEMP'].values + 273.15
    # NOTE: the iodide-specific steps in the equivalent iodide function
    # (Chlorophyll float coercion, ln(target), coastal flagging, backup-month
    # filling, and parameterisation re-processing) are not applied here
    return df
def get_stats_on_spatial_predictions_4x5_2x25(res='4x5', ex_str='',
                                              target='Iodide',
                                              use_annual_mean=True,
                                              filename=None, folder=None,
                                              just_return_df=False,
                                              var2template='Chance2014_STTxx2_I',
                                              ):
    """
    Evaluate the spatial predictions between models at a resolution of 4x5
    or 2x2.5

    Parameters
    -------
    target (str): Name of the target variable (e.g. iodide)
    res (str): horizontal resolution of dataset (e.g. 4x5)
    var2template (str): variable to use as a template for making new variables in ds
    use_annual_mean (bool): use the annual mean of the variable
    just_return_df (bool): just return the data as a dataframe

    Returns
    -------
    (pd.DataFrame or None)
    """
    # If filename or folder not given, then use defaults
    if isinstance(filename, type(None)):
        filename = 'Oi_prj_predicted_{}_{}.nc'.format(target, res)
    if isinstance(folder, type(None)):
        data_root = utils.get_file_locations('data_root')
        folder = '{}/{}/outputs/'.format(data_root, target)
    ds = xr.open_dataset(folder + filename)
    # Variables to consider
    vars2plot = list(ds.data_vars)
    # Add LWI and surface area to the array
    ds = utils.add_LWI2array(ds=ds, var2template=var2template)
    IS_WATER = ds['IS_WATER'].mean(dim='time')
    # - Get general annual stats in a dataframe
    df = pd.DataFrame()
    for var_ in vars2plot:
        ds_tmp = ds[var_].copy()
        # Take annual average
        if use_annual_mean:
            ds_tmp = ds_tmp.mean(dim='time')
        # Mask to only consider (100%) water boxes
        arr = ds_tmp.values
        arr = arr[(IS_WATER == True)]
        # Save to dataframe
        df[var_] = pd.Series(arr.flatten()).describe()
    # Get area weighted mean
    vals = []
    for var_ in vars2plot:
        ds_tmp = ds[var_]
        # Take annual average
        if use_annual_mean:
            ds_tmp = ds_tmp.mean(dim='time')
        # Mask to only consider (100%) water boxes
        mask = ~(IS_WATER == True).values
        arr = np.ma.array(ds_tmp.values, mask=mask)
        # Also mask the surface area (s_area)
        s_area = ds['AREA']
        if 'time' in s_area.dims:
            s_area = s_area.mean(dim='time')
        s_area_tmp = np.ma.array(s_area.values, mask=mask)
        # Save the value
        vals += [AC.get_2D_arr_weighted_by_X(arr, s_area=s_area_tmp)]
    # Add area weighted mean to df
    df = df.T
    df['mean (weighted)'] = vals
    df = df.T
    # Save or just return the values
    file_save = 'Oi_prj_annual_stats_global_ocean_{}{}.csv'.format(res,
                                                                   ex_str)
    if just_return_df:
        return df
    df.T.to_csv(file_save)
def add_all_Chance2014_correlations(df=None, debug=False, verbose=False):
    """
    Add the Chance et al (2014) parameterisations to df (from processed .csv)
    """
    # Get details of the parameterisations
    filename = 'Chance_2014_Table2_PROCESSED.csv'
    folder = utils.get_file_locations('data_root')
    folder += '/Iodide/'
    param_df = pd.read_csv(folder + filename)
    # Map input variables
    input_dict = {
        'C': 'WOA_TEMP',
        'ChlorA': 'SeaWIFs_ChlrA',
        'K': 'WOA_TEMP_K',
        'Lat': 'Latitude',
        'MLDpd': 'WOA_MLDpd',
        'MLDpt': 'WOA_MLDpt',
        'MLDvd': 'WOA_MLDvd',
        'MLDpd_max': 'WOA_MLDpd_max',
        'MLDpt_max': 'WOA_MLDpt_max',
        'MLDvd_max': 'WOA_MLDvd_max',
        'MLDpd_sum': 'WOA_MLDpd_sum',
        'MLDpt_sum': 'WOA_MLDpt_sum',
        'MLDvd_sum': 'WOA_MLDvd_sum',
        'NO3': 'WOA_Nitrate',
        'Salinity': 'WOA_Salinity',
    }
    # - Loop the parameterisations and add each to the dataframe
    # (see the sketch after this function for the core of this loop)
    for param in param_df['TMS ID'].values:
        sub_df = param_df[param_df['TMS ID'] == param]
        if debug:
            print(sub_df)
        # Extract the driver variable
        data = df[input_dict[sub_df.param.values[0]]].values
        # Which function to use?
        func2use = str(sub_df.function.values[0])
        if debug:
            print(func2use)
        # Apply any function to the data
        # (NOTE: 'max' and 'sum' options are not yet implemented)
        if func2use == 'None':
            pass
        elif func2use == 'abs':
            data = abs(data)
        elif func2use == 'inverse':
            data = 1. / data
        elif func2use == 'square':
            data = data**2
        else:
            print('function not in list')
            sys.exit()
        # Apply the linear scaling (y = m*x + c)
        m, c = [sub_df[i].values[0] for i in ['m', 'c']]
        data = (m * data) + c
        # Now add to the dataframe
        df[param] = data
    return df
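
# A minimal sketch of what the loop above does for a single parameterisation
# row: optionally transform the driver variable, then apply the linear
# scaling y = m * f(x) + c (the helper name is illustrative only):
def apply_linear_param(data, m, c, func2use='None'):
    """
    Sketch: apply one Chance2014-style Table 2 fit to an array of data
    """
    transforms = {
        'None': lambda x: x,
        'abs': np.abs,
        'inverse': lambda x: 1. / x,
        'square': lambda x: x**2,
    }
    return m * transforms[func2use](data) + c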
def get_core_Chance2014_obs(debug=False, file_and_path='./sparse2spatial.rc'):
    """
    Get the core observational data from Chance2014

    Parameters
    -------
    file_and_path (str): folder and filename with location settings as single str
    debug (bool): print debugging to screen

    Returns
    -------
    (pd.DataFrame)

    Notes
    -----
    - This assumes that core data is "surface data" above 20m
    - Only considers rows of the csv where there is iodide data
    """
    # - Get the observational file
    # Directory to use?
    folder = utils.get_file_locations('data_root',
                                      file_and_path=file_and_path)
    folder = '{}/Iodide/inputs/'.format(folder)
    # Filename for the <20m iodide data?
    filename = 'Iodide_data_above_20m.csv'
    # Open the data as a DataFrame
    df = pd.read_csv(folder + filename)
    # - Process the input observational data
    # List of core variables
    core_vars = [
        'Ammonium', 'Chl-a', 'Cruise', 'Data_Key', 'Data_Key_ID',
        'Date', 'Day', 'Depth', 'Iodate', 'Iodide', 'Latitude',
        'Longitude', 'MLD', 'Month', 'MLD(vd)', 'Nitrate', 'Nitrite',
        'O2', 'Organic-I', 'Salinity', 'Station', 'Temperature', 'Time',
        'Total-I', 'Unique id', 'Year',
        u'Method', u'ErrorFlag',
    ]
    # The 2nd iteration excludes 'MLD(vd)', so remove this
    core_vars.pop(core_vars.index('MLD(vd)'))
    # The 2nd iteration includes new flag columns. Add these
    core_vars += ['Coastal', 'LocatorFlag', 'Province', ]
    # Just select the core variables
    df = df[core_vars]

    # Remove datapoints that are not floats
    def make_sure_values_are_floats(x):
        """
        Some values in the dataframes are "nd" or "###?". Remove these.
        """
        try:
            x = float(x)
        except (ValueError, TypeError):
            x = np.NaN
        return x
    # TODO: Make this more pythonic
    make_data_floats = [
        'Ammonium', 'Chl-a', 'Iodate', 'Iodide', 'Latitude',
        'Longitude', 'MLD', 'MLD(vd)', 'Month', 'Nitrate', 'Nitrite',
        'O2', 'Organic-I', 'Salinity', 'Temperature', 'Total-I'
    ]
    # The 2nd iteration excludes 'MLD(vd)', so remove this
    make_data_floats.pop(make_data_floats.index('MLD(vd)'))
    # The 2nd iteration includes new flag columns. Add these
    make_data_floats += ['Coastal', 'LocatorFlag', 'Province', ]
    # v8.4 had further updates
    make_data_floats += ['ErrorFlag', ]
    for col in make_data_floats:
        df[col] = df[col].map(make_sure_values_are_floats)[:]
    # Only consider rows where there is iodide data
    # (of the values from <20m, N=930)
    if debug:
        print('I- df shape (inc. NaNs): {}'.format(str(df.shape)))
    df = df[np.isfinite(df['Iodide'])]
    if debug:
        print("I- df post rm'ing NaNs: {}".format(str(df.shape)))
    return df
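
# Note: the row-wise make_sure_values_are_floats mapping above has a
# vectorised pandas equivalent. A sketch (behaviour should match, with
# non-numeric entries such as 'nd' coerced to NaN):
def coerce_columns_to_float(df, cols):
    """
    Sketch: coerce the given columns to floats, mapping bad values to NaN
    """
    for col in cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df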
def Hyperparameter_Tune_model(use_choosen_model=True, model=None,
                              RFR_dict=None, df=None, cv=3,
                              testset='Test set (strat. 20%)',
                              target='Iodide', features_used=None,
                              model_name=None, save_best_estimator=True):
    """
    Driver to tune the hyperparameters of a model

    Parameters
    -------
    testset (str): Testset to use, e.g. stratified sampling over quartiles for 20%:80%
    target (str): Name of the target variable (e.g. iodide)
    RFR_dict (dict): dictionary of core variables and data
    model_name (str): name of the model to tune the performance of
    features_used (list): list of the features within the model_name model
    save_best_estimator (bool): save the best performing model offline
    model (RandomForestRegressor), Random Forest Regressor model to tune
    cv (int), number of folds of cross-validation to use

    Returns
    -------
    (RandomForestRegressor)
    """
    from sklearn.externals import joblib
    from sklearn.ensemble import RandomForestRegressor
    # Get the data to test
    if isinstance(df, type(None)):
        df = RFR_dict['df']
    # Use the model selected from the feature testing
    if use_choosen_model:
        assert_str = "model name not needed as use_choosen_model selected!"
        assert isinstance(model, type(None)), assert_str
        # Select a single chosen model
        mdict = get_choosen_model_from_features_selection()
        features_used = mdict['features_used']
        model = mdict['model']
        model_name = mdict['name']
    # - Extract the training dataset
    test_set = df.loc[df[testset] == True, :]
    train_set = df.loc[df[testset] == False, :]
    # Also sub-select all vectors for the input data
    # (making sure to remove the target!)
    train_features = df[features_used].loc[train_set.index]
    train_labels = df[[target]].loc[train_set.index]
    test_features = df[features_used].loc[test_set.index]
    test_labels = df[[target]].loc[test_set.index]
    # - Make the base model for comparisons
    base_model = RandomForestRegressor(n_estimators=10, random_state=42,
                                       criterion='mse')
    base_model.fit(train_features, train_labels)
    quick_model_evaluation(base_model, test_features, test_labels)
    # - First make an initial exploration of the parameter space
    rf_random = Use_RS_CV_to_explore_hyperparams(
        cv=cv, train_features=train_features, train_labels=train_labels,
        features_used=features_used)
    # Check the performance from random searching (RandomizedSearchCV)
    best_random = rf_random.best_estimator_
    best_params_ = rf_random.best_params_
    print(rf_random.best_params_)
    quick_model_evaluation(best_random, test_features, test_labels)
    # - Now do a more focused optimisation
    # Get the parameters based on the RandomizedSearchCV output
    param_grid = define_hyperparameter_options2test(
        features_used=features_used, best_params_=best_params_,
        param_grid_RandomizedSearchCV=True)
    # Use GridSearchCV
    grid_search = use_GS_CV_to_tune_Hyperparams(
        cv=cv, train_features=train_features, param_grid=param_grid,
        train_labels=train_labels, features_used=features_used,)
    print(grid_search.best_params_)
    # Check the performance of the grid searching
    BEST_ESTIMATOR = grid_search.best_estimator_
    quick_model_evaluation(BEST_ESTIMATOR, test_features, test_labels)
    # Save the best estimator now for future use
    if save_best_estimator:
        data_root = utils.get_file_locations('data_root')
        folder = '{}/{}/models/LIVE/OPTIMISED_MODELS/'.format(data_root,
                                                              target)
        model_savename = "my_model_{}.pkl".format(model_name)
        joblib.dump(BEST_ESTIMATOR, folder + model_savename)
    return BEST_ESTIMATOR
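
# For context, a minimal sketch of the kind of search space that
# Use_RS_CV_to_explore_hyperparams might cover with RandomizedSearchCV
# (the grid values here are illustrative only; the real options live in
# define_hyperparameter_options2test):
def sketch_random_search(train_features, train_labels, cv=3):
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.ensemble import RandomForestRegressor
    param_distributions = {
        'n_estimators': [100, 200, 500],
        'max_features': ['auto', 'sqrt'],
        'max_depth': [10, 50, None],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }
    rf_random = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                                   param_distributions, n_iter=20, cv=cv,
                                   random_state=42, n_jobs=-1)
    rf_random.fit(train_features, train_labels)
    return rf_random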
def mk_predictions_NetCDF_4_many_builds(model2use, res='4x5',
                                        models_dict=None,
                                        features_used_dict=None,
                                        RFR_dict=None, target='Iodide',
                                        stats=None, plot2check=False,
                                        rm_Skagerrak_data=False,
                                        debug=False):
    """
    Make a NetCDF file of predicted variables for a given resolution

    Parameters
    -------
    model2use (str): name of the model to use
    target (str): Name of the target variable (e.g. iodide)
    RFR_dict (dict): dictionary of core variables and data
    res (str): horizontal resolution of dataset (e.g. 4x5)
    features_used_dict (dict): dictionary of feature variables in models
    plot2check (bool): make a quick plot to check the prediction
    models_dict (dict): dictionary of RFR models and their names
    stats (pd.DataFrame): dataframe of statistics on models in models_dict
    rm_Skagerrak_data (bool): remove specific data
        (above argument is an iodide specific option - remove this)
    debug (bool): print out debugging output?

    Returns
    -------
    (None)
    """
    from sklearn.externals import joblib
    import gc
    import glob
    # - Local variables
    # Extract the models...
    if isinstance(RFR_dict, type(None)):
        RFR_dict = build_or_get_models(rm_Skagerrak_data=rm_Skagerrak_data)
    # Get the variables required here
    if isinstance(features_used_dict, type(None)):
        features_used_dict = RFR_dict['features_used_dict']
    # Set the extr_str if rm_Skagerrak_data set to True
    if rm_Skagerrak_data:
        extr_str = '_No_Skagerrak'
    else:
        extr_str = ''
    # Get the location of the file of feature variables
    folder = utils.get_file_locations('data_root') + '/data/'
    filename = 'Oi_prj_feature_variables_{}.nc'.format(res)
    dsA = xr.open_dataset(folder + filename)
    # Get the location of the ensemble builds of the models
    folder_str = '{}/{}/models/LIVE/ENSEMBLE_REPEAT_BUILD{}/'
    folder = folder_str.format(folder, target, extr_str)
    # - Make a dataset for each model
    ds_l = []
    # Get the list of models built
    models_str = folder + '*{}*.pkl'.format(model2use)
    builds4model = glob.glob(models_str)
    print(builds4model, models_str)
    # Print a string to debug the output
    db_str = "Found {} saved models for '{}' - glob str: '{}'"
    print(db_str.format(len(builds4model), model2use, models_str))
    # Get the numbers for the models in the directory
    b_modelnames = [i.split('my_model_')[-1][:-3] for i in builds4model]
    # Check the number of models selected
    ast_str = "There aren't models for {} in {}"
    assert len(b_modelnames) > 1, ast_str.format(model2use, folder)
    # Now loop by the model built for each ensemble member and predict values
    for n_modelname, b_modelname in enumerate(b_modelnames):
        # Load the model
        model = joblib.load(builds4model[n_modelname])
        # Get the testing features
        features_used = features_used_dict[model2use].split('+')
        # Make a Dataset of the predicted values
        ds_l += [
            mk_da_of_predicted_values(model=model, res=res, dsA=dsA,
                                      modelname=b_modelname,
                                      features_used=features_used)
        ]
        # Force local tidy of garbage
        gc.collect()
    # Combine the datasets
    ds = xr.merge(ds_l)
    # - Also get values for the existing parameterisations
    if target == 'Iodide':
        # Chance et al (2013)
        param = u'Chance2014_STTxx2_I'
        arr = utils.calc_I_Chance2014_STTxx2_I(dsA['WOA_TEMP'].values)
        ds[param] = ds[b_modelname]  # use existing array as dummy to fill
        ds[param].values = arr
        # MacDonald et al (2013)
        param = 'MacDonald2014_iodide'
        arr = utils.calc_I_MacDonald2014(dsA['WOA_TEMP'].values)
        ds[param] = ds[b_modelname]  # use existing array as dummy to fill
        ds[param].values = arr
    # Do a test diagnostic plot?
    if plot2check:
        for var_ in ds.data_vars:
            # Do a quick plot to check
            arr = ds[var_].mean(dim='time')
            AC.map_plot(arr, res=res)
            plt.title(var_)
            plt.show()
    # Save to NetCDF
    save_name = 'Oi_prj_predicted_{}_{}_ENSEMBLE_BUILDS_{}_{}.nc'
    ds.to_netcdf(save_name.format(target, res, model2use, extr_str))
def process_MLD_csv2NetCDF(debug=False, _fill_value=-9999.9999E+10):
    """
    Process NOAA WOA94 csv files into NetCDF files

    Parameters
    -------
    _fill_value (float): fill value to use for the new NetCDF
    debug (bool): perform debugging and verbose printing?

    Returns
    -------
    (None)

    Notes
    -----
    The MLD fields available are computed from climatological monthly mean
    profiles of potential temperature and potential density based on three
    different criteria: a temperature change from the ocean surface of 0.5
    degree Celsius, a density change from the ocean surface of 0.125
    (sigma units), and a variable density change from the ocean surface
    corresponding to a temperature change of 0.5 degree Celsius. The MLD
    based on the variable density criterion is designed to account for the
    large variability of the coefficient of thermal expansion that
    characterizes seawater.

    Citation: Monterey, G. and Levitus, S., 1997: Seasonal Variability of
    Mixed Layer Depth for the World Ocean. NOAA Atlas NESDIS 14, U.S.
    Gov. Printing Office, Wash., D.C., 96 pp. 87 figs.
    """
    # Variables to process
    MLD_vars = ['pt', 'pd', 'vd']
    folder = utils.get_file_locations('data_root') + '/WOA94/'
    # - Loop MLD variables
    for var_ in MLD_vars:
        file_str = 'mld*{}*'.format(var_)
        files = sorted(glob.glob(folder + file_str))
        print(files)
        # Loop the files and extract the data as an array
        ars = []
        for file in files:
            # Values are assumed to have been output in a row major way,
            # e.g. (lon, lat)
            with open(file, 'rb') as file_:
                # Extract all values
                lines = [i.split() for i in file_]
                # Convert to floats (and masked values (e.g. "-") to NaN),
                # then concatenate to one "big" list
                big = []
                for n, line in enumerate(lines):
                    for value in line:
                        try:
                            value = float(value)
                        except ValueError:
                            value = np.NaN
                        big += [value]
                # Now reshape
                ars += [np.ma.array(big).reshape((180, 360)).T]
        # Debug (?) by showing the 2D grid
        if debug:
            plt.pcolor(np.arange(0, 360), np.arange(0, 180), ars[0])
            plt.colorbar()
            plt.show()
        # Force to be in COARDS format ((lat, lon) instead of (lon, lat))
        ars = [i.T for i in ars]
        # Fill NaNs with _fill_value
        ars = [np.ma.filled(i, fill_value=_fill_value) for i in ars]
        # Then convert to numpy arrays...
        ars = [np.array(i) for i in ars]
        print([type(i) for i in ars])
        # Force dates
        dates = [datetime.datetime(1985, 1, i+1) for i in range(12)]
        lons = np.arange(0 + 0.5, 360 + 0.5, 1)
        lats = np.arange(-90 + 0.5, 90 + 0.5, 1)
        res = '1x1'
        # Save to NetCDF
        AC.save_2D_arrays_to_3DNetCDF(ars=ars, dates=dates, varname=var_,
                                      res=res,
                                      filename='WOA94_MLD_1x1_{}'.format(var_),
                                      lons=lons, lats=lats)
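
# A minimal sketch of an alternative to the manual line-by-line parsing in
# process_MLD_csv2NetCDF, assuming each file holds a single 180x360
# whitespace-separated grid with '-' marking missing values (the helper
# name is illustrative, not part of the original module):
def read_mld_grid(path):
    """
    Sketch: read one WOA94 MLD text grid into a (180, 360) array with NaNs
    """
    arr = np.genfromtxt(path, missing_values='-', filling_values=np.nan)
    return arr.ravel().reshape((180, 360))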
def Hyperparameter_Tune4choosen_models(RFR_dict=None, target='Iodide', cv=7,
                                       testset='Test set (strat. 20%)'):
    """
    Driver to tune multiple RFR models

    Parameters
    -------
    testset (str): testset to use, e.g. stratified sampling over
        quartiles for 20%:80%
    cv (int): number of folds of cross-validation to use
    target (str): name of the target variable (e.g. iodide)
    RFR_dict (dict): dictionary of models, data and shared variables

    Returns
    -------
    (None)
    """
    from sklearn.externals import joblib
    # Get the data for the models
    if isinstance(RFR_dict, type(None)):
        RFR_dict = build_or_get_models()
    # Set models to optimise
    models2compare = get_top_models(RFR_dict=RFR_dict,
                                    vars2exclude=['DOC', 'Prod'])
    # Get variables needed from core dictionary
    features_used_dict = RFR_dict['features_used_dict']
    models_dict = RFR_dict['models_dict']
    # Set folder to use for optimised models
    data_root = utils.get_file_locations('data_root')
    folder = '{}/{}/models/LIVE/OPTIMISED_MODELS/'.format(data_root, target)
    # Loop and save optimised model
    # NOTE: this could be sped up by using more cores
    for model_name in models2compare:
        print('Optimising model: {}'.format(model_name))
        # Get model
        model = models_dict[model_name]
        # Get testing features
        features_used = features_used_dict[model_name].split('+')
        # Tune parameters
        BE = Hyperparameter_Tune_model(model=model, use_choosen_model=False,
                                       save_best_estimator=True,
                                       model_name=model_name,
                                       RFR_dict=RFR_dict,
                                       features_used=features_used, cv=cv)
    # - Test the tuned models against the test and training sets
    test_the_tuned_models = False
    if test_the_tuned_models:
        # Get the core data
        df = RFR_dict['df']
        # Get the data
        test_set = df.loc[df[testset] == True, :]
        train_set = df.loc[df[testset] == False, :]
        # Test the improvements in the optimised models on the test set
        for model_name in models2compare:
            # - Get existing model
            model = models_dict[model_name]
            # Get testing features
            features_used = features_used_dict[model_name].split('+')
            # - Get the data
            # (Make sure to remove the target)
#            train_features = df[features_used].loc[train_set.index]
#            train_labels = df[[target]].loc[train_set.index]
            test_features = df[features_used].loc[test_set.index]
            test_labels = df[[target]].loc[test_set.index]
            # - Test the existing model
            print(' ---------------- ' * 3)
            print(' ---------------- {}: '.format(model_name))
            print(' - Base values: ')
            quick_model_evaluation(model, test_features, test_labels)
            # - Get optimised model
            try:
                model_savename = "my_model_{}.pkl".format(model_name)
                OPmodel = joblib.load(folder + model_savename)
                #
                print(' - Optimised values: ')
                quick_model_evaluation(OPmodel, test_features, test_labels)
            except Exception:
                # Skip if no optimised model was saved for this model
                pass
        # - Test the tuned models against the training set
        for model_name in models2compare:
            # - Get existing model
            model = models_dict[model_name]
            # Get testing features
            features_used = features_used_dict[model_name].split('+')
            # - Get the data
            # (Make sure to remove the target)
            train_features = df[features_used].loc[train_set.index]
            train_labels = df[[target]].loc[train_set.index]
#            test_features = df[features_used].loc[test_set.index]
#            test_labels = df[[target]].loc[test_set.index]
            # - Test the existing model
            print(' ---------------- ' * 3)
            print(' ---------------- {}: '.format(model_name))
            print(' - Base values: ')
            quick_model_evaluation(model, train_features, train_labels)
            # - Get optimised model
            try:
                model_savename = "my_model_{}.pkl".format(model_name)
                OPmodel = joblib.load(folder + model_savename)
                #
                print(' - Optimised values: ')
                quick_model_evaluation(OPmodel, train_features, train_labels)
            except Exception:
                # Skip if no optimised model was saved for this model
                pass
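
# ---------------------------------------------------------------------------
# NOTE: Hyperparameter_Tune_model() (called above) is defined elsewhere in
# this module. As a point of reference, a minimal sketch of cross-validated
# tuning for a RandomForestRegressor is given below; the parameter
# distributions, scoring metric, and random seed here are illustrative and
# not necessarily those used by the real helper.
def _sketch_tune_RFR(X, y, cv=7):
    """Illustrative RandomizedSearchCV tuning of a RandomForestRegressor"""
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RandomizedSearchCV
    # Hypothetical search space over a few core RFR hyperparameters
    param_distributions = {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 20],
        'min_samples_leaf': [1, 2, 4],
    }
    search = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                                param_distributions, n_iter=10, cv=cv,
                                scoring='neg_mean_squared_error',
                                random_state=42)
    search.fit(X, y)
    # The best estimator could then be saved with joblib, as above
    return search.best_estimator_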
def get_iodide_obs(just_use_submitted_data=False,
                   use_Chance2014_core_data=True,
                   analyse_iodide_values2drop=False,
                   process_new_iodide_obs_file=False,
                   file_and_path='./sparse2spatial.rc',
                   limit_depth_to=20,
                   verbose=True, debug=False):
    """
    Extract iodide observations from the (re-formatted) file from Chance2014

    Parameters
    -------
    just_use_submitted_data (bool): just use the data submitted for
        Chance et al 2014
    use_Chance2014_core_data (bool): just use the core data in
        Chance2014's analysis
    analyse_iodide_values2drop (bool): check which values should be removed
    process_new_iodide_obs_file (bool): make a new iodide obs. file?
    file_and_path (str): folder and filename with location settings
        as single str
    limit_depth_to (float): depth (m) to limit inclusion of data to
    verbose (bool): print verbose statements to screen
    debug (bool): print debugging statements to screen

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
    """
    # What is the location of the iodide data?
    folder = utils.get_file_locations('data_root',
                                      file_and_path=file_and_path)
    folder += '/Iodide/inputs/'
    # Name to save file as
    filename = 'Iodide_data_above_20m.csv'
    # - Get metadata (and keep as a separate DataFrame)
    metadata_df = get_iodide_obs_metadata()
    # Process new iodide obs. (data) file?
    if process_new_iodide_obs_file:
        # - Extract data?
        # To test processing... just use submitted data?
        if just_use_submitted_data:
            Data_Keys = metadata_df['Data_Key'][metadata_df['source'] == 's']
            print(Data_Keys)
            # Add BODC data
            bool_ = metadata_df['source'] == 'bodc'
            bodc_Data_Keys = metadata_df['Data_Key'].loc[bool_]
            Data_Keys = list(Data_Keys)
            bodc_Data_Keys = list(bodc_Data_Keys)
            print(bodc_Data_Keys)
            Data_Keys = Data_Keys + bodc_Data_Keys
            print(Data_Keys)
        else:  # Use all data
            Data_Keys = metadata_df['Data_Key']
        # - Loop by the datasets ("Data_Keys")
        # Setup list to store dataframes
        dfs = []
        # Loop data keys for sites
        for n_Data_Key, Data_Key in enumerate(Data_Keys):
            pcent = float(n_Data_Key) / len(Data_Keys) * 100
            if verbose:
                print(n_Data_Key, Data_Key, pcent)
            # Extract data
            df = extract_templated_excel_file(Data_Key=Data_Key,
                                              metadata_df=metadata_df,
                                              limit_depth_to=limit_depth_to)
            # Save to list
            dfs += [df]
        # Combine dataframes
        main_df = pd.concat(dfs)
        # Analyse the datapoints that are being removed
        if analyse_iodide_values2drop:
            # Loop indexes and save out values that are "odd"
            ind2save = []
            tmp_var = 'temp #'
            main_df[tmp_var] = np.arange(main_df.shape[0])
            for ind in main_df[tmp_var].values:
                df_tmp = main_df.loc[main_df[tmp_var] == ind, :]
                try:
                    pd.to_numeric(df_tmp['Iodide'])
                except ValueError:
                    ind2save += [ind]
        # Make sure core values are numeric
        core_numeric_vars = [
            u'Ammonium', u'Chl-a', u'Depth', u'Iodate', u'Iodide',
            u'Latitude', u'Longitude', u'Nitrate', u'Nitrite', u'O2',
            u'Organic-I', u'Salinity', u'Total-I', u'Temperature',
            u'\u03b4Ammonium', u'\u03b4Chl-a', u'\u03b4Iodate',
            u'\u03b4Iodide', u'\u03b4Nitrate', u'\u03b4Nitrite',
            u'\u03b4Org-I', u'\u03b4Total-I'
        ]
        for var in core_numeric_vars:
            main_df[var] = pd.to_numeric(main_df[var].values,
                                         errors='coerce')
        # Save to disk
        main_df.to_csv(folder + filename, encoding='utf-8')
    # - Just use existing file
    else:
        try:
            # Just open existing file
            if use_Chance2014_core_data:
                main_df = get_core_Chance2014_obs()
            else:
                main_df = pd.read_csv(folder + filename, encoding='utf-8')
        except Exception:
            print('Error opening processed iodide data file')
            # Re-raise, as main_df is required below
            raise
    # Return DataFrames
    return main_df, metadata_df
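
# ---------------------------------------------------------------------------
# NOTE: the errors='coerce' option used above silently converts non-numeric
# entries (e.g. detection-limit flags) to NaN rather than raising a
# ValueError. A minimal self-contained example of that behaviour, with
# hypothetical raw values:
def _example_coerce_numeric():
    """Illustrative: how pd.to_numeric(errors='coerce') handles bad entries"""
    vals = ['35.1', '<0.1', '40.2', '-']  # hypothetical raw entries
    # Returns array([35.1, nan, 40.2, nan]) - non-numeric strings become NaN
    return pd.to_numeric(vals, errors='coerce')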