def get_stats_on_spatial_predictions_4x5_2x25(res='4x5', ex_str='',
                                              target='Iodide',
                                              use_annual_mean=True,
                                              filename=None,
                                              folder=None,
                                              just_return_df=False,
                                              var2template='Chance2014_STTxx2_I',
                                              ):
    """
    Evaluate the spatial predictions between models at a resolution of 4x5 or 2x2.5

    Parameters
    -------
    res (str): horizontal resolution of dataset (e.g. 4x5)
    ex_str (str): extra string to include in file name to save data
    target (str): Name of the target variable (e.g. iodide)
    use_annual_mean (bool): use the annual mean of the variable
    filename (str): name of NetCDF file of predicted data (default is built
        from target and res)
    folder (str): folder where NetCDF of predicted data is located (default
        is built from the 'data_root' file location and target)
    just_return_df (bool): just return the data as dataframe
    var2template (str): variable to use a template for making new variables in ds

    Returns
    -------
    (pd.DataFrame) if just_return_df is True, otherwise None (the statistics
    are written to a CSV file in the working directory)

    Notes
    -----
     - Rows of the dataframe are the pd.Series.describe() statistics plus
       'mean (weighted)'; columns are the data variables in the NetCDF file.
    """
    # If filename or folder not given, then use defaults
    if filename is None:
        filename = 'Oi_prj_predicted_{}_{}.nc'.format(target, res)
    if folder is None:
        data_root = utils.get_file_locations('data_root')
        # NOTE(review): the 0.125x0.125 routine in this file uses
        # '<data_root>/outputs/<target>/' instead - confirm which is intended
        folder = '{}/{}/outputs/'.format(data_root, target)
    ds = xr.open_dataset(folder + filename)
    # Variables to consider (listed before the helper adds its extra fields)
    vars2plot = list(ds.data_vars)
    # Add LWI and surface area to array, at the requested resolution
    # (previously res was not forwarded, so the helper's default was used)
    ds = utils.add_LWI2array(ds=ds, res=res, var2template=var2template)
    # The time-mean of the water flag is 1 only for boxes that are always water
    IS_WATER = ds['IS_WATER'].mean(dim='time')
    is_water = (IS_WATER == 1).values
    # Surface area of each grid box. The original code read undefined names
    # ("LWI" and "s_area") here; use the fields held in the dataset instead.
    # Assumes add_LWI2array adds 'AREA' at this resolution - TODO confirm
    area = ds['AREA']
    if 'time' in area.dims:
        area = area.mean(dim='time')
    area = area.values
    # -- get general annual stats and the area weighted mean per variable
    df = pd.DataFrame()
    vals = []
    for var_ in vars2plot:
        da = ds[var_]
        # take annual average
        if use_annual_mean:
            da = da.mean(dim='time')
        values = da.values
        # only consider (100%) water boxes for the general statistics
        df[var_] = pd.Series(values[is_water].flatten()).describe()
        # mask non-water boxes in both the data and the surface area
        # (masked arrays flag the values to *exclude*, hence the inversion)
        arr = np.ma.array(values, mask=~is_water)
        s_area_tmp = np.ma.array(area, mask=~is_water)
        vals.append(AC.get_2D_arr_weighted_by_X(arr, s_area=s_area_tmp))
    # Add area weighted mean to df (as an extra row)
    df = df.T
    df['mean (weighted)'] = vals
    df = df.T
    # Save or just return the values
    if just_return_df:
        return df
    file_save = 'Oi_prj_annual_stats_global_ocean_{}{}.csv'.format(res, ex_str)
    df.T.to_csv(file_save)
def get_stats_on_spatial_predictions_0125x0125(use_annual_mean=True,
                                               target='Iodide',
                                               RFR_dict=None, ex_str='',
                                               just_return_df=False,
                                               folder=None, filename=None,
                                               rm_Skagerrak_data=False,
                                               debug=False,
                                               var2template='Chance2014_STTxx2_I'):
    """
    Evaluate the spatial predictions between models at 0.125x0.125

    Parameters
    -------
    use_annual_mean (bool): use the annual mean of the variable for statistics
    target (str): Name of the target variable (e.g. iodide)
    RFR_dict (dict): passed to get_top_models to select the models shown in
        the formatted table
    ex_str (str): extra string to include in file name to save data
    just_return_df (bool): just return the data as dataframe
    folder (str): folder where NetCDF of predicted data is located
    filename (str): name of NetCDF file of predicted data (default is built
        from target, resolution and rm_Skagerrak_data)
    rm_Skagerrak_data (bool): Remove specific data
        (above argument is a iodide specific option - remove this)
    debug (bool): print out debugging output?
    var2template (str): variable to use a template for making new variables in ds

    Returns
    -------
    (pd.DataFrame) if just_return_df is True, otherwise None

    Notes
    -----
     - When just_return_df is False three files are written to the working
       directory: a CSV of all statistics, a rounded CSV formatted as a table
       and a text file with a short analysis of the core models.
    """
    # ----
    # Get spatial prediction data from NetCDF files saved already
    res = '0.125x0.125'
    if filename is None:
        extr_file_str = '_No_Skagerrak' if rm_Skagerrak_data else ''
        filename = 'Oi_prj_predicted_{}_{}{}.nc'.format(
            target, res, extr_file_str)
    if folder is None:
        data_root = utils.get_file_locations('data_root')
        folder = '{}/outputs/{}/'.format(data_root, target)
    ds = xr.open_dataset(folder + filename)
    # Variables to consider (listed before the helper adds its extra fields)
    vars2analyse = list(ds.data_vars)
    # Add LWI and surface area to array
    ds = utils.add_LWI2array(ds=ds, res=res, var2template=var2template)
    # Set a name for output to saved as
    file_save_str = 'Oi_prj_annual_stats_global_ocean_{}{}'.format(res, ex_str)

    # ---- build an array with general statistics
    # Take annual average over time once (it is costly at this resolution).
    # Previously ds_tmp was left undefined when use_annual_mean was False.
    # NOTE(review): the un-averaged path assumes the weighting helper and the
    # 'AREA' field cope with any time dimension - TODO confirm
    ds_tmp = ds.mean(dim='time') if use_annual_mean else ds
    # Boxes that are (100%) water, and the inverse for use as a masked-array
    # mask (which flags the values to *exclude*)
    is_water = (ds_tmp['IS_WATER'] == 1).values
    not_water = ~is_water
    area = ds_tmp['AREA'].values
    df = pd.DataFrame()
    vals = []
    for var_ in vars2analyse:
        values = ds_tmp[var_].values
        # general stats, only considering water boxes
        df[var_] = pd.Series(values[is_water].flatten()).describe()
        # area weighted mean, masking both the data and the surface area
        arr = np.ma.array(values, mask=not_water)
        s_area_tmp = np.ma.array(area, mask=not_water)
        vals.append(AC.get_2D_arr_weighted_by_X(arr, s_area=s_area_tmp))
    # Add area weighted mean to df (as an extra row)
    df = df.T
    df['mean (weighted)'] = vals
    df = df.T
    # just return the dataframe of global stats
    if just_return_df:
        return df
    # save the values
    df.T.to_csv(file_save_str+'.csv')

    # ---- print out a more formatted version as a table for the paper
    # remove variables
    topmodels = get_top_models(RFR_dict=RFR_dict,
                               vars2exclude=['DOC', 'Prod'])
    params = [
        'Chance2014_STTxx2_I', 'MacDonald2014_iodide', 'Ensemble_Monthly_mean'
    ]
    # select just the models of interest
    df = df[topmodels + params]
    # rename the models
    rename_titles = {u'Chance2014_STTxx2_I': 'Chance et al. (2014)',
                     u'MacDonald2014_iodide': 'MacDonald et al. (2014)',
                     'Ensemble_Monthly_mean': 'RFR(Ensemble)',
                     'Iodide': 'Obs.',
                     }
    # After the transpose the models are the rows and the stats the columns
    df = df.rename(columns=rename_titles).T
    # Sort the dataframe by the mean weighted values
    df = df.sort_values(by=['mean (weighted)'], ascending=False)
    # rename columns (50% to median and ... )
    cols2rename = {'50%': 'median', 'std': 'std. dev.', }
    df = df.rename(columns=cols2rename)
    # Set the stats to use (and their column order)
    first_columns = [
        'mean (weighted)', 'std. dev.', '25%', 'median', '75%', 'max',
    ]
    if debug:
        print(df.head())
    df = df[first_columns]
    # save as CSV
    df.round(1).to_csv(file_save_str+'_FOR_TABLE_'+'.csv')

    # ---- Do some further analysis and save this to a text file
    models2compare = {
        1: u'RFR(Ensemble)',
        2: u'Chance et al. (2014)',
        3: u'MacDonald et al. (2014)',
    }
    if debug:
        print(df.head())
    # Area weighted means of just the core models (a Series indexed by model)
    weighted_means = df.loc[list(models2compare.values()), 'mean (weighted)']
    mean_ = weighted_means.values.mean()
    min_ = weighted_means.values.min()
    max_ = weighted_means.values.max()
    # range in predicted model values
    range_ = max_ - min_
    # % of range in predicted model values ( as an error of model choice... )
    pcents_ = range_ / weighted_means * 100
    # Use a context manager so the file is closed even if a write fails
    with open(file_save_str+'_analysis.txt', 'w') as a:
        # Set a header (previously the builtin "str" was formatted in here)
        print('This file contains global analysis of {} data'.format(target),
              file=a)
        print('\n', file=a)
        # which files are being analysed?
        print('---- Detail on the predicted fields', file=a)
        # What are the core models
        print('Core models being compared are:', file=a)
        ptr_str = 'model {} - {}'
        for key, model in models2compare.items():
            print(ptr_str.format(key, model), file=a)
        print('\n', file=a)
        # Now print analysis on predicted fields
        prt_str = 'avg predicted values = {:.5g} ({:.5g}-{:.5g})'
        print(prt_str.format(mean_, min_, max_), file=a)
        prt_str = 'range of predicted avg values = {:.3g}'
        print(prt_str.format(range_), file=a)
        prt_str = 'As a % this is = {:.3g} ({:.5g}-{:.5g})'
        print(prt_str.format(pcents_.mean(), pcents_.min(), pcents_.max()),
              file=a)