import datetime as dt
import numpy as np
import sys
import argparse
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import calendar
import netCDF4 as ncdf
import copy
import gc
import os

import utils
import set_paths_and_vars
import plot_qc_diagnostics
# assumption: local reader module providing TheDelimitersStd/TheDelimitersExt (used in do_gridding)
import MDS_RWtools as mds

defaults = set_paths_and_vars.set()

OBS_ORDER = utils.make_MetVars(defaults.mdi, multiplier = False)

# what size grid (lat/lon)
DELTA_LAT = 5
DELTA_LON = 5

# module-level constants referenced below but not defined in this section;
# values are assumptions inferred from the in-code comments
N_OBS = 2          # minimum number of days required for a valid pentad (see "KW THIS IS ACTUALLY 2" in do_conversion)
DELTA_HOUR = 3     # 3-hourly time resolution, consistent with "max = 8" 3-hrly values per day in do_gridding
SwitchOutput = 1   # test-mode flag; 1 writes the interim 3hrly and daily files (see do_gridding)

# set up the grid
# KATE modified - flipped the lats to go 90 to -90
grid_lats = np.arange(90 - DELTA_LAT, -90 - DELTA_LAT, -DELTA_LAT)
#grid_lats = np.arange(-90 + DELTA_LAT, 90 + DELTA_LAT, DELTA_LAT)
# end
grid_lons = np.arange(-180 + DELTA_LON, 180 + DELTA_LON, DELTA_LON)
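
# Illustrative sketch (not part of the processing chain): do_conversion() below
# relies on a 365-day year reshaping exactly into 73 pentads of 5 days, with
# masked days reducing the per-pentad day count.  A minimal, self-contained
# demonstration of that reshape/count/mask pattern:
def _pentad_reshape_demo():
    ''' Demo only: pentad means from a masked daily series. '''
    daily = np.ma.array(np.arange(365, dtype = float))
    daily[10:14] = np.ma.masked                # knock out 4 of the 5 days in pentad index 2
    pentads = daily.reshape(-1, 5)             # (73 pentads, 5 days)
    n_days = np.ma.count(pentads, axis = 1)    # days contributing to each pentad
    means = np.ma.mean(pentads, axis = 1)
    means = np.ma.masked_where(n_days < N_OBS, means) # mirror the N_OBS day threshold
    return means, n_days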
# subroutine start
def do_conversion(start_year=defaults.START_YEAR, end_year=defaults.END_YEAR, period="all", doBC=False, doQC=True):
    """
    Convert dailies to pentads 1x1

    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/all?
    :param bool doBC: work on the bias corrected data
    :param bool doQC: incorporate the QC flags or not

    :returns:
    """
    settings = set_paths_and_vars.set(doBC=doBC, doQC=doQC)

    OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)

    for year in np.arange(start_year, end_year + 1):

        # set up empty data array
        all_dailies = np.ma.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.mask = np.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.fill_value = settings.mdi

        all_n_obs = np.zeros([utils.days_in_year(year), len(grid_lats), len(grid_lons)])

        year_start = dt.datetime(year, 1, 1, 0, 0)

        for month in np.arange(12) + 1:
            print year, month

            month_start = utils.day_of_year(year, month)
            month_end = month_start + calendar.monthrange(year, month)[1]

            filename = "{}/{}_1x1_daily_{}{:02d}_{}.nc".format(settings.DATA_LOCATION, settings.OUTROOT, year, month, period)

            ncdf_file = ncdf.Dataset(filename, "r", format="NETCDF4")

            for v, var in enumerate(OBS_ORDER):

                if month == 12:
                    # run to end of year if december
                    all_dailies[v, month_start:, :, :] = ncdf_file.variables[var.name][:]
                else:
                    all_dailies[v, month_start:month_end, :, :] = ncdf_file.variables[var.name][:]

            # now get number of observations
            if month == 12:
                all_n_obs[month_start:, :, :] = ncdf_file.variables["n_obs"][:]
            else:
                all_n_obs[month_start:month_end, :, :] = ncdf_file.variables["n_obs"][:]

        if calendar.isleap(year):
            assert all_dailies.shape[1] == 366

            # extract the 6-day pentad spanning Feb 29th (indices 55-60 = 25 Feb to 1 Mar)
            incl_feb29th = all_dailies[:, 55:61, :, :]

            # remove the data of Feb 29th from array
            # np.ma.delete doesn't exist, so have to copy mask separately
            mask = all_dailies.mask
            all_dailies = np.delete(all_dailies, 59, 1)
            mask = np.delete(mask, 59, 1)
            all_dailies = np.ma.array(all_dailies, mask=mask)
            del mask

            # number of observations
            incl_feb29th_n_obs = all_n_obs[55:61, :, :]
            all_n_obs = np.delete(all_n_obs, 59, 0)
        else:
            assert all_dailies.shape[1] == 365

        shape = all_dailies.shape
        all_dailies = all_dailies.reshape(shape[0], -1, 5, shape[-2], shape[-1])

        n_days_per_pentad = np.ma.count(all_dailies, axis=2)

        if settings.doMedian:
            pentad_grid = utils.bn_median(all_dailies, axis=2)
        else:
            pentad_grid = np.ma.mean(all_dailies, axis=2)

        # clear up memory
        del all_dailies
        gc.collect()

        all_n_obs = all_n_obs.reshape(-1, 5, shape[-2], shape[-1])
        all_n_obs = np.sum(all_n_obs, axis=1)

        pentad_grid.mask[n_days_per_pentad < N_OBS] = True # mask where fewer than N_OBS days have values
        # KW THIS IS ACTUALLY 2 - WHICH I THINK IS GOOD

        # the pentad containing Feb 29th is the 12th in the year (index 11)
        if calendar.isleap(year):
            # overwrite this with the me(di)an of a 6-day pentad
            if settings.doMedian:
                pentad_grid[:, 11, :, :] = utils.bn_median(incl_feb29th, axis=1)
            else:
                pentad_grid[:, 11, :, :] = np.ma.mean(incl_feb29th, axis=1)

            feb_n_days_per_pentad = np.ma.count(incl_feb29th, axis=1)
            pentad_grid.mask[:, 11, :, :][feb_n_days_per_pentad < N_OBS] = True
            n_days_per_pentad[:, 11, :, :] = feb_n_days_per_pentad

            all_n_obs[11, :, :] = np.sum(incl_feb29th_n_obs, axis=0)

            print "processed Feb 29th"

        times = utils.TimeVar("time", "time since 1/1/{} in hours".format(year), "hours", "time")
        times.data = np.arange(0, pentad_grid.shape[1]) * 5 * 24

        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_{}_{}.nc".format(year, period)

        utils.netcdf_write(out_filename, pentad_grid, n_days_per_pentad[0], all_n_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency="P")

        del pentad_grid
        del all_n_obs
        del n_days_per_pentad
        gc.collect()

    return # do_conversion
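
# Quick self-check (sketch, not called by the chain) of the leap-year
# bookkeeping in do_conversion(): Feb 29th is day-of-year 60 (index 59), the
# 6-day window [55:61] runs 25 Feb to 1 Mar, and day index 55 sits in pentad
# index 11 - the pentad that gets overwritten with the 6-day me(di)an.
def _leap_year_indices_demo(year=2000):
    assert calendar.isleap(year)
    feb29 = (dt.datetime(year, 2, 29) - dt.datetime(year, 1, 1)).days
    assert feb29 == 59                      # index deleted by np.delete(..., 59, 1)
    window = [dt.datetime(year, 1, 1) + dt.timedelta(days=d) for d in range(55, 61)]
    assert window[0].strftime("%d %b") == "25 Feb"
    assert window[-1].strftime("%d %b") == "01 Mar"
    assert 55 // 5 == 11                    # first day of the window is in pentad index 11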
# KATE modified
def make_timeseries(suffix="relax", doQC=False, doQC1it=False, doQC2it=False, doQC3it=False, doBC=False, doBCtotal=False, doBChgt=False, doBCscn=False):
#def make_timeseries(suffix = "relax", doQC = False, doBC = False):
# end
    """
    Make the timeseries - plots and netCDF files

    :param str suffix: "relax" or "strict" criteria
    :param bool doQC: incorporate the QC flags or not
# KATE modified
    :param bool doQC1it: incorporate the first iteration QC flags or not
    :param bool doQC2it: incorporate the second iteration QC flags or not
    :param bool doQC3it: incorporate the third iteration QC flags or not
# end
    :param bool doBC: work on the bias corrected data
# KATE modified
    :param bool doBCtotal: work on the total bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data
# end

    :returns:
    """
    # KATE modified
    settings = set_paths_and_vars.set(doBC=doBC, doBCtotal=doBCtotal, doBChgt=doBChgt, doBCscn=doBCscn, doQC=doQC, doQC1it=doQC1it, doQC2it=doQC2it, doQC3it=doQC3it)
    #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)
    # end

    print "Do QC = {}".format(doQC)
    # KATE modified
    print "Do QC1it = {}".format(doQC1it)
    print "Do QC2it = {}".format(doQC2it)
    print "Do QC3it = {}".format(doQC3it)
    # end
    print "Do BC = {}".format(doBC)
    # KATE modified
    print "Do BCtotal = {}".format(doBCtotal)
    print "Do BChgt = {}".format(doBChgt)
    print "Do BCscn = {}".format(doBCscn)
    # end

    # monthly -> annual

    watermarkstring = "/".join(os.getcwd().split("/")[4:]) + "/" + os.path.basename(__file__) + " " + dt.datetime.strftime(dt.datetime.now(), "%d-%b-%Y %H:%M")

    # run on the actuals (which include anomalies from ERA) and the anomalies (calculated from obs-actuals, but also include the anomalies from ERA)
    # KATE modified to add new file name bit '_renorm19812010'
    for version in ["", "_renorm19812010_anomalies"]:
    #for version in ["", "_anomalies"]:
    # end

        if version == "":
            print "5x5 monthly Standard"
        elif version == "_renorm19812010_anomalies":
            print "5x5 monthly Anomalies"

        for period in ["both", "day", "night"]:
            print period

            filename = "{}/{}_5x5_monthly{}_from_daily_{}_{}.nc".format(settings.DATA_LOCATION, settings.OUTROOT, version, period, suffix)
            print filename
            ncdf_file = ncdf.Dataset(filename, "r", format="NETCDF4")

            lat_centres = ncdf_file.variables["latitude"]
            lon_centres = ncdf_file.variables["longitude"]

            n_obs = utils.set_MetVar_attributes("n_obs", "Number of Observations", "Number of Observations", 1, -1, np.dtype("int64"), 0)
            OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)
            OBS_ORDER += [n_obs]

            for v, var in enumerate(OBS_ORDER):
                print var.name

                var.data = ncdf_file.variables[var.name][:]

                # make annual and monthly timeseries

                mesh_lon, mesh_lat = np.meshgrid(lon_centres, lat_centres)
                cosines = np.cos(np.radians(mesh_lat))

                full_cosines = mask_and_normalise_weights(cosines, var.data)
                # masked weights now sum to one for each field

                if var.name == "n_obs":
                    weighted_data = var.data
                else:
                    weighted_data = var.data * full_cosines

                plot_values = np.zeros(weighted_data.shape[0])
                plot_times = []
                for y in range(weighted_data.shape[0]):
                    plot_values[y] = np.ma.sum(weighted_data[y])
                    plot_times += [dt.datetime(settings.START_YEAR + (y / 12), 1 + (y % 12), 1, 0, 0)]

                # plot the monthly data
                plt.clf()
                plt.plot(plot_times, plot_values, "r-", label="Monthly")

                var.mdata = plot_values
                monthly_times = plot_times

                # and annual
                plot_values = plot_values.reshape(-1, 12)

                if var.name != "n_obs":
                    plot_values = np.mean(plot_values, axis=1)
                    plot_times = [dt.datetime(settings.START_YEAR + y, 7, 1) for y in range(plot_values.shape[0])]

                    plt.plot(plot_times, plot_values, "b-", label="Annual")
                    plt.ylabel(var.units)
                else:
                    # if n_obs, then have second y-axis
                    plot_values = np.sum(plot_values, axis=1)
                    plot_times = [dt.datetime(settings.START_YEAR + y, 7, 1) for y in range(plot_values.shape[0])]

                    # finish off first axis
                    ax1 = plt.gca()
                    ax1.set_ylabel("Monthly", color="r")
                    for tl in ax1.get_yticklabels():
                        tl.set_color("r")

                    # add second axis
                    ax2 = ax1.twinx()
                    ax2.plot(plot_times, plot_values, "b-", label="Annual")
                    ax2.set_ylabel("Annual", color="b")
                    for tl in ax2.get_yticklabels():
                        tl.set_color("b")

                var.adata = plot_values
                annual_times = plot_times

                # and prettify the plot
                plt.title(" ".join([x.capitalize() for x in var.name.split("_")]))
                if var.name != "n_obs":
                    plt.legend()
                plt.figtext(0.01, 0.01, watermarkstring, size=6)

                plt.savefig("{}/{}_5x5_monthly{}_from_daily_{}_{}_ts.png".format(settings.PLOT_LOCATION, settings.OUTROOT, version, period, var.name))

            # clean up
            ncdf_file.close()
            del weighted_data
            del full_cosines
            gc.collect()

            # write output files (annual and monthly)
            filename = "{}/{}_5x5_monthly{}_from_daily_{}_{}_ts_annual.nc".format(settings.DATA_LOCATION, settings.OUTROOT, version, period, suffix)
            if os.path.exists(filename):
                os.remove(filename)
            write_ncdf_ts(annual_times, OBS_ORDER, filename, annual=True, do_zip=True)

            filename = "{}/{}_5x5_monthly{}_from_daily_{}_{}_ts_monthly.nc".format(settings.DATA_LOCATION, settings.OUTROOT, version, period, suffix)
            if os.path.exists(filename):
                os.remove(filename)
            write_ncdf_ts(monthly_times, OBS_ORDER, filename, monthly=True, do_zip=True)

            # clean up
            del plot_values
            del plot_times
            del OBS_ORDER
            gc.collect()

    # not activated at present
    pentads = False
    if pentads:
        # pentad -> annual
        OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)

        for v, var in enumerate(OBS_ORDER):
            print var.name

            filename = "{}/{}_1x1_pentads_from_3hrly_{}_{}_{}.nc".format(settings.DATA_LOCATION, settings.OUTROOT, var.name, period, suffix)

            ncdf_file = ncdf.Dataset(filename, "r", format="NETCDF4")

            lat_centres = ncdf_file.variables["latitude"]
            lon_centres = ncdf_file.variables["longitude"]

            data_shape = ncdf_file.variables[var.name][:].shape

            # pentads
            mesh_lon, mesh_lat = np.meshgrid(lon_centres, lat_centres)
            cosines = np.cos(np.radians(mesh_lat))

            plot_values = np.zeros(data_shape[0])
            plot_times = []

            year = copy.deepcopy(settings.START_YEAR)
            for ts in range(data_shape[0]):

                data = ncdf_file.variables[var.name][ts]

                full_cosines = np.ma.array(cosines)
                full_cosines.mask = data.mask
                full_cosines = full_cosines / np.sum(full_cosines)

                weighted_data = data * full_cosines

                plot_values[ts] = np.ma.sum(weighted_data)

                if calendar.isleap(year) and ((ts + 1) * 5) % 365 > 60:
                    # account for 6 day pentad in leap years
                    plot_times += [dt.datetime(year, 1, 1, 0, 0) + dt.timedelta(days=((ts + 1) * 5) % 365 + 1)]
                else:
                    plot_times += [dt.datetime(year, 1, 1, 0, 0) + dt.timedelta(days=((ts + 1) * 5) % 365)]

                print year, ts, plot_times[-1]
                if ((ts + 1) * 5) % 365 == 0:
                    year += 1

            plt.clf()
            plt.plot(plot_times, plot_values, "r-")
            plt.title(var.name)
            plt.ylabel(var.units)

            # annual
            plot_values = plot_values.reshape(-1, 73, data_shape[-2], data_shape[-1])
            plot_values = np.mean(plot_values, axis=1)

            plt.plot(plot_times[36::73], plot_values, "b-")
            plt.savefig("{}/{}_pentads_all.png".format(settings.PLOT_LOCATION, var.name))

            raw_input("check")

    return # make_timeseries
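
# mask_and_normalise_weights() is called in make_timeseries() but defined
# elsewhere in this codebase.  This sketch is an assumption about the
# behaviour the caller relies on ("masked weights now sum to one for each
# field"): broadcast the cos(lat) weights over the time axis, mask them where
# the data are masked, and normalise each time slice so np.ma.sum(weights *
# data) gives an area-weighted mean.  Not the actual implementation.
def _mask_and_normalise_weights_sketch(cosines, data):
    ''' cosines: (lat, lon); data: masked (time, lat, lon); returns weights summing to 1 per time slice '''
    weights = np.ma.array(np.tile(cosines, (data.shape[0], 1, 1)), mask=np.ma.getmaskarray(data))
    totals = weights.reshape(weights.shape[0], -1).sum(axis=1)
    return weights / totals.reshape(-1, 1, 1)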
def combine_files(suffix = "relax", pentads = False, do3hr = False, months = False, daily = False, start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, period = "both", doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False): #def combine_files(suffix = "relax", pentads = False, do3hr = False, months = False, daily = False, start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, period = "both", doQC = False, doBC = False): # end ''' Combine the files, first the pentads 1x1, then the monthlies 5x5 :param str suffix: "relax" or "strict" criteria :param bool pentads: run on pentads :param bool do3hr: run on pentads created from 3hrly data (if False then run on those from daily) :param bool months: run on 5x5 monthly data :param bool daily: run on monthlies created direct from dailies (if False the run on those from 1x1 monthlies) :param int start_year: start year to process :param int end_year: end year to process :param int start_month: start month to process :param int end_month: end month to process :param str period: which period to do day/night/both? :param bool doQC: incorporate the QC flags or not # KATE modified :param bool doQC1it: incorporate the 1st iteration QC flags or not :param bool doQC2it: incorporate the 2nd iteration QC flags or not :param bool doQC3it: incorporate the 3rd iteration QC flags or not # end :param bool doBC: work on the bias corrected data # KATE modified :param bool doBCtotal: work on the bias corrected data :param bool doBChgt: work on the hieght only bias corrected data :param bool doBCscn: work on the screen only bias corrected data # end :returns: ''' # KATE modified settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it) #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC) # end # pentads if pentads: OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier = False) # KW make OBS_ORDER only the actual variables - remove anomalies NEWOBS_ORDER = [] for v, var in enumerate(OBS_ORDER): if "anomalies" not in var.name: NEWOBS_ORDER.append(var) del OBS_ORDER OBS_ORDER = np.copy(NEWOBS_ORDER) del NEWOBS_ORDER # set up the grids DELTA=1 grid_lats = np.arange(-90+DELTA, 90+DELTA, DELTA) grid_lons = np.arange(-180+DELTA, 180+DELTA, DELTA) Nyears = end_year - start_year + 1 # read in each variable - memory issues for v, var in enumerate(OBS_ORDER): print var.name all_pentads = np.ma.zeros((1, Nyears, 73, len(grid_lats), len(grid_lons))) all_pentads.mask = np.ones((1, Nyears, 73, len(grid_lats), len(grid_lons))) all_pentads.fill_value = settings.mdi n_obs = np.zeros((Nyears, 73, len(grid_lats), len(grid_lons))) n_grids = np.zeros((Nyears, 73, len(grid_lats), len(grid_lons))) for y, year in enumerate(np.arange(start_year, end_year + 1)): if do3hr: filename = settings.DATA_LOCATION + "{}_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix) else: filename = settings.DATA_LOCATION + "{}_1x1_pentad_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix) ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4') time = ncdf_file.variables["time"] try: assert time.long_name == "time since 1/1/{} in hours".format(year) except AssertionError: print "time units are not as expected." 
print " expected time since 1/1/{} in hours".format(year) print " got {}".format(time.long_name) sys.exit() all_pentads[0, y, :, :, :] = ncdf_file.variables[var.name][:] n_obs[y, :, :, :] = ncdf_file.variables["n_obs"][:] n_grids[y, :, :, :] = ncdf_file.variables["n_obs"][:] print year if y == 0 and period == "both": lat_centres = ncdf_file.variables["latitude"] # KATE modified - this results in lats that go from 92.5 to -82,5 or 90.5 to -88.5 so I've switched the + for a - latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2. #latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2. # end lon_centres = ncdf_file.variables["longitude"] longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2. ncdf_file.close() all_pentads = all_pentads.reshape(1, -1, len(grid_lats), len(grid_lons)) # sort the times times = utils.TimeVar("time", "time since 1/1/1973 in months", "months", "time") times.data = np.arange(all_pentads.shape[1]) # and write file if do3hr: out_filename = settings.DATA_LOCATION + "{}_1x1_pentads_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, var.name, period, suffix) else: out_filename = settings.DATA_LOCATION + "{}_1x1_pentads_{}_{}_{}.nc".format(settings.OUTROOT, var.name, period, suffix) if period == "both": utils.netcdf_write(out_filename, all_pentads, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "P", single = var) else: utils.netcdf_write(out_filename, all_pentads, n_grids, n_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P", single = var) # Reset the data holding arrays and objects del OBS_ORDER gc.collect() if months: OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier = False) #***************************** # monthlies for y, year in enumerate(np.arange(start_year, end_year + 1)): print year for month in np.arange(start_month, end_month + 1): print " {}".format(month) if daily: filename = settings.DATA_LOCATION + "{}_5x5_monthly_from_daily_{}{:02d}_{}_{}.nc".format(settings.OUTROOT, year, month, period, suffix) else: filename = settings.DATA_LOCATION + "{}_5x5_monthly_{}{:02d}_{}_{}.nc".format(settings.OUTROOT, year, month, period, suffix) ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4') time = ncdf_file.variables["time"] try: assert time.long_name == "time since 1/{}/{} in hours".format(month, year) except AssertionError: print "time units are not as expected." print " expected time since 1/{}/{} in hours".format(month, year) print " got {}".format(time.long_name) sys.exit() for v, var in enumerate(OBS_ORDER): nc_var = ncdf_file.variables[var.name] try: var.data = utils.ma_append(var.data, nc_var[:], axis = 0) if v == 0: n_obs = utils.ma_append(n_obs, ncdf_file.variables["n_obs"][:], axis = 0) n_grids = utils.ma_append(n_grids, ncdf_file.variables["n_grids"][:], axis = 0) except AttributeError: var.data = nc_var[:] var.data.fill_value = nc_var.missing_value if v == 0: n_obs = ncdf_file.variables["n_obs"][:] n_grids = ncdf_file.variables["n_grids"][:] if y == 0 and month == start_month and period == "both": lat_centres = ncdf_file.variables["latitude"] latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2. lon_centres = ncdf_file.variables["longitude"] longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2. 
# KATE modified - added an extra loop so that we can flip the latitudes for day and night too if y == 0 and month == start_month and period != "both": lat_centres = ncdf_file.variables["latitude"] # THIS IS - RATHER THAN + READY TO FLIP THE LATS latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2. lon_centres = ncdf_file.variables["longitude"] longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2. # end ncdf_file.close() # write out into big array for netCDF file all_data = np.ma.zeros((len(OBS_ORDER), var.data.shape[0], var.data.shape[1], var.data.shape[2])) all_data.mask = np.zeros((len(OBS_ORDER), var.data.shape[0], var.data.shape[1], var.data.shape[2])) for v, var in enumerate(OBS_ORDER): all_data[v, :, :, :] = var.data # KATE modified - switching the latitudes on day and night data for consistency with both if period == "day" or period == "night": # invert latitudes latitudes = latitudes[::-1] all_data = all_data[:,:,::-1,:] # variable, time, latitude, longitude # end all_data.fill_value = var.data.fill_value # extra stuff for writing # KATE modified - no longer need grid5 as we're using latitudes and longitudes #DELTA=5 #grid5_lats = np.arange(-90+DELTA, 90+DELTA, DELTA) #grid5_lons = np.arange(-180+DELTA, 180+DELTA, DELTA) # end # KATE modified - START_YEAR not defined, should be start_year times = utils.TimeVar("time", "time since 1/1/{} in months".format(start_year), "months", "time") #times = utils.TimeVar("time", "time since 1/1/{} in months".format(START_YEAR), "months", "time") # end times.data = np.arange(var.data.shape[0]) # and write file if daily: out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_from_daily_{}_{}.nc".format(period, suffix) else: out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_{}_{}.nc".format(period, suffix) # KATE modified - now always using latitudes and longitudes utils.netcdf_write(out_filename, all_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "Y") #if period == "both": # utils.netcdf_write(out_filename, all_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "Y") #else: # utils.netcdf_write(out_filename, all_data, n_grids, n_obs, OBS_ORDER, grid5_lats, grid5_lons, times, frequency = "Y") # end return # combine_files
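
# Sketch of the latitude bookkeeping flagged in the "KATE modified" comments
# above (demo only): when the stored box centres run north-to-south, the
# spacing (centres[1] - centres[0]) is negative, so *subtracting* half of it
# yields the poleward box edges (90 ... -85) rather than values beyond the pole.
def _lat_edge_demo():
    lat_centres = np.arange(87.5, -90.0, -5.0)      # 87.5 ... -87.5, descending
    spacing = lat_centres[1] - lat_centres[0]       # -5.0
    edges = lat_centres - spacing / 2.              # 90.0 ... -85.0
    assert edges[0] == 90.0 and edges[-1] == -85.0
    return edges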
def do_merge(fileroot, mdi, suffix = "relax", clims = False, doMedian = False):
    '''
    Merge the _day and _night files

    Do a np.ma.mean or median for the data and a sum for the n_obs and n_grids

    Output with a _both suffix

    :param str fileroot: root for filenames
    :param flt mdi: missing data indicator
    :param str suffix: "relax" or "strict" criteria
    :param bool clims: if climatologies then don't try and process anomalies
    :param bool doMedian: use the median rather than the mean when merging
    '''
    OBS_ORDER = utils.make_MetVars(mdi, multiplier = False)

    if clims:
        # KW make OBS_ORDER only the actual variables - remove anomalies
        NEWOBS_ORDER = []
        for v, var in enumerate(OBS_ORDER):
            if "anomalies" not in var.name:
                NEWOBS_ORDER.append(var)
        del OBS_ORDER
        OBS_ORDER = np.copy(NEWOBS_ORDER)
        del NEWOBS_ORDER

    # spin through both periods
    for p, period in enumerate(["day", "night"]):
        print period

        # go through the variables
        for v, var in enumerate(OBS_ORDER):
            print " {}".format(var.name)

            ncdf_file = ncdf.Dataset("{}_{}_{}.nc".format(fileroot, period, suffix), 'r', format = 'NETCDF4')

            if v == 0 and p == 0:

                shape = list(ncdf_file.variables[var.name][:].shape)
                shape.insert(0, len(OBS_ORDER) + 2) # add all the variables
                shape.insert(0, 2) # insert extra dimension to allow day + night

                all_data = np.ma.zeros(shape)

                all_data[p, v] = ncdf_file.variables[var.name][:]

                # get lats/lons of box centres
                lat_centres = ncdf_file.variables["latitude"]
                # KATE modified - this results in lats that go from 92.5 to -82.5 so I've switched the + for a -
                latitudes = lat_centres - (lat_centres[1] - lat_centres[0]) / 2.
                #latitudes = lat_centres + (lat_centres[1] - lat_centres[0]) / 2.
                # end
                lon_centres = ncdf_file.variables["longitude"]
                longitudes = lon_centres + (lon_centres[1] - lon_centres[0]) / 2.

                # get times - make a dummy object and then populate attributes
                times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(1, 1973), "hours", "time")
                times.long_name = ncdf_file.variables["time"].long_name
                times.standard_name = ncdf_file.variables["time"].standard_name
                times.units = ncdf_file.variables["time"].units
                times.data = ncdf_file.variables["time"][:]

            else:
                all_data[p, v] = ncdf_file.variables[var.name][:]

        # and get n_obs and n_grids
        all_data[p, -2] = ncdf_file.variables["n_grids"][:]
        all_data[p, -1] = ncdf_file.variables["n_obs"][:]

    # invert latitudes
    latitudes = latitudes[::-1]
    all_data = all_data[:, :, :, ::-1, :]

    # got all the info, now merge
    if doMedian:
        merged_data = utils.bn_median(all_data[:, :len(OBS_ORDER)], axis = 0)
    else:
        merged_data = np.ma.mean(all_data[:, :len(OBS_ORDER)], axis = 0)

    # and process the grids and observations (split off here so have incorporated latitude inversion)
    n_grids = np.ma.sum(all_data[:, -2], axis = 0)
    n_obs = np.ma.sum(all_data[:, -1], axis = 0)
    n_obs.fill_value = -1
    n_grids.fill_value = -1

    # write the output file
    utils.netcdf_write("{}_{}_{}.nc".format(fileroot, "both", suffix), merged_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "P")

    # test distribution of obs with grid boxes
    outfile = file("{}_{}_{}.txt".format(fileroot.split("/")[-1], "both", suffix), "w")
    utils.boxes_with_n_obs(outfile, n_obs, merged_data[0], "")
    outfile.close()

    return # do_merge
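
# Sketch of the merge rule used in do_merge() (demo only): stacking day and
# night on a leading axis and taking np.ma.mean means a box masked in one
# period simply takes the other period's value, while boxes present in both
# are averaged; n_obs and n_grids are summed instead.
def _merge_demo():
    day = np.ma.array([1.0, 2.0, 4.0], mask = [False, False, True])
    night = np.ma.array([3.0, 9.9, 6.0], mask = [False, True, False])
    both = np.ma.mean(np.ma.vstack([day, night]), axis = 0)
    assert both.tolist() == [2.0, 2.0, 6.0]  # averaged, day-only, night-only
    return both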
def do_gridding(suffix = "relax", start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, doSST_SLP = False, doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False, doUncert = False): #def do_gridding(suffix = "relax", start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, doQC = False, doSST_SLP = False, doBC = False, doUncert = False): # end ''' Do the gridding, first to 3hrly 1x1, then to daily 1x1 and finally monthly 1x1 and 5x5 :param str suffix: "relax" or "strict" criteria :param int start_year: start year to process :param int end_year: end year to process :param int start_month: start month to process :param int end_month: end month to process :param bool doQC: incorporate the QC flags or not :param bool doQC1it: incorporate the first iteration (no buddy) QC flags or not :param bool doQC2it: incorporate the second iteration (no buddy) QC flags or not :param bool doQC3it: incorporate the third iteration (buddy) QC flags or not :param bool doSST_SLP: process additional variables or not :param bool doBC: work on the bias corrected data :param bool doBCtotal: work on the full bias corrected data :param bool doBChgt: work on the height only bias corrected data :param bool doBCscn: work on the screen only bias corrected data :param bool doUncert: work on files with uncertainty information (not currently used) :returns: ''' # KATE modified settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it) #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC) # end # KATE modified - added other BC options # if doBC: if doBC | doBCtotal | doBChgt | doBCscn: # end fields = mds.TheDelimitersExt # extended (BC) else: fields = mds.TheDelimitersStd # Standard # KATE modified - added other BC options # OBS_ORDER = utils.make_MetVars(settings.mdi, doSST_SLP = doSST_SLP, multiplier = True, doBC = doBC) # ensure that convert from raw format at writing stage with multiplier OBS_ORDER = utils.make_MetVars(settings.mdi, doSST_SLP = doSST_SLP, multiplier = True, doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn) # ensure that convert from raw format at writing stage with multiplier # end # KW switching between 4 ('_strict') for climatology build and 2 for anomaly buily ('_relax') - added subscripts to files if suffix == "relax": N_OBS_DAY = 2 # KW ok for anomalies but this was meant to be 4 for dailies_all? and 2 for dailies_night/day? 
N_OBS_FRAC_MONTH = 0.3 elif suffix == "strict": N_OBS_DAY = 4 N_OBS_FRAC_MONTH = 0.3 # flags to check on and values to allow through # KATE modified if doQC1it | doQC2it: these_flags = {"ATclim":0,"ATrep":0,"DPTclim":0,"DPTssat":0,"DPTrep":0,"DPTrepsat":0} else: these_flags = {"ATbud":0, "ATclim":0,"ATrep":0,"DPTbud":0,"DPTclim":0,"DPTssat":0,"DPTrep":0,"DPTrepsat":0} #these_flags = {"ATbud":0, "ATclim":0,"ATrep":0,"DPTbud":0,"DPTclim":0,"DPTssat":0,"DPTrep":0,"DPTrepsat":0} # end # spin through years and months to read files for year in np.arange(start_year, end_year + 1): for month in np.arange(start_month, end_month + 1): times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(month, year), "hours", "time") grid_hours = np.arange(0, 24 * calendar.monthrange(year, month)[1], DELTA_HOUR) times.data = grid_hours # process the monthly file # KATE modified - added other BC options # if doBC: if doBC | doBCtotal | doBChgt | doBCscn: # end filename = "new_suite_{}{:02d}_{}_extended.txt".format(year, month, settings.OUTROOT) else: filename = "new_suite_{}{:02d}_{}.txt".format(year, month, settings.OUTROOT) # KATE modified - added other BC options # raw_platform_data, raw_obs, raw_meta, raw_qc = utils.read_qc_data(filename, settings.ICOADS_LOCATION, fields, doBC = doBC) raw_platform_data, raw_obs, raw_meta, raw_qc = utils.read_qc_data(filename, settings.ICOADS_LOCATION, fields, doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn) # end # extract observation details lats, lons, years, months, days, hours = utils.process_platform_obs(raw_platform_data) # test dates *KW - SHOULDN'T NEED THIS - ONLY OBS PASSING DATE CHECK ARE INCLUDED* # *RD* - hasn't run yet but will leave it in just in case of future use. if not utils.check_date(years, year, "years", filename): sys.exit(1) if not utils.check_date(months, month, "months", filename): sys.exit(1) # KATE modified - seems to be an error with missing global name plots so have changed to settings.plots # Choose this one to only output once per decade #if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]): # Choose this one to output a plot for each month if settings.plots: #if plots and (year in [1973, 1983, 1993, 2003, 2013]): # end # plot the distribution of hours import matplotlib.pyplot as plt plt.clf() plt.hist(hours, np.arange(-100,2500,100)) plt.ylabel("Number of observations") plt.xlabel("Hours") plt.xticks(np.arange(-300, 2700, 300)) plt.savefig(settings.PLOT_LOCATION + "obs_distribution_{}{:02d}_{}.png".format(year, month, suffix)) # only for a few of the variables for variable in OBS_ORDER: if variable.name in ["marine_air_temperature", "dew_point_temperature", "specific_humidity", "relative_humidity", "marine_air_temperature_anomalies", "dew_point_temperature_anomalies", "specific_humidity_anomalies", "relative_humidity_anomalies"]: #plot_qc_diagnostics.values_vs_lat(variable, lats, raw_obs[:, variable.column], raw_qc, these_flags, settings.PLOT_LOCATION + "qc_actuals_{}_{}{:02d}_{}.png".format(variable.name, year, month, suffix), multiplier = variable.multiplier, doBC = doBC) plot_qc_diagnostics.values_vs_lat_dist(variable, lats, raw_obs[:, variable.column], raw_qc, these_flags, \ settings.PLOT_LOCATION + "qc_actuals_{}_{}{:02d}_{}.png".format(variable.name, year, month, suffix), multiplier = variable.multiplier, \ # KATE modified - added other BC options doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn) # end # QC sub-selection # KATE modified - added QC iterations but also 
think this needs to include the bias corrected versions because the QC flags need to be applied to those too. # Not sure what was happening previously with the doBC run - any masking to QC'd obs? if doQC | doQC1it | doQC2it | doQC3it | doBC | doBCtotal | doBChgt | doBCscn: #if doQC: # end print "Using {} as flags".format(these_flags) # KATE modified - BC options # mask = utils.process_qc_flags(raw_qc, these_flags, doBC = doBC) mask = utils.process_qc_flags(raw_qc, these_flags, doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn) # end print "All Obs: ",len(mask) print "Good Obs: ",len(mask[np.where(mask == 0)]) print "Bad Obs: ",len(mask[np.where(mask == 1)]) #pdb.set_trace() complete_mask = np.zeros(raw_obs.shape) for i in range(raw_obs.shape[1]): complete_mask[:,i] = mask clean_data = np.ma.masked_array(raw_obs, mask = complete_mask) # end else: print "No QC flags selected" clean_data = np.ma.masked_array(raw_obs, mask = np.zeros(raw_obs.shape)) # discretise hours hours = utils.make_index(hours, DELTA_HOUR, multiplier = 100) # get the hours since start of month hours_since = ((days - 1) * 24) + (hours * DELTA_HOUR) # discretise lats/lons lat_index = utils.make_index(lats, DELTA_LAT, multiplier = 100) lon_index = utils.make_index(lons, DELTA_LON, multiplier = 100) lat_index += ((len(grid_lats)-1)/2) # and as -ve indices are unhelpful, roll by offsetting by most westward lon_index += ((len(grid_lons)-1)/2) # or most southerly so that (0,0) is (-90,-180) # NOTE - ALWAYS GIVING TOP-RIGHT OF BOX TO GIVE < HARD LIMIT (as opposed to <=) # do the gridding # extract the full grid, number of obs, and day/night flag # KATE MEDIAN WATCH This is hard coded to doMedian (rather than settings.doMedian) - OK WITH MEDIAN HERE!!! # KATE modified - to add settings.doMedian instead of just doMedian which seems to be consistent with the other bits and BC options raw_month_grid, raw_month_n_obs, this_month_period = utils.grid_1by1_cam(clean_data, raw_qc, hours_since, lat_index, lon_index, \ grid_hours, grid_lats, grid_lons, OBS_ORDER, settings.mdi, doMedian = settings.doMedian, \ doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn) #raw_month_grid, raw_month_n_obs, this_month_period = utils.grid_1by1_cam(clean_data, raw_qc, hours_since, lat_index, lon_index, grid_hours, grid_lats, grid_lons, OBS_ORDER, settings.mdi, doMedian = True, doBC = doBC) # end print "successfully read data into 1x1 3hrly grids" # create matching array size this_month_period = np.tile(this_month_period, (len(OBS_ORDER),1,1,1)) for period in ["all", "day", "night"]: if period == "day": this_month_grid = np.ma.masked_where(this_month_period == 1, raw_month_grid) this_month_obs = np.ma.masked_where(this_month_period[0] == 1, raw_month_n_obs) # and take first slice to re-match the array size elif period == "night": this_month_grid = np.ma.masked_where(this_month_period == 0, raw_month_grid) this_month_obs = np.ma.masked_where(this_month_period[0] == 0, raw_month_n_obs) # and take first slice to re-match the array size else: this_month_grid = copy.deepcopy(raw_month_grid) this_month_obs = copy.deepcopy(raw_month_n_obs) # KATE modified # If SwitchOutput == 1 then we're in test mode - output interim files!!! if (SwitchOutput == 1): # have one month of gridded data. 
out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_3hr_{}{:02d}_{}_{}.nc".format(year, month, period, suffix) utils.netcdf_write(out_filename, this_month_grid, np.zeros(this_month_obs.shape), this_month_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "H") ## have one month of gridded data. #out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_3hr_{}{:02d}_{}_{}.nc".format(year, month, period, suffix) #utils.netcdf_write(out_filename, this_month_grid, np.zeros(this_month_obs.shape), this_month_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "H") # end # now average over time # Dailies daily_hours = grid_hours.reshape(-1, 24/DELTA_HOUR) shape = this_month_grid.shape this_month_grid = this_month_grid.reshape(shape[0], -1, 24/DELTA_HOUR, shape[2], shape[3]) this_month_obs = this_month_obs.reshape(-1, 24/DELTA_HOUR, shape[2], shape[3]) # KATE MEDIAN WATCH - settings.doMedian is generally set to True - I think we may want the MEAN HERE!!! # KATE modified - to hard wire in MEAN here daily_grid = np.ma.mean(this_month_grid, axis = 2) #if settings.doMedian: # daily_grid = np.ma.median(this_month_grid, axis = 2) #else: # daily_grid = np.ma.mean(this_month_grid, axis = 2) # end daily_grid.fill_value = settings.mdi # filter on number of observations/day n_hrs_per_day = np.ma.count(this_month_grid, axis = 2) n_obs_per_day = np.ma.sum(this_month_obs, axis = 1) if period == "all": bad_locs = np.where(n_hrs_per_day < N_OBS_DAY) # at least 2 of possible 8 3-hourly values (6hrly data *KW OR AT LEAST 4 3HRLY OBS PRESENT*) else: bad_locs = np.where(n_hrs_per_day < np.floor(N_OBS_DAY / 2.)) # at least 1 of possible 8 3-hourly values (6hrly data *KW OR AT LEAST 4 3HRLY OBS PRESENT*) daily_grid.mask[bad_locs] = True # KATE modified - added SwitchOutput to if loop if (SwitchOutput == 1) and settings.plots and (year in [1973, 1983, 1993, 2003, 2013]): #if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]): # end # plot the distribution of hours plt.clf() plt.hist(n_hrs_per_day.reshape(-1), bins = np.arange(-1,10), align = "left", log = True, rwidth=0.5) if period == "all": plt.axvline(x = N_OBS_DAY-0.5, color = "r") else: plt.axvline(x = np.floor(N_OBS_DAY / 2.)-0.5, color = "r") plt.title("Number of 1x1-3hrly in each 1x1-daily grid box") plt.xlabel("Number of 3-hrly observations (max = 8)") plt.ylabel("Frequency (log scale)") plt.savefig(settings.PLOT_LOCATION + "n_grids_1x1_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix)) plt.clf() plt.hist(n_obs_per_day.reshape(-1), bins = np.arange(-5,100,5), log = True, rwidth=0.5) plt.title("Total number of raw observations in each 1x1 daily grid box") plt.xlabel("Number of raw observations") plt.ylabel("Frequency (log scale)") plt.savefig(settings.PLOT_LOCATION + "n_obs_1x1_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix)) # clear up memory del this_month_grid del this_month_obs gc.collect() # KATE modified # If SwitchOutput == 1 then we're in test mode - output interim files!!! 
if (SwitchOutput == 1): # write dailies file times.data = daily_hours[:,0] out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_daily_{}{:02d}_{}_{}.nc".format(year, month, period, suffix) utils.netcdf_write(out_filename, daily_grid, n_hrs_per_day[0], n_obs_per_day, OBS_ORDER, grid_lats, grid_lons, times, frequency = "D") #times.data = daily_hours[:,0] #out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_daily_{}{:02d}_{}_{}.nc".format(year, month, period, suffix) #utils.netcdf_write(out_filename, daily_grid, n_hrs_per_day[0], n_obs_per_day, OBS_ORDER, grid_lats, grid_lons, times, frequency = "D") # end # Monthlies times.data = daily_hours[0,0] # KATE modified - commenting out as we don't need this anymore # if settings.doMedian: # monthly_grid = np.ma.median(daily_grid, axis = 1) # else: # monthly_grid = np.ma.mean(daily_grid, axis = 1) # # monthly_grid.fill_value = settings.mdi # # # filter on number of observations/month # n_grids_per_month = np.ma.count(daily_grid, axis = 1) # bad_locs = np.where(n_grids_per_month < calendar.monthrange(year, month)[1] * N_OBS_FRAC_MONTH) # 30% of possible daily values # monthly_grid.mask[bad_locs] = True # # # number of raw observations # n_obs_per_month = np.ma.sum(n_obs_per_day, axis = 0) # # if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]): # # plot the distribution of days # # plt.clf() # plt.hist(n_obs_per_month.reshape(-1), bins = np.arange(-10,500,10), log = True, rwidth=0.5) # plt.title("Total number of raw observations in each 1x1 monthly grid box") # plt.xlabel("Number of raw observations") # plt.ylabel("Frequency (log scale)") # plt.savefig(settings.PLOT_LOCATION + "n_obs_1x1_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix)) # # plt.clf() # plt.hist(n_grids_per_month[0].reshape(-1), bins = np.arange(-2,40,2), align = "left", log = True, rwidth=0.5) # plt.axvline(x = calendar.monthrange(year, month)[1] * N_OBS_FRAC_MONTH, color="r") # plt.title("Total number of 1x1 daily grids in each 1x1 monthly grid") # plt.xlabel("Number of 1x1 daily grids") # plt.ylabel("Frequency (log scale)") # plt.savefig(settings.PLOT_LOCATION + "n_grids_1x1_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix)) # # # write monthly 1x1 file # out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_monthly_{}{:02d}_{}_{}.nc".format(year, month, period, suffix) # utils.netcdf_write(out_filename, monthly_grid, n_grids_per_month[0], n_obs_per_month, OBS_ORDER, grid_lats, grid_lons, times, frequency = "M") # # # now to re-grid to coarser resolution # # KW # Here we may want to use the mean because its a large area but could be sparsely # # populated with quite different climatologies so we want # # the influence of the outliers (we've done our best to ensure these are good values) # # # go from monthly 1x1 to monthly 5x5 - retained as limited overhead # monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, grid5_lats, grid5_lons = utils.grid_5by5(monthly_grid, n_obs_per_month, grid_lats, grid_lons, doMedian = settings.doMedian, daily = False) # out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_{}{:02d}_{}_{}.nc".format(year, month, period, suffix) # # utils.netcdf_write(out_filename, monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, OBS_ORDER, grid5_lats, grid5_lons, times, frequency = "M") # # if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]): # # plot the distribution of days # # plt.clf() # plt.hist(monthly_5by5_n_obs.reshape(-1), bins = 
np.arange(0,100,5), log = True, rwidth=0.5) # plt.title("Total number of raw observations in each 5x5 monthly grid box") # plt.xlabel("Number of raw observations") # plt.ylabel("Frequency (log scale)") # plt.savefig(settings.PLOT_LOCATION + "n_obs_5x5_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix)) # # plt.clf() # plt.hist(monthly_5by5_n_grids.reshape(-1), bins = np.arange(-2,30,2), align = "left", log = True, rwidth=0.5) # plt.axvline(x = 1, color="r") # plt.title("Total number of 1x1 monthly grids in each 5x5 monthly grid") # plt.xlabel("Number of 1x1 monthly grids") # plt.ylabel("Frequency (log scale)") # plt.savefig(settings.PLOT_LOCATION + "n_grids_5x5_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix)) # # # clear up memory # del monthly_grid # del monthly_5by5 # del monthly_5by5_n_grids # del monthly_5by5_n_obs # del n_grids_per_month # del n_obs_per_month # del n_hrs_per_day # gc.collect() # end # go direct from daily 1x1 to monthly 5x5 # KATE MEDIAN WATCH - settings.doMedian is generally set to True - I think we may want the MEAN HERE!!! # KATE modified - to hard wire in MEAN here monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, grid5_lats, grid5_lons = utils.grid_5by5(daily_grid, n_obs_per_day, grid_lats, grid_lons, doMedian = False, daily = True) #monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, grid5_lats, grid5_lons = utils.grid_5by5(daily_grid, n_obs_per_day, grid_lats, grid_lons, doMedian = settings.doMedian, daily = True) # end out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_from_daily_{}{:02d}_{}_{}.nc".format(year, month, period, suffix) utils.netcdf_write(out_filename, monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, OBS_ORDER, grid5_lats, grid5_lons, times, frequency = "M") if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]): # plot the distribution of days plt.clf() plt.hist(monthly_5by5_n_obs.reshape(-1), bins = np.arange(-10,1000,10), log = True, rwidth=0.5) plt.title("Total number of raw observations in each 5x5 monthly grid box") plt.xlabel("Number of raw observations") plt.ylabel("Frequency (log scale)") plt.savefig(settings.PLOT_LOCATION + "n_obs_5x5_monthly_from_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix)) plt.clf() plt.hist(monthly_5by5_n_grids.reshape(-1), bins = np.arange(-5,100,5), align = "left", log = True, rwidth=0.5) plt.axvline(x = (0.3 * daily_grid.shape[0]), color="r") plt.title("Total number of 1x1 daily grids in each 5x5 monthly grid") plt.xlabel("Number of 1x1 daily grids") plt.ylabel("Frequency (log scale)") plt.savefig(settings.PLOT_LOCATION + "n_grids_5x5_monthly_from_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix)) del daily_grid del monthly_5by5 del n_obs_per_day del monthly_5by5_n_grids del monthly_5by5_n_obs gc.collect() return # do_gridding
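
#************************************************************************
if __name__ == "__main__":

    # A minimal sketch of a command-line entry point (the original driver is
    # not included in this section, so the flag names and the subset of
    # options exposed here are assumptions).
    parser = argparse.ArgumentParser()
    parser.add_argument('--suffix', dest='suffix', action='store', default="relax",
                        help='"relax" or "strict" criteria, default = relax')
    parser.add_argument('--start_year', dest='start_year', action='store', type=int,
                        default=defaults.START_YEAR, help='start year, default = {}'.format(defaults.START_YEAR))
    parser.add_argument('--end_year', dest='end_year', action='store', type=int,
                        default=defaults.END_YEAR, help='end year, default = {}'.format(defaults.END_YEAR))
    parser.add_argument('--doQC', dest='doQC', action='store_true', default=False,
                        help='incorporate the QC flags, default = False')
    parser.add_argument('--doBC', dest='doBC', action='store_true', default=False,
                        help='work on the bias corrected data, default = False')
    args = parser.parse_args()

    do_gridding(suffix=str(args.suffix), start_year=int(args.start_year),
                end_year=int(args.end_year), doQC=args.doQC, doBC=args.doBC)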