def do_conversion(start_year=defaults.START_YEAR, end_year=defaults.END_YEAR, period="all", doBC=False, doQC=True):
    """
    Convert dailies to pentads 1x1

    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/all?
    :param bool doBC: work on the bias corrected data
    :param bool doQC: incorporate the QC flags or not

    :returns:
    """
    settings = set_paths_and_vars.set(doBC=doBC, doQC=doQC)

    OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)

    for year in np.arange(start_year, end_year + 1):

        # set up empty data array
        all_dailies = np.ma.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.mask = np.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.fill_value = settings.mdi

        all_n_obs = np.zeros([utils.days_in_year(year), len(grid_lats), len(grid_lons)])

        year_start = dt.datetime(year, 1, 1, 0, 0)

        for month in np.arange(12) + 1:
            print year, month

            month_start = utils.day_of_year(year, month)
            month_end = month_start + calendar.monthrange(year, month)[1]

            filename = "{}/{}_1x1_daily_{}{:02d}_{}.nc".format(settings.DATA_LOCATION, settings.OUTROOT, year, month, period)

            ncdf_file = ncdf.Dataset(filename, "r", format="NETCDF4")

            for v, var in enumerate(OBS_ORDER):

                if month == 12:
                    # run to end of year if December
                    all_dailies[v, month_start:, :, :] = ncdf_file.variables[var.name][:]
                else:
                    all_dailies[v, month_start:month_end, :, :] = ncdf_file.variables[var.name][:]

            # now get number of observations
            if month == 12:
                all_n_obs[month_start:, :, :] = ncdf_file.variables["n_obs"][:]
            else:
                all_n_obs[month_start:month_end, :, :] = ncdf_file.variables["n_obs"][:]

        if calendar.isleap(year):
            assert all_dailies.shape[1] == 366

            # extract 6-day pentad
            incl_feb29th = all_dailies[:, 55:61, :, :]

            # remove the data of Feb 29th from array
            # np.ma.delete doesn't exist, so have to copy mask separately
            mask = all_dailies.mask
            all_dailies = np.delete(all_dailies, 59, 1)
            mask = np.delete(mask, 59, 1)
            all_dailies = np.ma.array(all_dailies, mask=mask)
            del mask

            # number of observations
            incl_feb29th_n_obs = all_n_obs[55:61, :, :]
            all_n_obs = np.delete(all_n_obs, 59, 0)
        else:
            assert all_dailies.shape[1] == 365

        shape = all_dailies.shape
        all_dailies = all_dailies.reshape(shape[0], -1, 5, shape[-2], shape[-1])

        n_days_per_pentad = np.ma.count(all_dailies, axis=2)

        if settings.doMedian:
            pentad_grid = utils.bn_median(all_dailies, axis=2)
        else:
            pentad_grid = np.ma.mean(all_dailies, axis=2)

        # clear up memory
        del all_dailies
        gc.collect()

        all_n_obs = all_n_obs.reshape(-1, 5, shape[-2], shape[-1])
        all_n_obs = np.sum(all_n_obs, axis=1)

        pentad_grid.mask[n_days_per_pentad < N_OBS] = True  # mask where fewer than 2 days have values
        # KW THIS IS ACTUALLY 2 - WHICH I THINK IS GOOD

        # the pentad containing Feb 29th is the 12th in the year (index 11)
        if calendar.isleap(year):
            # overwrite this with the me(di)an of a 6-day pentad
            if settings.doMedian:
                pentad_grid[:, 11, :, :] = utils.bn_median(incl_feb29th, axis=1)
            else:
                pentad_grid[:, 11, :, :] = np.ma.mean(incl_feb29th, axis=1)

            feb_n_days_per_pentad = np.ma.count(incl_feb29th, axis=1)
            pentad_grid.mask[:, 11, :, :][feb_n_days_per_pentad < N_OBS] = True
            n_days_per_pentad[:, 11, :, :] = feb_n_days_per_pentad

            all_n_obs[11, :, :] = np.sum(incl_feb29th_n_obs, axis=0)

            print "processed Feb 29th"

        times = utils.TimeVar("time", "time since 1/1/{} in hours".format(year), "hours", "time")
        times.data = np.arange(0, pentad_grid.shape[1]) * 5 * 24

        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_{}_{}.nc".format(year, period)

        utils.netcdf_write(out_filename, pentad_grid, n_days_per_pentad[0], all_n_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency="P")

        del pentad_grid
        del all_n_obs
        del n_days_per_pentad
        gc.collect()

    return # do_conversion
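# ---------------------------------------------------------------------------
# A minimal standalone sketch (illustrative only - the helper name and dummy
# inputs are assumptions, not part of the processing chain; the module-level
# numpy import is assumed) of the leap-year pentad collapse used above:
# cut Feb 29th (day index 59) out of the daily series, reshape 365 days to
# (73, 5), average each pentad, then refill pentad 12 (index 11) from the
# 6-day slice that includes Feb 29th.
def _pentad_collapse_sketch(dailies, n_obs_min=2):
    # dailies: 1-D masked array of 365 (or 366) daily values
    incl_feb29th = None
    if dailies.shape[0] == 366:
        incl_feb29th = dailies[55:61]  # 6-day "pentad" including Feb 29th
        # np.ma.delete doesn't exist, so data and mask are cut separately
        mask = np.delete(np.ma.getmaskarray(dailies), 59)
        dailies = np.ma.array(np.delete(dailies.data, 59), mask=mask)

    pentads = dailies.reshape(73, 5)
    n_days = np.ma.count(pentads, axis=1)  # days present in each pentad
    means = np.ma.mean(pentads, axis=1)
    means = np.ma.masked_where(n_days < n_obs_min, means)  # mirror the N_OBS cut

    if incl_feb29th is not None:
        # overwrite pentad 12 with the mean of the 6-day slice
        means[11] = np.ma.mean(incl_feb29th)
        if np.ma.count(incl_feb29th) < n_obs_min:
            means[11] = np.ma.masked
    return means

# e.g. _pentad_collapse_sketch(np.ma.masked_invalid(np.random.rand(366)))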
def do_merge(fileroot, mdi, suffix="relax", clims=False, doMedian=False):
    '''
    Merge the _day and _night files

    Do a np.ma.mean or median for the data and a sum for the n_obs and n_grids
    Output with a _both suffix

    :param str fileroot: root for filenames
    :param flt mdi: missing data indicator
    :param str suffix: "relax" or "strict" criteria
    :param bool clims: if climatologies then don't try and process anomalies
    :param bool doMedian: use the median rather than the mean when merging
    '''
    OBS_ORDER = utils.make_MetVars(mdi, multiplier=False)

    if clims:
        # KW make OBS_ORDER only the actual variables - remove anomalies
        NEWOBS_ORDER = []
        for v, var in enumerate(OBS_ORDER):
            if "anomalies" not in var.name:
                NEWOBS_ORDER.append(var)
        del OBS_ORDER
        OBS_ORDER = np.copy(NEWOBS_ORDER)
        del NEWOBS_ORDER

    # spin through both periods
    for p, period in enumerate(["day", "night"]):
        print period

        # go through the variables
        for v, var in enumerate(OBS_ORDER):
            print "  {}".format(var.name)

            ncdf_file = ncdf.Dataset("{}_{}_{}.nc".format(fileroot, period, suffix), 'r', format='NETCDF4')

            if v == 0 and p == 0:
                shape = list(ncdf_file.variables[var.name][:].shape)
                shape.insert(0, len(OBS_ORDER) + 2)  # add all the variables
                shape.insert(0, 2)  # insert extra dimension to allow day + night

                all_data = np.ma.zeros(shape)
                all_data[p, v] = ncdf_file.variables[var.name][:]

                # get lats/lons of box centres
                lat_centres = ncdf_file.variables["latitude"][:]
                # KATE modified - this results in lats that go from 92.5 to -82.5 so I've switched the + for a -
                latitudes = lat_centres - (lat_centres[1] - lat_centres[0]) / 2.
                #latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2.
                # end
                lon_centres = ncdf_file.variables["longitude"][:]
                longitudes = lon_centres + (lon_centres[1] - lon_centres[0]) / 2.

                # get times - make a dummy object and then populate attributes
                times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(1, 1973), "hours", "time")
                times.long_name = ncdf_file.variables["time"].long_name
                times.standard_name = ncdf_file.variables["time"].standard_name
                times.units = ncdf_file.variables["time"].units
                times.data = ncdf_file.variables["time"][:]
            else:
                all_data[p, v] = ncdf_file.variables[var.name][:]

        # and get n_obs and n_grids
        all_data[p, -2] = ncdf_file.variables["n_grids"][:]
        all_data[p, -1] = ncdf_file.variables["n_obs"][:]

    # invert latitudes
    latitudes = latitudes[::-1]
    all_data = all_data[:, :, :, ::-1, :]

    # got all the info, now merge
    if doMedian:
        merged_data = utils.bn_median(all_data[:, :len(OBS_ORDER)], axis=0)
    else:
        merged_data = np.ma.mean(all_data[:, :len(OBS_ORDER)], axis=0)

    # and process the grids and observations (split off here so have incorporated latitude inversion)
    n_grids = np.ma.sum(all_data[:, -2], axis=0)
    n_obs = np.ma.sum(all_data[:, -1], axis=0)
    n_obs.fill_value = -1
    n_grids.fill_value = -1

    # write the output file
    utils.netcdf_write("{}_{}_{}.nc".format(fileroot, "both", suffix), merged_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency="P")

    # test distribution of obs with grid boxes
    outfile = open("{}_{}_{}.txt".format(fileroot.split("/")[-1], "both", suffix), "w")
    utils.boxes_with_n_obs(outfile, n_obs, merged_data[0], "")

    return # do_merge
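# ---------------------------------------------------------------------------
# A minimal sketch of the day/night merge above (illustrative only - the
# helper and argument names are assumptions).  Cells observed in only one
# period keep that period's value, since np.ma.mean ignores masked entries;
# observation counts are summed rather than averaged.
def _merge_sketch(day_field, night_field, day_n_obs, night_n_obs):
    # stack the two periods on a new leading axis: 2 x lat x lon
    stacked = np.ma.concatenate([day_field[np.newaxis], night_field[np.newaxis]], axis=0)
    merged = np.ma.mean(stacked, axis=0)

    counts = np.ma.concatenate([day_n_obs[np.newaxis], night_n_obs[np.newaxis]], axis=0)
    n_obs = np.ma.sum(counts, axis=0)
    return merged, n_obs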
def calculate_climatology(suffix="relax", start_year=1981, end_year=2010, period="both", do3hr=False, doQC=False, doBC=False):
    '''
    Make 1x1 pentad climatology

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/both?
    :param bool do3hr: run on 3hr --> pentad data
    :param bool doQC: incorporate the QC flags or not
    :param bool doBC: work on the bias corrected data

    :returns:
    '''
    settings = set_paths_and_vars.set(doBC=doBC, doQC=doQC)

    OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)

    if suffix == "relax":
        N_YEARS_PRESENT = 10  # number of years present to calculate climatology
    elif suffix == "strict":
        N_YEARS_PRESENT = 15  # number of years present to calculate climatology

    print "Do 3hrly: {}".format(do3hr)

    N_YEARS = end_year - start_year + 1

    # read in each variable - memory issues
    all_clims = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    # KW - why set up as np.ones?
    all_clims.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])

    all_stds = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    all_stds.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])

    # KW no mask??? I've set one with fill_value as -1 - should the mask be .zeros or .ones though?
    all_n_obs = np.ma.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
    all_n_obs.mask = np.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
    all_n_obs.fill_value = -1

    for v, var in enumerate(OBS_ORDER):
        print var.name

        # number of pentads = 365/5 = 73
        # set up empty data array
        all_pentads = np.ma.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
        # sets up a mask of 'False' = not masked!
        all_pentads.mask = np.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
        all_pentads.fill_value = settings.mdi

        # read in relevant years
        for y, year in enumerate(np.arange(start_year, end_year + 1)):
            print year

            if do3hr:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)
            else:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)

            ncdf_file = ncdf.Dataset(filename, 'r', format='NETCDF4')

            all_pentads[y, :, :, :] = ncdf_file.variables[var.name][:]

            if v == 0:
                all_n_obs[y, :, :, :] = ncdf_file.variables["n_obs"][:]

        # years x pentads x lats x lons
        n_grids = np.ma.count(all_pentads, axis=0)

        # collapse down the years
        if settings.doMedian:
            all_clims[v, :, :, :] = utils.bn_median(all_pentads, axis=0)
        else:
            all_clims[v, :, :, :] = np.ma.mean(all_pentads, axis=0)

        all_stds[v, :, :, :] = np.ma.std(all_pentads, axis=0)

        # mask where fewer than 50% of years have data
        locs = np.ma.where(n_grids < N_YEARS_PRESENT)
        all_clims[v, :, :, :].mask[locs] = True
        # KW should probably mask stdev too - although unmasked it does show the potential coverage
        all_stds[v, :, :, :].mask[locs] = True

        if settings.plots and v == 0:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.hist(n_grids.reshape(-1), bins=np.arange(-1, 32), align="left", log=True, rwidth=0.5)
            plt.axvline(x=N_YEARS_PRESENT - 0.5, color="r")
            plt.title("Number of years present in each pentad")
            plt.xlabel("Number of years (max = 30)")
            plt.ylabel("Frequency (log scale)")
            plt.savefig(settings.PLOT_LOCATION + "pentad_clims_n_years_{}_{}_{}.png".format(year, period, suffix))

    # now process number of observations
    # (KW all_n_obs wasn't a masked array - so have set it up as one - BUT not really
    # convinced this is working as it should.  No import numpy.ma?)
    all_obs = np.ma.sum(all_n_obs, axis=0)

    # set up time array
    times = utils.TimeVar("time", "time since 1/1/1 in days", "days", "time")
    times.data = np.arange(0, 73) * 5

    # write files
    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency="P")

    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency="P")

    # test distribution of obs with grid boxes
    if do3hr:
        outfile = open(settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.txt".format(period, suffix), "w")
    else:
        outfile = open(settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.txt".format(period, suffix), "w")

    utils.boxes_with_n_obs(outfile, all_obs, all_clims[0], N_YEARS_PRESENT)

    return # calculate_climatology
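# ---------------------------------------------------------------------------
# A minimal sketch of the year-count masking above (illustrative only - the
# helper name is an assumption).  Boxes where fewer than N_YEARS_PRESENT
# years contribute are masked in both the climatology and the stdev.
def _clim_mask_sketch(all_pentads, n_years_present=10):
    # all_pentads: masked array, years x pentads x lat x lon
    n_grids = np.ma.count(all_pentads, axis=0)  # years present per box
    clim = np.ma.masked_where(n_grids < n_years_present, np.ma.mean(all_pentads, axis=0))
    stdev = np.ma.masked_where(n_grids < n_years_present, np.ma.std(all_pentads, axis=0))
    return clim, stdev, n_grids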
def do_conversion(suffix = "relax", start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, period = "all", doQC = False, doBC = False): ''' Convert 3 hrlies to pentads 1x1 First get pentad average of 3hrly values (so values at 0, 3, 6, ... averaged over 5 days) Then get average over the pentad. :param str suffix: "relax" or "strict" criteria :param int start_year: start year to process :param int end_year: end year to process :param str period: which period to do day/night/all? :param bool doQC: incorporate the QC flags or not :param bool doBC: work on the bias corrected data :returns: ''' settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC) # KW Added SUFFIX variable because all hourlies/dailies/monthlies now have suffix 'strict' (4/2 per daily/day-night) # or 'relax' (2/1 per daily/day-night) if suffix == "relax": N_OBS_OVER_DAYS = 1 # at least 1 obs at this 3 hr timestamp from 5 days in pentad N_OBS_OVER_PENTAD = 2 elif suffix == "strict": N_OBS_OVER_DAYS = 2 N_OBS_OVER_PENTAD = 4 # at least 4 timestamps (of 8) in pentad, could be 2 for local 'relax' setting N_OBS_PER_DAY = 24/DELTA_HOUR for year in np.arange(start_year, end_year + 1): all_pentads = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)]) all_pentads.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)]) # read in a years worth of 3hrly data for v, var in enumerate(OBS_ORDER): # arrays too massive to process all variables at once. print var.name var_3hrlys = read_data(settings, suffix, var.name, year, grid_lats, grid_lons, period, N_OBS_PER_DAY) # reshape to days x 3hrly obs (365(366),8,180,360) var_3hrlys = var_3hrlys.reshape(-1, N_OBS_PER_DAY, var_3hrlys.shape[1], var_3hrlys.shape[2]) # process the leap-year if appropriate if calendar.isleap(year): var_3hrlys, incl_feb29th = process_february(var_3hrlys, doMask = True) else: assert var_3hrlys.shape[0] == 365 # get pentadly values for each timestep (73,5,8,180,360) shape = var_3hrlys.shape var_3hrlys = var_3hrlys.reshape(-1, 5, shape[-3], shape[-2], shape[-1]) # n_pentads x days x hrs x lat x lon n_days_per_timestamp = np.ma.count(var_3hrlys, axis = 1) # n_pentads x hrs x lat x lon # get average at each timestamp across the pentad - so have N_OBS_PER_DAY averaged values per pentad if settings.doMedian: pentad_3hrly_grid = utils.bn_median(var_3hrlys, axis = 1) # n_pentads x hrs x lat x lon else: pentad_3hrly_grid = np.ma.mean(var_3hrlys, axis = 1) # n_pentads x hrs x lat x lon pentad_3hrly_grid.mask[n_days_per_timestamp < N_OBS_OVER_DAYS] = True # mask where fewer than N_OBS_OVER_DAYS days have values # clear up memory del var_3hrlys gc.collect() # the pentad containing feb 29th is the 11th in the year (KW actually its the 12th, so the 11 in array speak which is what you have done) if calendar.isleap(year): # overwrite this with the me(di)an of a 6-day pentad if settings.doMedian: pentad_3hrly_grid[11, :, :, :] = utils.bn_median(incl_feb29th, axis = 0) else: pentad_3hrly_grid[11, :, :, :] = np.ma.mean(incl_feb29th, axis = 0) feb_n_days_per_timestamp = np.ma.count(incl_feb29th, axis = 0) pentad_3hrly_grid.mask[11, :, :, :][feb_n_days_per_timestamp < N_OBS_OVER_DAYS] = True n_days_per_timestamp[11, :, :, :] = feb_n_days_per_timestamp print "processed Feb 29th" if settings.plots and v == 0: import matplotlib.pyplot as plt plt.clf() plt.hist(n_days_per_timestamp.reshape(-1), bins = np.arange(-1,7), align = "left", log = True, rwidth=0.5) plt.axvline(x = N_OBS_OVER_DAYS-0.5, color = "r") plt.title("Number of days with obs at each 3hrly 
timestamp (over entire year)") plt.xlabel("Number of days (max = 5)") plt.ylabel("Frequency (log scale)") plt.savefig(settings.PLOT_LOCATION + "pentads_n_days_{}_{}_{}.png".format(year, period, suffix)) # get single pentad values n_hrs_per_pentad = np.ma.count(pentad_3hrly_grid, axis = 1) # get the number of pentad-hours present in each pentad n_grids_per_pentad = np.sum(n_days_per_timestamp, axis = 1) # get the number of 3hrly 1x1 grids included per pentad 1x1 # get average at each timestamp across the pentad - so have N_OBS_PER_DAY values per pentad if settings.doMedian: pentad_grid = utils.bn_median(pentad_3hrly_grid, axis = 1) else: pentad_grid = np.ma.mean(pentad_3hrly_grid, axis = 1) if period == "all": # KW are you sure this should be n_hrs_per_pentad and not n_grids_per_pentad here? I think it should pentad_grid.mask[n_hrs_per_pentad < N_OBS_OVER_PENTAD] = True # mask where fewer than N_OBS_OVER_PENTAD hours have values else: # KW are you sure this should be n_hrs_per_pentad and not n_grids_per_pentad here? I think it should pentad_grid.mask[n_hrs_per_pentad < (N_OBS_OVER_PENTAD/2.)] = True # mask where fewer than N_OBS_OVER_PENTAD hours have values all_pentads[v, :, :, :] = pentad_grid # diagnostics plots of obs/grids per pentad if settings.plots and v == 0: plt.clf() plt.hist(n_hrs_per_pentad.reshape(-1), bins = np.arange(-1,10), align = "left", log = True, rwidth=0.5) if period == "all": plt.axvline(x = N_OBS_OVER_PENTAD-0.5, color = "r") else: plt.axvline(x = (N_OBS_OVER_PENTAD/2.)-0.5, color = "r") plt.title("Number of hrs with obs in each pentad (over entire year)") plt.xlabel("Number of days (max = 8)") plt.ylabel("Frequency (log scale)") plt.savefig(settings.PLOT_LOCATION + "pentads_n_hrs_{}_{}_{}.png".format(year, period, suffix)) # clear up memory del pentad_3hrly_grid del pentad_grid gc.collect() # done all main variables. Now for number of observations print "n_obs" n_obs = read_data(settings, suffix, "n_obs", year, grid_lats, grid_lons, period, N_OBS_PER_DAY) # KW so we've gone from 8*365hrs,lats,lons to 365,8,lats,lons n_obs = n_obs.reshape(-1, N_OBS_PER_DAY, n_obs.shape[1], n_obs.shape[2]) if calendar.isleap(year): n_obs, incl_feb29th = process_february(n_obs, doMask = True) else: assert n_obs.shape[0] == 365 shape = n_obs.shape # KW so we're now at pentads, 5days, 8hours, lats, lons n_obs = n_obs.reshape(-1, 5, shape[-3], shape[-2], shape[-1]) # pentads x days x hours x lat x lon # KW This should sum over the 5days leaving pentads, 8hrs, lats, lons # n_obs has -1 as missing data!!! So sum will not work properly # set up fill_value as -1 n_obs.fill_value = -1 n_obs_per_3hrly_pentad = np.ma.sum(n_obs, axis = 1) n_obs_per_3hrly_pentad.fill_value = -1 if calendar.isleap(year): n_obs_per_3hrly_pentad[11, :, :, :] = np.ma.sum(incl_feb29th, axis = 0) n_obs_per_pentad = np.ma.sum(n_obs_per_3hrly_pentad, axis = 1) # and write out times = utils.TimeVar("time", "time since 1/1/{} in hours".format(year), "hours", "time") times.data = np.arange(0, all_pentads.shape[1]) * 5 * 24 out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(year, period, suffix) utils.netcdf_write(out_filename, all_pentads, n_grids_per_pentad, n_obs_per_pentad, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P") return # do_conversion
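# ---------------------------------------------------------------------------
# A minimal sketch of the two-stage collapse above (illustrative only - the
# helper name is an assumption).  A (365, 8, lat, lon) 3hrly field reshapes
# to (73, 5, 8, lat, lon); each 3hrly timestamp is first averaged over the
# 5 days of the pentad, then the (up to 8) timestamp means are averaged into
# one pentad value.  Default thresholds mirror the 'relax' settings.
def _two_stage_sketch(var_3hrly, n_obs_over_days=1, n_obs_over_pentad=2):
    # var_3hrly: masked array, days (365) x timestamps (8) x lat x lon
    s = var_3hrly.shape
    pentads = var_3hrly.reshape(-1, 5, s[-3], s[-2], s[-1])  # 73 x 5 x 8 x lat x lon

    # stage 1: average each 3hrly timestamp over the 5 days of the pentad
    n_days = np.ma.count(pentads, axis=1)  # 73 x 8 x lat x lon
    stage1 = np.ma.mean(pentads, axis=1)
    stage1 = np.ma.masked_where(n_days < n_obs_over_days, stage1)

    # stage 2: average the surviving timestamp means into one pentad value
    n_hrs = np.ma.count(stage1, axis=1)  # 73 x lat x lon
    pentad = np.ma.mean(stage1, axis=1)
    pentad = np.ma.masked_where(n_hrs < n_obs_over_pentad, pentad)
    return pentad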