def apply_climatology(suffix = "relax", period = "both", daily = False, 
                      doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, 
                      doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False):
                      #doQC = False, doBC = False):
# end
    '''
    Apply monthly 5x5 climatology

    :param str suffix: "relax" or "strict" criteria
    :param str period: which period to do day/night/both?
    :param bool daily: run in 1x1 daily --> 5x5 monthly data
    :param bool doQC: incorporate the QC flags or not
# KATE modified
    :param bool doQC1it: incorporate the 1st iteration QC flags or not
    :param bool doQC2it: incorporate the 2nd iteration QC flags or not
    :param bool doQC3it: incorporate the 3rd iteration QC flags or not
# end
    :param bool doBC: work on the bias corrected data
# KATE modified
    :param bool doBCtotal: work on the full bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data
# end

    :returns:
    '''
# KATE modified
    settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it)
    #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)
# end

    if suffix == "relax":
        N_YEARS_PRESENT = 10 # number of years present to calculate climatology
    elif suffix == "strict":
        N_YEARS_PRESENT = 15 # number of years present to calculate climatology


    print "Do daily: {}".format(daily)

    # set filenames
    if daily:
        climfilename = settings.DATA_LOCATION + "{}_5x5_monthly_climatology_from_daily_{}_{}.nc".format(settings.OUTROOT, period, suffix)
        obsfilename = settings.DATA_LOCATION + "{}_5x5_monthly_from_daily_{}_{}.nc".format(settings.OUTROOT, period, suffix)
    else:
        climfilename = settings.DATA_LOCATION + "{}_5x5_monthly_climatology_{}_{}.nc".format(settings.OUTROOT, period, suffix)
        obsfilename = settings.DATA_LOCATION + "{}_5x5_monthly_from_daily_{}_{}.nc".format(settings.OUTROOT, period, suffix)

    # load netCDF files
    clim_file = ncdf.Dataset(climfilename,'r', format='NETCDF4')
    obs_file = ncdf.Dataset(obsfilename,'r', format='NETCDF4')

    # simple - use a list and append
    all_anoms = []

    # spin through all variables
    for v, var in enumerate(OBS_ORDER):
        print var.name
        
        obs = obs_file.variables[var.name][:]
        clims = clim_file.variables[var.name][:]

        anomalies = obs - np.tile(clims, (obs.shape[0] // 12, 1, 1)) # make to same shape (integer repetition count)
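        # e.g. a 480-month record tiles the (12, n_lats, n_lons) climatology 40 times,
        # so each month is differenced against its own calendar-month climatology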

        all_anoms += [anomalies]

    # finished - convert list to array
    all_anoms = np.ma.array(all_anoms)

    # extract remaining information to copy across
    n_obs = obs_file.variables["n_obs"][:]
    n_grids = obs_file.variables["n_grids"][:]

    # set up the time object and axis
    intimes = obs_file.variables["time"]
    times = utils.TimeVar("time", intimes.long_name, intimes.units, intimes.standard_name)
    times.data = intimes[:]

    # write file
    if daily:
# KATE modified - added renorm19812010 to the filename
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_renorm19812010_anomalies_from_daily_{}_{}.nc".format(period, suffix)
        #out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_anomalies_from_daily_{}_{}.nc".format(period, suffix)
# end
    else:
# KATE modified - added renorm19812010 to the filename
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_renorm19812010_anomalies_{}_{}.nc".format(period, suffix)
        #out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_anomalies_{}_{}.nc".format(period, suffix)
# end

# KATE modified - only outputting 90 to -90 now and have changed grid_lats above
    utils.netcdf_write(out_filename, all_anoms, n_grids, n_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
    #if period == "both":
    #    utils.netcdf_write(out_filename, all_anoms, n_grids, n_obs, OBS_ORDER, grid_lats[::-1], grid_lons, times, frequency = "Y")
    #else:
    #    utils.netcdf_write(out_filename, all_anoms, n_grids, n_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
# end
    return # apply_climatology
def do_conversion(start_year=defaults.START_YEAR, end_year=defaults.END_YEAR, period="all", doBC=False, doQC=True):
    """
    Convert dailies to pentads 1x1

    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/all?
    :param bool doBC: work on the bias corrected data
    :param bool doQC: incorporate the QC flags or not


    :returns:
    """
    settings = set_paths_and_vars.set(doBC=doBC, doQC=doQC)

    OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)

    for year in np.arange(start_year, end_year + 1):

        # set up empty data array
        all_dailies = np.ma.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.mask = np.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.fill_value = settings.mdi

        all_n_obs = np.zeros([utils.days_in_year(year), len(grid_lats), len(grid_lons)])

        year_start = dt.datetime(year, 1, 1, 0, 0)

        for month in np.arange(12) + 1:
            print year, month

            month_start = utils.day_of_year(year, month)
            month_end = month_start + calendar.monthrange(year, month)[1]

            filename = "{}/{}_1x1_daily_{}{:02d}_{}.nc".format(
                settings.DATA_LOCATION, settings.OUTROOT, year, month, period
            )

            ncdf_file = ncdf.Dataset(filename, "r", format="NETCDF4")

            for v, var in enumerate(OBS_ORDER):

                if month == 12:
                    # run to end of year if december
                    all_dailies[v, month_start:, :, :] = ncdf_file.variables[var.name][:]
                else:
                    all_dailies[v, month_start:month_end, :, :] = ncdf_file.variables[var.name][:]

            # now get number of observations
            if month == 12:
                all_n_obs[month_start:, :, :] = ncdf_file.variables["n_obs"][:]
            else:
                all_n_obs[month_start:month_end, :, :] = ncdf_file.variables["n_obs"][:]

        if calendar.isleap(year):
            assert all_dailies.shape[1] == 366

            # extract 6-day pentad
            incl_feb29th = all_dailies[:, 55:61, :, :]
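            # (0-based indices 55:61 are days 56-61 of a leap year, i.e. Feb 25th to
            #  Mar 1st - the 12th pentad stretched to 6 days by Feb 29th)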

            # remove the data of Feb 29th from array
            # np.ma.delete doesn't exist, so have to copy mask separately
            mask = all_dailies.mask
            all_dailies = np.delete(all_dailies, 59, 1)
            mask = np.delete(mask, 59, 1)
            all_dailies = np.ma.array(all_dailies, mask=mask)
            del mask

            # number of observations
            incl_feb29th_n_obs = all_n_obs[55:61, :, :]
            all_n_obs = np.delete(all_n_obs, 59, 0)

        else:
            assert all_dailies.shape[1] == 365

        shape = all_dailies.shape
        all_dailies = all_dailies.reshape(shape[0], -1, 5, shape[-2], shape[-1])
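        # shape is now (n_vars, 73, 5, n_lats, n_lons) - 365 days split into 73 five-day pentads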

        n_days_per_pentad = np.ma.count(all_dailies, axis=2)

        if settings.doMedian:
            pentad_grid = utils.bn_median(all_dailies, axis=2)
        else:
            pentad_grid = np.ma.mean(all_dailies, axis=2)

        # clear up memory
        del all_dailies
        gc.collect()

        all_n_obs = all_n_obs.reshape(-1, 5, shape[-2], shape[-1])
        all_n_obs = np.sum(all_n_obs, axis=1)

        # mask where fewer than N_OBS days have values # KW THIS IS ACTUALLY 2 - WHICH I THINK IS GOOD
        pentad_grid.mask[n_days_per_pentad < N_OBS] = True

        # the pentad containing feb 29th is the 11th in the year
        if calendar.isleap(year):
            #  overwrite this with the mean (or median) of the 6-day pentad
            if settings.doMedian:
                pentad_grid[:, 11, :, :] = utils.bn_median(incl_feb29th, axis=1)
            else:
                pentad_grid[:, 11, :, :] = np.ma.mean(incl_feb29th, axis=1)

            feb_n_days_per_pentad = np.ma.count(incl_feb29th, axis=1)
            pentad_grid.mask[:, 11, :, :][feb_n_days_per_pentad < N_OBS] = True
            n_days_per_pentad[:, 11, :, :] = feb_n_days_per_pentad

            all_n_obs[11, :, :] = np.sum(incl_feb29th_n_obs, axis=0)

            print "processed Feb 29th"

        times = utils.TimeVar("time", "time since 1/1/{} in hours".format(year), "hours", "time")
        times.data = np.arange(0, pentad_grid.shape[1]) * 5 * 24
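        # pentad time stamps in hours since Jan 1st: 0, 120, 240, ..., 8640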

        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_{}_{}.nc".format(year, period)

        utils.netcdf_write(
            out_filename,
            pentad_grid,
            n_days_per_pentad[0],
            all_n_obs,
            OBS_ORDER,
            grid_lats,
            grid_lons,
            times,
            frequency="P",
        )

        del pentad_grid
        del all_n_obs
        del n_days_per_pentad
        gc.collect()

    return  # do_conversion
def calculate_climatology(suffix = "relax", start_year = 1981, end_year = 2010, period = "both", daily = False, 
                          doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, 
                          doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False):
#def calculate_climatology(suffix = "relax", start_year = 1981, end_year = 2010, period = "both", daily = False, doQC = False, doBC = False):
# end
    '''
    Make 5x5 monthly climatology

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/both?
    :param bool daily: run in 1x1 daily --> 5x5 monthly data
    :param bool doQC: incorporate the QC flags or not
# KATE modified
    :param bool doQC1it: incorporate the 1st iteration QC flags or not
    :param bool doQC2it: incorporate the 2nd iteration QC flags or not
    :param bool doQC3it: incorporate the 3rd iteration QC flags or not
# end
    :param bool doBC: work on the bias corrected data
# KATE modified
    :param bool doBCtotal: work on the full bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data
# end

    :returns:
    '''
# KATE modified
    settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it)
    #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)
# end

    if suffix == "relax":
        N_YEARS_PRESENT = 10 # number of years present to calculate climatology
    elif suffix == "strict":
        N_YEARS_PRESENT = 15 # number of years present to calculate climatology


    print "Do daily: {}".format(daily)

    N_YEARS = end_year - start_year + 1

    # read in each variable - memory issues

    all_clims = np.ma.zeros([len(OBS_ORDER), 12, len(grid_lats), len(grid_lons)])
    # KW - why set up as np.ones?
    all_clims.mask = np.zeros([len(OBS_ORDER), 12, len(grid_lats), len(grid_lons)])

    all_stds = np.ma.zeros([len(OBS_ORDER), 12, len(grid_lats), len(grid_lons)])
    all_stds.mask = np.zeros([len(OBS_ORDER), 12, len(grid_lats), len(grid_lons)])
    
    # KW no mask??? I've set one with fill_value as -1 - should the mask be .zeros or .ones though?
    all_n_obs = np.ma.zeros([N_YEARS * 12, len(grid_lats), len(grid_lons)])
    all_n_obs.mask = np.zeros([N_YEARS * 12, len(grid_lats), len(grid_lons)])
    all_n_obs.fill_value = -1
    
    if daily:
        filename = settings.DATA_LOCATION + "{}_5x5_monthly_from_daily_{}_{}.nc".format(settings.OUTROOT, period, suffix)
            
    else:
        filename = settings.DATA_LOCATION + "{}_5x5_monthly_{}_{}.nc".format(settings.OUTROOT, period, suffix)

    ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4')

    times = ncdf_file.variables["time"]
    data_start = int(times.long_name.split(" ")[2].split("/")[-1])
    clim_offset = (start_year - data_start) * 12
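    # e.g. if the file's time axis starts in 1973 and start_year = 1981,
    # clim_offset = (1981 - 1973) * 12 = 96 months into the record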
        
    for v, var in enumerate(OBS_ORDER):

        print var.name

        # set up empty data array
        all_months = np.ma.zeros([N_YEARS * 12, len(grid_lats), len(grid_lons)])
        # sets up a mask of 'False' = not masked!
        all_months.mask = np.zeros([N_YEARS * 12, len(grid_lats), len(grid_lons)])
        all_months.fill_value = settings.mdi

        all_months[:, :, :] = ncdf_file.variables[var.name][clim_offset:clim_offset + (N_YEARS * 12)]

        # months x lats x lons
        shape = all_months.shape
        all_months = all_months.reshape(-1, 12, shape[-2], shape[-1])
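        # shape is now (N_YEARS, 12, n_lats, n_lons) - one row per year of the climatology period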

        n_grids = np.ma.count(all_months, axis = 0)

        # collapse down the years
# KATE MEDIAN WATCH
# KATE modified - forced to use MEAN
        all_clims[v, :, :, :] = np.ma.mean(all_months, axis = 0)
        #if settings.doMedian:
        #    all_clims[v, :, :, :] = utils.bn_median(all_months, axis = 0)
        #else:
        #    all_clims[v, :, :, :] = np.ma.mean(all_months, axis = 0)
# end
        all_stds[v, :, :, :] = np.ma.std(all_months, axis = 0)

        # mask where fewer than N_YEARS_PRESENT years (10 relax / 15 strict, of 30) have data
        locs = np.ma.where(n_grids < N_YEARS_PRESENT)
        all_clims[v, :, :, :].mask[locs] = True
        # KW should probably mask stdev too - although unmasked it does show the potential coverage
        all_stds[v, :, :, :].mask[locs] = True

        if settings.plots and v == 0:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.hist(n_grids.reshape(-1), bins = np.arange(-1,32), align = "left", log = True, rwidth=0.5)
            plt.axvline(x = N_YEARS_PRESENT-0.5, color = "r")       
            plt.title("Number of years present in each pentad")
            plt.xlabel("Number of years (max = 30)")
            plt.ylabel("Frequency (log scale)")
            plt.savefig(settings.PLOT_LOCATION + "monthly_5x5_clims_n_years_{}_{}.png".format(period, suffix))

            
    # now process number of observations
    # (KW: all_n_obs wasn't a masked array - so have set it up as one - BUT not really
    #  convinced this is working as it should. No import numpy.ma?)
    all_n_obs[:, :, :] = ncdf_file.variables["n_obs"][clim_offset:clim_offset + (N_YEARS * 12)]
    all_n_obs = all_n_obs.reshape(-1, 12, shape[-2], shape[-1])
    all_obs = np.ma.sum(all_n_obs, axis = 0)

    # set up time array
    times = utils.TimeVar("time", "time since 1/1/{} in days".format(1), "days", "time")
    month_lengths = [calendar.monthrange(1, x + 1)[1] for x in range(12)]
    times.data = [sum(month_lengths[0:x]) for x in range(12)]
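    # giving day-of-year offsets for each month start: [0, 31, 59, 90, ...]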

    # write files
    if daily:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_climatology_from_daily_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_climatology_{}_{}.nc".format(period, suffix)

# KATE modified - only outputting 90 to -90 now and have changed grid_lats above
    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
    #if period == "both":
    #    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats[::-1], grid_lons, times, frequency = "Y")
    #else:
    #    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
# end
    if daily:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_stdev_from_daily_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_stdev_{}_{}.nc".format(period, suffix)

# KATE modified - only outputting 90 to -90 now and have changed grid_lats above
    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
    #if period == "both":
    #    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats[::-1], grid_lons, times, frequency = "Y")
    #else:
    #    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
# end
    # test distribution of obs with grid boxes
    if daily:
        outfile = open(settings.OUTROOT + "_5x5_monthly_climatology_from_daily_{}_{}.txt".format(period, suffix), "w")
    else:
        outfile = open(settings.OUTROOT + "_5x5_monthly_climatology_{}_{}.txt".format(period, suffix), "w")
        
    utils.boxes_with_n_obs(outfile, all_obs, all_clims[0], N_YEARS_PRESENT)

    return # calculate_climatology
def combine_files(suffix = "relax", pentads = False, do3hr = False, months = False, daily = False, start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, period = "both", 
                  doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False):
#def combine_files(suffix = "relax", pentads = False, do3hr = False, months = False, daily = False, start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, period = "both", doQC = False, doBC = False):
# end
    '''
    Combine the files, first the pentads 1x1, then the monthlies 5x5

    :param str suffix: "relax" or "strict" criteria
    :param bool pentads: run on pentads
    :param bool do3hr: run on pentads created from 3hrly data (if False then run on those from daily)
    :param bool months: run on 5x5 monthly data
    :param bool daily: run on monthlies created direct from dailies (if False then run on those from 1x1 monthlies)
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param int start_month: start month to process
    :param int end_month: end month to process
    :param str period: which period to do day/night/both?
    :param bool doQC: incorporate the QC flags or not
# KATE modified
    :param bool doQC1it: incorporate the 1st iteration QC flags or not
    :param bool doQC2it: incorporate the 2nd iteration QC flags or not
    :param bool doQC3it: incorporate the 3rd iteration QC flags or not
# end
    :param bool doBC: work on the bias corrected data
# KATE modified
    :param bool doBCtotal: work on the full bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data
# end

    :returns:
    '''

# KATE modified
    settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it)
    #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)
# end
    # pentads
    if pentads:

        OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier = False)
        # KW make OBS_ORDER only the actual variables - remove anomalies
        NEWOBS_ORDER = []
        for v, var in enumerate(OBS_ORDER):
            if "anomalies" not in var.name:
                NEWOBS_ORDER.append(var)
        del OBS_ORDER
        OBS_ORDER = np.copy(NEWOBS_ORDER)
        del NEWOBS_ORDER     

        # set up the grids
        DELTA=1
        grid_lats = np.arange(-90+DELTA, 90+DELTA, DELTA)
        grid_lons = np.arange(-180+DELTA, 180+DELTA, DELTA)
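        # these are the 1x1 box edges: latitudes -89 to 90, longitudes -179 to 180
        # (top-right edges, consistent with the NOTE in do_gridding below)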

        Nyears = end_year - start_year + 1

        # read in each variable - memory issues
        for v, var in enumerate(OBS_ORDER):

            print var.name

            all_pentads = np.ma.zeros((1, Nyears, 73, len(grid_lats), len(grid_lons)))
            all_pentads.mask = np.ones((1, Nyears, 73, len(grid_lats), len(grid_lons)))
            all_pentads.fill_value = settings.mdi

            n_obs = np.zeros((Nyears, 73, len(grid_lats), len(grid_lons)))
            n_grids = np.zeros((Nyears, 73, len(grid_lats), len(grid_lons)))


            for y, year in enumerate(np.arange(start_year, end_year + 1)):

                if do3hr:
                    filename = settings.DATA_LOCATION + "{}_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)
                else:
                    filename = settings.DATA_LOCATION + "{}_1x1_pentad_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)

                ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4')

                time = ncdf_file.variables["time"]

                try:
                    assert time.long_name == "time since 1/1/{} in hours".format(year)

                except AssertionError:
                    print "time units are not as expected."
                    print "    expected time since 1/1/{} in hours".format(year)
                    print "    got {}".format(time.long_name)
                    sys.exit()

                all_pentads[0, y, :, :, :] = ncdf_file.variables[var.name][:]

                n_obs[y, :, :, :] = ncdf_file.variables["n_obs"][:]
                n_grids[y, :, :, :] = ncdf_file.variables["n_grids"][:]

                print year

                if y == 0 and period == "both":
                    lat_centres = ncdf_file.variables["latitude"]
# KATE modified - this results in lats that go from 92.5 to -82.5 or 90.5 to -88.5 so I've switched the + for a -
                    latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2.
                    #latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2.
# end
                    lon_centres = ncdf_file.variables["longitude"]
                    longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.

                ncdf_file.close()

            all_pentads = all_pentads.reshape(1, -1, len(grid_lats), len(grid_lons))
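            # collapse years x pentads into a single time axis: (1, Nyears*73, n_lats, n_lons)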

            # sort the times - one entry per pentad in the combined record
            times = utils.TimeVar("time", "time since 1/1/{} in pentads".format(start_year), "pentads", "time")
            times.data = np.arange(all_pentads.shape[1])

            # and write file
            if do3hr:
                out_filename = settings.DATA_LOCATION + "{}_1x1_pentads_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, var.name, period, suffix)
            else:
                out_filename = settings.DATA_LOCATION + "{}_1x1_pentads_{}_{}_{}.nc".format(settings.OUTROOT, var.name, period, suffix)

            if period == "both":
                utils.netcdf_write(out_filename, all_pentads, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "P", single = var)
            else:
                utils.netcdf_write(out_filename, all_pentads, n_grids, n_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P", single = var)


        # Reset the data holding arrays and objects

        del OBS_ORDER
        gc.collect()

    if months:

        OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier = False)

        #*****************************
        # monthlies
        for y, year in enumerate(np.arange(start_year, end_year + 1)): 
            print year

            for month in np.arange(start_month, end_month + 1):
                print "   {}".format(month)

                if daily:
                    filename = settings.DATA_LOCATION + "{}_5x5_monthly_from_daily_{}{:02d}_{}_{}.nc".format(settings.OUTROOT, year, month, period, suffix)
                else:
                    filename = settings.DATA_LOCATION + "{}_5x5_monthly_{}{:02d}_{}_{}.nc".format(settings.OUTROOT, year, month, period, suffix)

                ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4')

                time = ncdf_file.variables["time"]

                try:
                    assert time.long_name == "time since 1/{}/{} in hours".format(month, year)

                except AssertionError:
                    print "time units are not as expected."
                    print "    expected time since 1/{}/{} in hours".format(month, year)
                    print "    got {}".format(time.long_name)
                    sys.exit()

                for v, var in enumerate(OBS_ORDER):

                    nc_var = ncdf_file.variables[var.name]

                    try:
                        var.data = utils.ma_append(var.data, nc_var[:], axis = 0)

                        if v == 0:
                            n_obs = utils.ma_append(n_obs, ncdf_file.variables["n_obs"][:], axis = 0)
                            n_grids = utils.ma_append(n_grids, ncdf_file.variables["n_grids"][:], axis = 0)

                    except AttributeError:
                        var.data = nc_var[:]
                        var.data.fill_value = nc_var.missing_value

                        if v == 0:
                            n_obs = ncdf_file.variables["n_obs"][:]
                            n_grids = ncdf_file.variables["n_grids"][:]


                if y == 0 and month == start_month and period == "both":
                    lat_centres = ncdf_file.variables["latitude"]
                    latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2.

                    lon_centres = ncdf_file.variables["longitude"]
                    longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.

# KATE modified - added an extra loop so that we can flip the latitudes for day and night too
                if y == 0 and month == start_month and period != "both":
                    lat_centres = ncdf_file.variables["latitude"]
                    # THIS IS - RATHER THAN + READY TO FLIP THE LATS
                    latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2.

                    lon_centres = ncdf_file.variables["longitude"]
                    longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.
# end                    
                ncdf_file.close()
            
        # write out into big array for netCDF file
        all_data = np.ma.zeros((len(OBS_ORDER), var.data.shape[0], var.data.shape[1], var.data.shape[2]))
        all_data.mask = np.zeros((len(OBS_ORDER), var.data.shape[0], var.data.shape[1], var.data.shape[2]))

        for v, var in enumerate(OBS_ORDER):
            all_data[v, :, :, :] = var.data

# KATE modified - switching the latitudes on day and night data for consistency with both
        if period == "day" or period == "night":
            # invert latitudes
            latitudes = latitudes[::-1]
            all_data = all_data[:,:,::-1,:] # variable, time, latitude, longitude
# end

        all_data.fill_value = var.data.fill_value

        # extra stuff for writing
# KATE modified - no longer need grid5 as we're using latitudes and longitudes
        #DELTA=5
        #grid5_lats = np.arange(-90+DELTA, 90+DELTA, DELTA)
        #grid5_lons = np.arange(-180+DELTA, 180+DELTA, DELTA)
# end
# KATE modified - START_YEAR not defined, should be start_year
        times = utils.TimeVar("time", "time since 1/1/{} in months".format(start_year), "months", "time")
        #times = utils.TimeVar("time", "time since 1/1/{} in months".format(START_YEAR), "months", "time")
# end
        times.data = np.arange(var.data.shape[0])

        # and write file
        if daily:
            out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_from_daily_{}_{}.nc".format(period, suffix)
        else:
            out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_{}_{}.nc".format(period, suffix)

# KATE modified - now always using latitudes and longitudes
        utils.netcdf_write(out_filename, all_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "Y")
        #if period == "both":
        #    utils.netcdf_write(out_filename, all_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "Y")
        #else:
        #    utils.netcdf_write(out_filename, all_data, n_grids, n_obs, OBS_ORDER, grid5_lats, grid5_lons, times, frequency = "Y")
# end
        

    return # combine_files
def calculate_climatology(suffix = "relax", start_year = 1981, end_year = 2010, period = "both", do3hr = False, doQC = False, doBC = False):
    '''
    Make 1x1 pentad climatology

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/both?
    :param bool do3hr: run on 3hr --> pentad data
    :param bool doQC: incorporate the QC flags or not
    :param bool doBC: work on the bias corrected data

    :returns:
    '''
    settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)

    if suffix == "relax":
        N_YEARS_PRESENT = 10 # number of years present to calculate climatology
    elif suffix == "strict":
        N_YEARS_PRESENT = 15 # number of years present to calculate climatology


    print "Do 3hrly: {}".format(do3hr)

    N_YEARS = end_year - start_year + 1

    # read in each variable - memory issues

    all_clims = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    # KW - why set up as np.ones?
    all_clims.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])

    all_stds = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    all_stds.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    
    # KW no mask??? I've set one with fill_value as -1 - should the mask be .zeros or .ones though?
    all_n_obs = np.ma.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
    all_n_obs.mask = np.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
    all_n_obs.fill_value = -1
    
    for v, var in enumerate(OBS_ORDER):

        print var.name

        # number of pentads = 365/5 = 73
        # set up empty data array
        all_pentads = np.ma.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
        # sets up a mask of 'False' = not masked!
        all_pentads.mask = np.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
        all_pentads.fill_value = settings.mdi

        # read in relevant years
        for y, year in enumerate(np.arange(start_year, end_year + 1)): 

            print year

            if do3hr:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)
 
            else:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)

            ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4')

            all_pentads[y, :, :, :] = ncdf_file.variables[var.name][:]

            if v == 0:
                all_n_obs[y, :, :, :] = ncdf_file.variables["n_obs"][:]

        # years x pentads x lats x lons
        n_grids = np.ma.count(all_pentads, axis = 0)

        # collapse down the years
        if settings.doMedian:
            all_clims[v, :, :, :] = utils.bn_median(all_pentads, axis = 0)
        else:
            all_clims[v, :, :, :] = np.ma.mean(all_pentads, axis = 0)

        all_stds[v, :, :, :] = np.ma.std(all_pentads, axis = 0)

        # mask where fewer than N_YEARS_PRESENT years (10 relax / 15 strict, of 30) have data
        locs = np.ma.where(n_grids < N_YEARS_PRESENT)
        all_clims[v, :, :, :].mask[locs] = True
        # KW should probably mask stdev too - although unmasked it does show the potential coverage
        all_stds[v, :, :, :].mask[locs] = True

        if settings.plots and v == 0:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.hist(n_grids.reshape(-1), bins = np.arange(-1,32), align = "left", log = True, rwidth=0.5)
            plt.axvline(x = N_YEARS_PRESENT-0.5, color = "r")       
            plt.title("Number of years present in each pentad")
            plt.xlabel("Number of years (max = 30)")
            plt.ylabel("Frequency (log scale)")
            plt.savefig(settings.PLOT_LOCATION + "pentad_clims_n_years_{}_{}_{}.png".format(year, period, suffix))

            
    # now process number of observations
    # (KW: all_n_obs wasn't a masked array - so have set it up as one - BUT not really
    #  convinced this is working as it should. No import numpy.ma?)
    all_obs = np.ma.sum(all_n_obs, axis = 0)

    # set up time array
    times = utils.TimeVar("time", "time since 1/1/{} in days".format(1), "days", "time")
    times.data = np.arange(0, 73) * 5

    # write files
    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")

    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
       out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")

    # test distribution of obs with grid boxes
    if do3hr:
        outfile = open(settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.txt".format(period, suffix), "w")
    else:
        outfile = open(settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.txt".format(period, suffix), "w")

    utils.boxes_with_n_obs(outfile, all_obs, all_clims[0], N_YEARS_PRESENT)

    return # calculate_climatology
def do_merge(fileroot, mdi, suffix = "relax", clims = False, doMedian = False):
    '''
    Merge the _day and _night files

    Do a np.ma.mean or median for the data and a sum for the n_obs and n_grids

    Output with a _both suffix

    :param str fileroot: root for filenames
    :param flt mdi: missing data indicator
    :param str suffix: "relax" or "strict" criteria
    :param bool clims: if climatologies then don't try and process anomalies.
    :param bool doMedian: use the median rather than the mean when merging day and night

    :returns:
    '''

    OBS_ORDER = utils.make_MetVars(mdi, multiplier = False)

    if clims:
        # KW make OBS_ORDER only the actual variables - remove anomalies
        NEWOBS_ORDER = []
        for v, var in enumerate(OBS_ORDER):
            if "anomalies" not in var.name:
                NEWOBS_ORDER.append(var)
        del OBS_ORDER
        OBS_ORDER = np.copy(NEWOBS_ORDER)
        del NEWOBS_ORDER     


    # spin through both periods
    for p, period in enumerate(["day", "night"]):
        print period
        
        # go through the variables
        for v, var in enumerate(OBS_ORDER):

            print "   {}".format(var.name)

            ncdf_file = ncdf.Dataset("{}_{}_{}.nc".format(fileroot, period, suffix),'r', format='NETCDF4')

            if v == 0 and p == 0:

                shape = list(ncdf_file.variables[var.name][:].shape)
                shape.insert(0, len(OBS_ORDER)+2) # add all the variables
                shape.insert(0, 2) # insert extra dimension to allow day + night
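                # shape is (2, n_vars + 2, time, lat, lon) - day/night x (variables + n_grids + n_obs)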

                all_data = np.ma.zeros(shape)

                all_data[p, v] = ncdf_file.variables[var.name][:]

                # get lats/lons of box centres
                lat_centres = ncdf_file.variables["latitude"]
# KATE modified - this results in lats that go from 92.5 to -82.5 so I've switched the + for a -
                latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2.
                #latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2.
# end
                lon_centres = ncdf_file.variables["longitude"]
                longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.

                # get times - make a dummy object and then populate attributes
                times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(1, 1973), "hours", "time")

                times.long_name = ncdf_file.variables["time"].long_name
                times.standard_name = ncdf_file.variables["time"].standard_name
                times.units = ncdf_file.variables["time"].units

                times.data = ncdf_file.variables["time"][:]

            else:
                all_data[p, v] = ncdf_file.variables[var.name][:]

        # and get n_obs and n_grids
        all_data[p, -2] = ncdf_file.variables["n_grids"][:]
        all_data[p, -1] = ncdf_file.variables["n_obs"][:]

    # invert latitudes
    latitudes = latitudes[::-1]
    all_data = all_data[:,:,:,::-1,:]
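    # axis 3 of (period, variable, time, lat, lon) is latitude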

    # got all the info, now merge
    if doMedian:
        merged_data = utils.bn_median(all_data[:, :len(OBS_ORDER)], axis = 0)
    else:
        merged_data = np.ma.mean(all_data[:, :len(OBS_ORDER)], axis = 0)

    # and process the grids and observations (split off here so have incorporated latitude inversion)
    n_grids = np.ma.sum(all_data[:, -2], axis = 0)
    n_obs = np.ma.sum(all_data[:, -1], axis = 0)
    n_obs.fill_value = -1
    n_grids.fill_value = -1

    # write the output file
    utils.netcdf_write("{}_{}_{}.nc".format(fileroot, "both", suffix), merged_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "P")

    # test distribution of obs with grid boxes
    outfile = file("{}_{}_{}.txt".format(fileroot.split("/")[-1], "both", suffix), "w")
    utils.boxes_with_n_obs(outfile, n_obs, merged_data[0], "")


    return # do_merge
def do_gridding(suffix = "relax", start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, 
                doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, doSST_SLP = False, 
                doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False, doUncert = False):
#def do_gridding(suffix = "relax", start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, doQC = False, doSST_SLP = False, doBC = False, doUncert = False):
# end
    '''
    Do the gridding, first to 3hrly 1x1, then to daily 1x1 and finally monthly 1x1 and 5x5

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param int start_month: start month to process
    :param int end_month: end month to process
    :param bool doQC: incorporate the QC flags or not
    :param bool doQC1it: incorporate the first iteration (no buddy) QC flags or not
    :param bool doQC2it: incorporate the second iteration (no buddy) QC flags or not
    :param bool doQC3it: incorporate the third iteration (buddy) QC flags or not
    :param bool doSST_SLP: process additional variables or not
    :param bool doBC: work on the bias corrected data
    :param bool doBCtotal: work on the full bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data
    :param bool doUncert: work on files with uncertainty information (not currently used)

    :returns:
    '''
# KATE modified    
    settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it)
    #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)
# end


# KATE modified  - added other BC options  
#    if doBC:
    if doBC or doBCtotal or doBChgt or doBCscn:
# end
        fields = mds.TheDelimitersExt # extended (BC)
    else:
        fields = mds.TheDelimitersStd # Standard

# KATE modified  - added other BC options  
#    OBS_ORDER = utils.make_MetVars(settings.mdi, doSST_SLP = doSST_SLP, multiplier = True, doBC = doBC) # ensure that convert from raw format at writing stage with multiplier
    OBS_ORDER = utils.make_MetVars(settings.mdi, doSST_SLP = doSST_SLP, multiplier = True, doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn) # ensure that convert from raw format at writing stage with multiplier
# end

    # KW switching between 4 ('_strict') for climatology build and 2 for anomaly build ('_relax') - added subscripts to files
    if suffix == "relax":
        N_OBS_DAY = 2 # KW ok for anomalies but this was meant to be 4 for dailies_all? and 2 for dailies_night/day?
        N_OBS_FRAC_MONTH = 0.3

    elif suffix == "strict":
        N_OBS_DAY = 4
        N_OBS_FRAC_MONTH = 0.3


    # flags to check on and values to allow through
# KATE modified
    if doQC1it or doQC2it:
        these_flags = {"ATclim":0,"ATrep":0,"DPTclim":0,"DPTssat":0,"DPTrep":0,"DPTrepsat":0}
    else:
        these_flags = {"ATbud":0, "ATclim":0,"ATrep":0,"DPTbud":0,"DPTclim":0,"DPTssat":0,"DPTrep":0,"DPTrepsat":0}    
    #these_flags = {"ATbud":0, "ATclim":0,"ATrep":0,"DPTbud":0,"DPTclim":0,"DPTssat":0,"DPTrep":0,"DPTrepsat":0}
# end

    # spin through years and months to read files
    for year in np.arange(start_year, end_year + 1): 

        for month in np.arange(start_month, end_month + 1):

            times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(month, year), "hours", "time")

            grid_hours = np.arange(0, 24 * calendar.monthrange(year, month)[1], DELTA_HOUR)

            times.data = grid_hours

            # process the monthly file
# KATE modified  - added other BC options  
#            if doBC:
            if doBC or doBCtotal or doBChgt or doBCscn:
# end
                filename = "new_suite_{}{:02d}_{}_extended.txt".format(year, month, settings.OUTROOT)
            else:
                filename = "new_suite_{}{:02d}_{}.txt".format(year, month, settings.OUTROOT)

# KATE modified  - added other BC options  
#            raw_platform_data, raw_obs, raw_meta, raw_qc = utils.read_qc_data(filename, settings.ICOADS_LOCATION, fields, doBC = doBC)
            raw_platform_data, raw_obs, raw_meta, raw_qc = utils.read_qc_data(filename, settings.ICOADS_LOCATION, fields, doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn)
# end

            # extract observation details
            lats, lons, years, months, days, hours = utils.process_platform_obs(raw_platform_data)

            # test dates *KW - SHOULDN'T NEED THIS - ONLY OBS PASSING DATE CHECK ARE INCLUDED*
            #  *RD* - hasn't run yet but will leave it in just in case of future use.
            if not utils.check_date(years, year, "years", filename):
                sys.exit(1)
            if not utils.check_date(months, month, "months", filename):
                sys.exit(1)

# KATE modified - seems to be an error with missing global name plots so have changed to settings.plots
            # Choose this one to only output once per decade
            #if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
            # Choose this one to output a plot for each month
            if settings.plots:
            #if plots and (year in [1973, 1983, 1993, 2003, 2013]):
# end
                # plot the distribution of hours

                import matplotlib.pyplot as plt

                plt.clf()
                plt.hist(hours, np.arange(-100,2500,100))
                plt.ylabel("Number of observations")
                plt.xlabel("Hours")
                plt.xticks(np.arange(-300, 2700, 300))
                plt.savefig(settings.PLOT_LOCATION + "obs_distribution_{}{:02d}_{}.png".format(year, month, suffix))


                # only for a few of the variables
                for variable in OBS_ORDER:
                    if variable.name in ["marine_air_temperature", "dew_point_temperature", "specific_humidity", "relative_humidity", "marine_air_temperature_anomalies", "dew_point_temperature_anomalies", "specific_humidity_anomalies", "relative_humidity_anomalies"]:

                        #plot_qc_diagnostics.values_vs_lat(variable, lats, raw_obs[:, variable.column], raw_qc, these_flags, settings.PLOT_LOCATION + "qc_actuals_{}_{}{:02d}_{}.png".format(variable.name, year, month, suffix), multiplier = variable.multiplier, doBC = doBC)
# KATE modified  - added other BC options
                        plot_qc_diagnostics.values_vs_lat_dist(variable, lats, raw_obs[:, variable.column], raw_qc, these_flags,
                                settings.PLOT_LOCATION + "qc_actuals_{}_{}{:02d}_{}.png".format(variable.name, year, month, suffix),
                                multiplier = variable.multiplier,
                                doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn)
# end

            # QC sub-selection

# KATE modified - added QC iterations but also think this needs to include the bias corrected versions because the QC flags need to be applied to those too.
# Not sure what was happening previously with the doBC run - any masking to QC'd obs?
            if doQC or doQC1it or doQC2it or doQC3it or doBC or doBCtotal or doBChgt or doBCscn:
            #if doQC:
# end
                print "Using {} as flags".format(these_flags)
# KATE modified - BC options
#                mask = utils.process_qc_flags(raw_qc, these_flags, doBC = doBC)
                mask = utils.process_qc_flags(raw_qc, these_flags, doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn)
# end
                print "All Obs: ", len(mask)
                print "Good Obs: ", len(mask[np.where(mask == 0)])
                print "Bad Obs: ", len(mask[np.where(mask == 1)])
                #pdb.set_trace()

                complete_mask = np.zeros(raw_obs.shape)
                for i in range(raw_obs.shape[1]):
                    complete_mask[:,i] = mask
                clean_data = np.ma.masked_array(raw_obs, mask = complete_mask)

# end
            else:
                print "No QC flags selected"
                clean_data = np.ma.masked_array(raw_obs, mask = np.zeros(raw_obs.shape))


            # discretise hours
            hours = utils.make_index(hours, DELTA_HOUR, multiplier = 100)

            # get the hours since start of month
            hours_since = ((days - 1) * 24) + (hours * DELTA_HOUR)
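            # e.g. with DELTA_HOUR = 3: day 3, hour slot 2 gives ((3 - 1) * 24) + (2 * 3) = 54 hours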

            # discretise lats/lons
            lat_index = utils.make_index(lats, DELTA_LAT, multiplier = 100)
            lon_index = utils.make_index(lons, DELTA_LON, multiplier = 100)

            lat_index += ((len(grid_lats)-1)/2) # and as -ve indices are unhelpful, roll by offsetting by most westward
            lon_index += ((len(grid_lons)-1)/2) #    or most southerly so that (0,0) is (-90,-180)

            # NOTE - ALWAYS GIVING TOP-RIGHT OF BOX TO GIVE < HARD LIMIT (as opposed to <=)
            # do the gridding
            # extract the full grid, number of obs, and day/night flag
# KATE MEDIAN WATCH This is hard coded to doMedian (rather than settings.doMedian) - OK WITH MEDIAN HERE!!!
# KATE modified - to add settings.doMedian instead of just doMedian which seems to be consistent with the other bits and BC options
            raw_month_grid, raw_month_n_obs, this_month_period = utils.grid_1by1_cam(clean_data, raw_qc, hours_since, lat_index, lon_index, \
                      grid_hours, grid_lats, grid_lons, OBS_ORDER, settings.mdi, doMedian = settings.doMedian, \
                      doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn)
            #raw_month_grid, raw_month_n_obs, this_month_period = utils.grid_1by1_cam(clean_data, raw_qc, hours_since, lat_index, lon_index, grid_hours, grid_lats, grid_lons, OBS_ORDER, settings.mdi, doMedian = True, doBC = doBC)
# end
            print "successfully read data into 1x1 3hrly grids"

            # create matching array size
            this_month_period = np.tile(this_month_period, (len(OBS_ORDER),1,1,1))
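            # now (n_vars, n_times, n_lats, n_lons), so the day/night flag can mask every variable at once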

            for period in ["all", "day", "night"]:

                if period == "day":
                    this_month_grid = np.ma.masked_where(this_month_period == 1, raw_month_grid)
                    this_month_obs = np.ma.masked_where(this_month_period[0] == 1, raw_month_n_obs) # and take first slice to re-match the array size
                elif period == "night":
                    this_month_grid = np.ma.masked_where(this_month_period == 0, raw_month_grid)
                    this_month_obs = np.ma.masked_where(this_month_period[0] == 0, raw_month_n_obs) # and take first slice to re-match the array size
                else:
                    this_month_grid = copy.deepcopy(raw_month_grid)
                    this_month_obs = copy.deepcopy(raw_month_n_obs)
                    
# KATE modified
                # If SwitchOutput == 1 then we're in test mode - output interim files!!!
                if (SwitchOutput == 1):
                    # have one month of gridded data.
                    out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_3hr_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)

                    utils.netcdf_write(out_filename, this_month_grid, np.zeros(this_month_obs.shape), this_month_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "H")
                ## have one month of gridded data.
                #out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_3hr_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)

                #utils.netcdf_write(out_filename, this_month_grid, np.zeros(this_month_obs.shape), this_month_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "H")
# end
                # now average over time
                # Dailies
                daily_hours = grid_hours.reshape(-1, 24/DELTA_HOUR)

                shape = this_month_grid.shape
                this_month_grid = this_month_grid.reshape(shape[0], -1, 24/DELTA_HOUR, shape[2], shape[3])
                this_month_obs = this_month_obs.reshape(-1, 24/DELTA_HOUR, shape[2], shape[3])
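                # shape is now (n_vars, n_days, 24/DELTA_HOUR, n_lats, n_lons) - e.g. 8 slots per day for 3-hourly data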

# KATE MEDIAN WATCH - settings.doMedian is generally set to True - I think we may want the MEAN HERE!!!
# KATE modified - to hard wire in MEAN here
                daily_grid = np.ma.mean(this_month_grid, axis = 2)
                #if settings.doMedian:
                #    daily_grid = np.ma.median(this_month_grid, axis = 2)
                #else:
                #    daily_grid = np.ma.mean(this_month_grid, axis = 2)
# end
                daily_grid.fill_value = settings.mdi

                # filter on number of observations/day
                n_hrs_per_day = np.ma.count(this_month_grid, axis = 2) 
                n_obs_per_day = np.ma.sum(this_month_obs, axis = 1) 

                if period == "all":
                    bad_locs = np.where(n_hrs_per_day < N_OBS_DAY) # at least 2 of possible 8 3-hourly values (6hrly data *KW OR AT LEAST 4 3HRLY OBS PRESENT*)
                else:
                    bad_locs = np.where(n_hrs_per_day < np.floor(N_OBS_DAY / 2.)) # at least 1 of possible 8 3-hourly values (6hrly data *KW OR AT LEAST 4 3HRLY OBS PRESENT*)              
                daily_grid.mask[bad_locs] = True

# KATE modified - added SwitchOutput to if loop
                if (SwitchOutput == 1) and settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
                #if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
# end
                    # plot the distribution of hours

                    plt.clf()
                    plt.hist(n_hrs_per_day.reshape(-1), bins = np.arange(-1,10), align = "left", log = True, rwidth=0.5)
                    if period == "all":
                        plt.axvline(x = N_OBS_DAY-0.5, color = "r")
                    else:
                        plt.axvline(x = np.floor(N_OBS_DAY / 2.)-0.5, color = "r")       

                    plt.title("Number of 1x1-3hrly in each 1x1-daily grid box")
                    plt.xlabel("Number of 3-hrly observations (max = 8)")
                    plt.ylabel("Frequency (log scale)")
                    plt.savefig(settings.PLOT_LOCATION + "n_grids_1x1_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix))

                    plt.clf()
                    plt.hist(n_obs_per_day.reshape(-1), bins = np.arange(-5,100,5),  log = True, rwidth=0.5)                 
                    plt.title("Total number of raw observations in each 1x1 daily grid box")
                    plt.xlabel("Number of raw observations")
                    plt.ylabel("Frequency (log scale)")
                    plt.savefig(settings.PLOT_LOCATION + "n_obs_1x1_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix))

                # clear up memory
                del this_month_grid
                del this_month_obs
                gc.collect()

# KATE modified
                # If SwitchOutput == 1 then we're in test mode - output interim files!!!
                if (SwitchOutput == 1):
                    # write dailies file
                    times.data = daily_hours[:,0]
                    out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_daily_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)

                    utils.netcdf_write(out_filename, daily_grid, n_hrs_per_day[0], n_obs_per_day, OBS_ORDER, grid_lats, grid_lons, times, frequency = "D")
                #times.data = daily_hours[:,0]
                #out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_daily_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)

                #utils.netcdf_write(out_filename, daily_grid, n_hrs_per_day[0], n_obs_per_day, OBS_ORDER, grid_lats, grid_lons, times, frequency = "D")
# end
                # Monthlies
                times.data = daily_hours[0,0]

# KATE modified - commenting out as we don't need this anymore
#                if settings.doMedian:
#                    monthly_grid = np.ma.median(daily_grid, axis = 1)
#                else:
#                    monthly_grid = np.ma.mean(daily_grid, axis = 1)
#
#                monthly_grid.fill_value = settings.mdi
#
#                # filter on number of observations/month
#                n_grids_per_month = np.ma.count(daily_grid, axis = 1) 
#                bad_locs = np.where(n_grids_per_month < calendar.monthrange(year, month)[1] * N_OBS_FRAC_MONTH) # 30% of possible daily values
#                monthly_grid.mask[bad_locs] = True
#
#                # number of raw observations
#                n_obs_per_month = np.ma.sum(n_obs_per_day, axis = 0)
#
#                if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
#                    # plot the distribution of days
#
#                    plt.clf()
#                    plt.hist(n_obs_per_month.reshape(-1), bins = np.arange(-10,500,10),  log = True, rwidth=0.5)
#                    plt.title("Total number of raw observations in each 1x1 monthly grid box")
#                    plt.xlabel("Number of raw observations")
#                    plt.ylabel("Frequency (log scale)")
#                    plt.savefig(settings.PLOT_LOCATION + "n_obs_1x1_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix))
#
#                    plt.clf()
#                    plt.hist(n_grids_per_month[0].reshape(-1), bins = np.arange(-2,40,2), align = "left",  log = True, rwidth=0.5)
#                    plt.axvline(x = calendar.monthrange(year, month)[1] * N_OBS_FRAC_MONTH, color="r")
#                    plt.title("Total number of 1x1 daily grids in each 1x1 monthly grid")
#                    plt.xlabel("Number of 1x1 daily grids")
#                    plt.ylabel("Frequency (log scale)")
#                    plt.savefig(settings.PLOT_LOCATION + "n_grids_1x1_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix))
#
#                # write monthly 1x1 file
#                out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_monthly_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)
#                utils.netcdf_write(out_filename, monthly_grid, n_grids_per_month[0], n_obs_per_month, OBS_ORDER, grid_lats, grid_lons, times, frequency = "M")
#            
#                # now to re-grid to coarser resolution
#                # KW # Here we may want to use the mean because its a large area but could be sparsely
#                #             populated with quite different climatologies so we want 
#                # the influence of the outliers (we've done our best to ensure these are good values) 
#
#                # go from monthly 1x1 to monthly 5x5 - retained as limited overhead
#                monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, grid5_lats, grid5_lons = utils.grid_5by5(monthly_grid, n_obs_per_month, grid_lats, grid_lons, doMedian = settings.doMedian, daily = False)
#                out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)
#
#                utils.netcdf_write(out_filename, monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, OBS_ORDER, grid5_lats, grid5_lons, times, frequency = "M")
#
#                if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
#                    # plot the distribution of days
#
#                    plt.clf()
#                    plt.hist(monthly_5by5_n_obs.reshape(-1), bins = np.arange(0,100,5), log = True, rwidth=0.5)
#                    plt.title("Total number of raw observations in each 5x5 monthly grid box")
#                    plt.xlabel("Number of raw observations")
#                    plt.ylabel("Frequency (log scale)")
#                    plt.savefig(settings.PLOT_LOCATION + "n_obs_5x5_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix))
#
#                    plt.clf()
#                    plt.hist(monthly_5by5_n_grids.reshape(-1), bins = np.arange(-2,30,2), align = "left", log = True, rwidth=0.5)
#                    plt.axvline(x = 1, color="r")
#                    plt.title("Total number of 1x1 monthly grids in each 5x5 monthly grid")
#                    plt.xlabel("Number of 1x1 monthly grids")
#                    plt.ylabel("Frequency (log scale)")
#                    plt.savefig(settings.PLOT_LOCATION + "n_grids_5x5_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix))
#
#                # clear up memory
#                del monthly_grid
#                del monthly_5by5
#                del monthly_5by5_n_grids
#                del monthly_5by5_n_obs
#                del n_grids_per_month
#                del n_obs_per_month
#                del n_hrs_per_day
#                gc.collect()
# end
                # go direct from daily 1x1 to monthly 5x5
# KATE MEDIAN WATCH - settings.doMedian is generally set to True - I think we may want the MEAN HERE!!!
# KATE modified - to hard wire in MEAN here
                monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, grid5_lats, grid5_lons = utils.grid_5by5(daily_grid, n_obs_per_day, grid_lats, grid_lons, doMedian = False, daily = True)
                #monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, grid5_lats, grid5_lons = utils.grid_5by5(daily_grid, n_obs_per_day, grid_lats, grid_lons, doMedian = settings.doMedian, daily = True)
# end
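                # A minimal sketch (not the actual utils.grid_5by5 implementation, which
                # also returns grid/obs counts and honours its doMedian flag) of how 1x1
                # cells collapse into 5x5 boxes with a plain mean, on hypothetical 180x360 fields:
                #   >>> import numpy as np
                #   >>> field = np.ma.ones((180, 360))           # 1x1 lat x lon
                #   >>> boxes = field.reshape(36, 5, 72, 5)      # group into 5x5 blocks
                #   >>> boxes.mean(axis = 3).mean(axis = 1).shape
                #   (36, 72)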
                out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_from_daily_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)

                utils.netcdf_write(out_filename, monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, OBS_ORDER, grid5_lats, grid5_lons, times, frequency = "M")

                if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
                    # plot the distribution of days

                    plt.clf()
                    plt.hist(monthly_5by5_n_obs.reshape(-1), bins = np.arange(-10,1000,10),  log = True, rwidth=0.5)
                    plt.title("Total number of raw observations in each 5x5 monthly grid box")
                    plt.xlabel("Number of raw observations")
                    plt.ylabel("Frequency (log scale)")
                    plt.savefig(settings.PLOT_LOCATION + "n_obs_5x5_monthly_from_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix))


                    plt.clf()
                    plt.hist(monthly_5by5_n_grids.reshape(-1), bins = np.arange(-5,100,5), align = "left", log = True, rwidth=0.5)
                    plt.axvline(x = (0.3 * daily_grid.shape[0]), color="r") # 30% threshold (cf. N_OBS_FRAC_MONTH)
                    plt.title("Total number of 1x1 daily grids in each 5x5 monthly grid")
                    plt.xlabel("Number of 1x1 daily grids")
                    plt.ylabel("Frequency (log scale)")

                    plt.savefig(settings.PLOT_LOCATION + "n_grids_5x5_monthly_from_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix))


                del daily_grid
                del monthly_5by5
                del n_obs_per_day
                del monthly_5by5_n_grids
                del monthly_5by5_n_obs
                gc.collect()

    return # do_gridding


def do_conversion(suffix = "relax", start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, period = "all", doQC = False, doBC = False):
    '''
    Convert 3 hrly data to 1x1 pentads

    First get the pentad average of the 3 hrly values at each timestamp (so the values
    at 0, 3, 6, ... are each averaged over the 5 days of the pentad).
    Then average those timestamp means to get a single value for the pentad.

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/all?
    :param bool doQC: incorporate the QC flags or not
    :param bool doBC: work on the bias corrected data

    :returns:
    '''
    settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)


    # KW Added SUFFIX variable because all hourlies/dailies/monthlies now have suffix 'strict' (4/2 per daily/day-night) 
    # or 'relax' (2/1 per daily/day-night)
    if suffix == "relax":
        N_OBS_OVER_DAYS = 1 # at least 1 obs at this 3 hr timestamp from 5 days in pentad
        N_OBS_OVER_PENTAD = 2

    elif suffix == "strict":
        N_OBS_OVER_DAYS = 2
        N_OBS_OVER_PENTAD = 4  # at least 4 timestamps (of 8) in pentad, could be 2 for local 'relax' setting


    N_OBS_PER_DAY = 24/DELTA_HOUR # number of timestamps per day (8 for DELTA_HOUR = 3)
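    # A quick arithmetic check (assuming the standard DELTA_HOUR = 3): there are
    # 8 timestamps per day, and 365 days divide exactly into 73 five-day pentads:
    #   >>> 24 / 3, 365 / 5   # Python 2 integer division
    #   (8, 73)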

    for year in np.arange(start_year, end_year + 1): 

        all_pentads =  np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
        all_pentads.mask =  np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])

        # read in a year's worth of 3 hrly data
        for v, var in enumerate(OBS_ORDER):
            # arrays too massive to process all variables at once.
            print var.name
       
            var_3hrlys = read_data(settings, suffix, var.name, year, grid_lats, grid_lons, period, N_OBS_PER_DAY)

            # reshape to days x 3hrly obs (365(366),8,180,360)
            var_3hrlys = var_3hrlys.reshape(-1, N_OBS_PER_DAY, var_3hrlys.shape[1], var_3hrlys.shape[2])

            # process the leap-year if appropriate
            if calendar.isleap(year):
                var_3hrlys, incl_feb29th  = process_february(var_3hrlys, doMask = True)
            else:
                assert var_3hrlys.shape[0] == 365

            # get pentadly values for each timestep (73,5,8,180,360)
            shape = var_3hrlys.shape
            var_3hrlys = var_3hrlys.reshape(-1, 5, shape[-3], shape[-2], shape[-1]) # n_pentads x days x hrs x lat x lon
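            # Shape bookkeeping for a non-leap year on a hypothetical 1x1 global grid
            # (the real sizes come from grid_lats/grid_lons):
            #   (2920, 180, 360)                 raw 3 hrly fields (365 days * 8 timestamps)
            #   reshape(-1, 8, 180, 360)     --> (365, 8, 180, 360)    days x hrs x lat x lon
            #   reshape(-1, 5, 8, 180, 360)  --> (73, 5, 8, 180, 360)  pentads x days x hrs x lat x lon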

            n_days_per_timestamp = np.ma.count(var_3hrlys, axis = 1) # n_pentads x hrs x lat x lon

            # get average at each timestamp across the pentad - so have N_OBS_PER_DAY averaged values per pentad
            if settings.doMedian:
                pentad_3hrly_grid = utils.bn_median(var_3hrlys, axis = 1) # n_pentads x hrs x lat x lon
            else:
                pentad_3hrly_grid = np.ma.mean(var_3hrlys, axis = 1) # n_pentads x hrs x lat x lon

            pentad_3hrly_grid.mask[n_days_per_timestamp < N_OBS_OVER_DAYS] = True # mask where fewer than N_OBS_OVER_DAYS days have values
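            # A minimal sketch of the count-then-mask step with hypothetical values
            # (-1. standing in for missing; the code above assigns into .mask directly,
            # which assumes a full mask array - the np.ma.masked idiom is equivalent):
            #   >>> import numpy as np
            #   >>> x = np.ma.masked_values([[1., -1.], [2., 3.]], -1.)
            #   >>> np.ma.count(x, axis = 0)          # days reporting at each point
            #   array([2, 1])
            #   >>> m = np.ma.mean(x, axis = 0)
            #   >>> m[np.ma.count(x, axis = 0) < 2] = np.ma.masked   # cf. N_OBS_OVER_DAYS = 2
            #   >>> print m
            #   [1.5 --]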
            
            # clear up memory
            del var_3hrlys
            gc.collect()

            # the pentad containing Feb 29th is the 12th in the year (index 11 in zero-based array terms), covering Feb 25th to Mar 1st
            if calendar.isleap(year):
                #  overwrite this with the me(di)an of a 6-day pentad
                if settings.doMedian:
                    pentad_3hrly_grid[11, :, :, :] = utils.bn_median(incl_feb29th, axis = 0)
                else:
                    pentad_3hrly_grid[11, :, :, :] = np.ma.mean(incl_feb29th, axis = 0)

                feb_n_days_per_timestamp = np.ma.count(incl_feb29th, axis = 0)
                pentad_3hrly_grid.mask[11, :, :, :][feb_n_days_per_timestamp < N_OBS_OVER_DAYS] = True
                n_days_per_timestamp[11, :, :, :] = feb_n_days_per_timestamp

                print "processed Feb 29th"

            if settings.plots and v == 0:
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(n_days_per_timestamp.reshape(-1), bins = np.arange(-1,7), align = "left", log = True, rwidth=0.5)
                plt.axvline(x = N_OBS_OVER_DAYS-0.5, color = "r")       
                plt.title("Number of days with obs at each 3hrly timestamp (over entire year)")
                plt.xlabel("Number of days (max = 5)")
                plt.ylabel("Frequency (log scale)")
                plt.savefig(settings.PLOT_LOCATION + "pentads_n_days_{}_{}_{}.png".format(year, period, suffix))

            # get single pentad values
            n_hrs_per_pentad = np.ma.count(pentad_3hrly_grid, axis = 1) # get the number of pentad-hours present in each pentad
            n_grids_per_pentad = np.sum(n_days_per_timestamp, axis = 1) # get the number of 3hrly 1x1 grids included per pentad 1x1

            # get average at each timestamp across the pentad - so have N_OBS_PER_DAY values per pentad
            if settings.doMedian:
                pentad_grid = utils.bn_median(pentad_3hrly_grid, axis = 1)
            else:
                pentad_grid = np.ma.mean(pentad_3hrly_grid, axis = 1)

            if period == "all":
# KW are you sure this should be n_hrs_per_pentad and not n_grids_per_pentad here? I think it should
                pentad_grid.mask[n_hrs_per_pentad < N_OBS_OVER_PENTAD] = True # mask where fewer than N_OBS_OVER_PENTAD hours have values
            else:
# KW are you sure this should be n_hrs_per_pentad and not n_grids_per_pentad here? I think it should
                pentad_grid.mask[n_hrs_per_pentad < (N_OBS_OVER_PENTAD/2.)] = True # day/night periods sample at most half the timestamps, so halve the threshold
            
            all_pentads[v, :, :, :] = pentad_grid

            # diagnostics plots of obs/grids per pentad
            if settings.plots and v == 0:
                plt.clf()
                plt.hist(n_hrs_per_pentad.reshape(-1), bins = np.arange(-1,10), align = "left", log = True, rwidth=0.5)
                if period == "all":
                    plt.axvline(x = N_OBS_OVER_PENTAD-0.5, color = "r")       
                else:
                    plt.axvline(x = (N_OBS_OVER_PENTAD/2.)-0.5, color = "r")       
                plt.title("Number of hrs with obs in each pentad (over entire year)")
                plt.xlabel("Number of days (max = 8)")
                plt.ylabel("Frequency (log scale)")
                plt.savefig(settings.PLOT_LOCATION + "pentads_n_hrs_{}_{}_{}.png".format(year, period, suffix))

            # clear up memory
            del pentad_3hrly_grid
            del pentad_grid
            gc.collect()

        # done all main variables.  Now for number of observations
        print "n_obs"
        n_obs = read_data(settings, suffix, "n_obs", year, grid_lats, grid_lons, period, N_OBS_PER_DAY)
        # KW so we've gone from 8*365 hrs, lats, lons to 365, 8, lats, lons
        n_obs = n_obs.reshape(-1, N_OBS_PER_DAY, n_obs.shape[1], n_obs.shape[2])
        if calendar.isleap(year):
            n_obs, incl_feb29th  = process_february(n_obs, doMask = True)
        else:
            assert n_obs.shape[0] == 365    

        shape = n_obs.shape
        # KW so we're now at pentads, 5 days, 8 hours, lats, lons
        n_obs = n_obs.reshape(-1, 5, shape[-3], shape[-2], shape[-1]) # pentads x days x hours x lat x lon

        # KW This should sum over the 5 days, leaving pentads, 8 hrs, lats, lons
        # KW n_obs uses -1 for missing data, so a plain (unmasked) sum would be wrong -
        # the missing entries must be masked; the fill_value is set to -1 so that
        # missing totals are also written out as -1
        n_obs.fill_value = -1
        n_obs_per_3hrly_pentad = np.ma.sum(n_obs, axis = 1)
        n_obs_per_3hrly_pentad.fill_value = -1
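        # Note fill_value only controls what masked entries become on output (via
        # filled()); it does not itself mask the -1s. A sketch with hypothetical counts:
        #   >>> import numpy as np
        #   >>> counts = np.ma.masked_equal([3, -1, 2], -1)  # mask the missing -1s
        #   >>> np.ma.sum(counts)                            # masked entries ignored
        #   5
        #   >>> counts.fill_value = -1
        #   >>> print counts.filled()                        # missing written back as -1
        #   [ 3 -1  2]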

        if calendar.isleap(year):
            n_obs_per_3hrly_pentad[11, :, :, :] = np.ma.sum(incl_feb29th, axis = 0)

        n_obs_per_pentad = np.ma.sum(n_obs_per_3hrly_pentad, axis = 1)

        # and write out
        times = utils.TimeVar("time", "time since 1/1/{} in hours".format(year), "hours", "time")
        times.data = np.arange(0, all_pentads.shape[1]) * 5 * 24 # start hour of each pentad
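        # A quick check of the resulting axis - hours since 1 Jan at each pentad start:
        #   >>> import numpy as np
        #   >>> (np.arange(0, 73) * 5 * 24)[:4]
        #   array([  0, 120, 240, 360])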

        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(year, period, suffix)
        
        utils.netcdf_write(out_filename, all_pentads, n_grids_per_pentad, n_obs_per_pentad, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")


    return # do_conversion