import datetime as dt
import numpy as np
import sys
import os
import gc
import copy
import argparse
import calendar

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import netCDF4 as ncdf

import utils
import plot_qc_diagnostics
# "mds" supplies the raw-file field delimiters used in do_gridding(); the
# module name below is an assumption - it is not shown in this section
import MDS_RWtools as mds
import set_paths_and_vars
defaults = set_paths_and_vars.set()


OBS_ORDER = utils.make_MetVars(defaults.mdi, multiplier = False) 

# what size grid (lat/lon)
DELTA_LAT = 5
DELTA_LON = 5

# grid timestep in hours (8 x 3-hourly slots per day - see do_gridding)
DELTA_HOUR = 3

# minimum number of days with data required in a pentad (KW: "THIS IS ACTUALLY 2" - see do_conversion)
N_OBS = 2

# set to 1 for test mode - writes the interim 1x1 3hrly and daily files in do_gridding
SwitchOutput = 0

# set up the grid
# KATE modified - flipped the lats to go 90 to -90
grid_lats = np.arange(90 - DELTA_LAT, -90 - DELTA_LAT, -DELTA_LAT)
#grid_lats = np.arange(-90 + DELTA_LAT, 90 + DELTA_LAT, DELTA_LAT)
# end
grid_lons = np.arange(-180 + DELTA_LON, 180 + DELTA_LON, DELTA_LON)
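# resulting arrays: grid_lats runs 85 .. -90 (36 values), grid_lons -175 .. 180 (72 values)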


# subroutine start
def do_conversion(start_year=defaults.START_YEAR, end_year=defaults.END_YEAR, period="all", doBC=False, doQC=True):
    """
    Convert dailies to pentads 1x1

    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to process - day, night or all
    :param bool doBC: work on the bias corrected data
    :param bool doQC: incorporate the QC flags or not

    :returns:
    """
    settings = set_paths_and_vars.set(doBC=doBC, doQC=doQC)

    OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)

    for year in np.arange(start_year, end_year + 1):

        # set up empty data array
        all_dailies = np.ma.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.mask = np.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.fill_value = settings.mdi

        all_n_obs = np.zeros([utils.days_in_year(year), len(grid_lats), len(grid_lons)])
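        # all_dailies is (variable, day of year, lat, lon); all_n_obs drops the variable axis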

        year_start = dt.datetime(year, 1, 1, 0, 0)

        for month in np.arange(12) + 1:
            print year, month

            month_start = utils.day_of_year(year, month)
            month_end = month_start + calendar.monthrange(year, month)[1]

            filename = "{}/{}_1x1_daily_{}{:02d}_{}.nc".format(
                settings.DATA_LOCATION, settings.OUTROOT, year, month, period
            )

            ncdf_file = ncdf.Dataset(filename, "r", format="NETCDF4")

            for v, var in enumerate(OBS_ORDER):

                if month == 12:
                    # run to end of year if december
                    all_dailies[v, month_start:, :, :] = ncdf_file.variables[var.name][:]
                else:
                    all_dailies[v, month_start:month_end, :, :] = ncdf_file.variables[var.name][:]

            # now get number of observations
            if month == 12:
                all_n_obs[month_start:, :, :] = ncdf_file.variables["n_obs"][:]
            else:
                all_n_obs[month_start:month_end, :, :] = ncdf_file.variables["n_obs"][:]

        if calendar.isleap(year):
            assert all_dailies.shape[1] == 366

            # extract 6-day pentad
            incl_feb29th = all_dailies[:, 55:61, :, :]
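            # (0-based day indices 55:61 = Feb 25th to Mar 1st, the six days
            #  spanning the pentad that contains Feb 29th; index 59 = Feb 29th)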

            # remove the data of Feb 29th from array
            # np.ma.delete doesn't exist, so have to copy mask separately
            mask = all_dailies.mask
            all_dailies = np.delete(all_dailies, 59, 1)
            mask = np.delete(mask, 59, 1)
            all_dailies = np.ma.array(all_dailies, mask=mask)
            del mask

            # number of observations
            incl_feb29th_n_obs = all_n_obs[55:61, :, :]
            all_n_obs = np.delete(all_n_obs, 59, 0)

        else:
            assert all_dailies.shape[1] == 365

        shape = all_dailies.shape
        all_dailies = all_dailies.reshape(shape[0], -1, 5, shape[-2], shape[-1])
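        # now (n_vars, 73, 5, n_lats, n_lons) - 365 days split into 73 five-day pentads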

        n_days_per_pentad = np.ma.count(all_dailies, axis=2)

        if settings.doMedian:
            pentad_grid = utils.bn_median(all_dailies, axis=2)
        else:
            pentad_grid = np.ma.mean(all_dailies, axis=2)

        # clear up memory
        del all_dailies
        gc.collect()

        all_n_obs = all_n_obs.reshape(-1, 5, shape[-2], shape[-1])
        all_n_obs = np.sum(all_n_obs, axis=1)

        pentad_grid.mask[
            n_days_per_pentad < N_OBS
        ] = True  # mask where fewer than 2 days have values # KW THIS IS ACTUALLY 2 - WHICH I THINK IS GOOD

        # the pentad containing Feb 29th is the 12th in the year (index 11)
        if calendar.isleap(year):
            #  overwrite this with the me(di)an of a 6-day pentad
            if settings.doMedian:
                pentad_grid[:, 11, :, :] = utils.bn_median(incl_feb29th, axis=1)
            else:
                pentad_grid[:, 11, :, :] = np.ma.mean(incl_feb29th, axis=1)

            feb_n_days_per_pentad = np.ma.count(incl_feb29th, axis=1)
            pentad_grid.mask[:, 11, :, :][feb_n_days_per_pentad < N_OBS] = True
            n_days_per_pentad[:, 11, :, :] = feb_n_days_per_pentad

            all_n_obs[11, :, :] = np.sum(incl_feb29th_n_obs, axis=0)

            print "processed Feb 29th"

        times = utils.TimeVar("time", "time since 1/1/{} in hours".format(year), "hours", "time")
        times.data = np.arange(0, pentad_grid.shape[1]) * 5 * 24
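        # hours since Jan 1st 00:00 at the start of each pentad: 0, 120, 240, ...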

        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_{}_{}.nc".format(year, period)

        utils.netcdf_write(
            out_filename,
            pentad_grid,
            n_days_per_pentad[0],
            all_n_obs,
            OBS_ORDER,
            grid_lats,
            grid_lons,
            times,
            frequency="P",
        )

        del pentad_grid
        del all_n_obs
        del n_days_per_pentad
        gc.collect()

    return  # do_conversion
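

#*********************************************
# NOTE: make_timeseries() below calls mask_and_normalise_weights(), which is
# not defined in this section.  A minimal sketch is given here, assuming
# (from the comment at the call site) that it broadcasts the cos(latitude)
# weights over the data, masks them where the data are masked, and
# normalises each time slice to sum to one.  Remove this if the full module
# already provides the helper.
def mask_and_normalise_weights(weights, data):

    # broadcast the (lat, lon) weights up to the (time, lat, lon) data shape
    full_weights = np.ma.array(np.broadcast_to(weights, data.shape).copy())
    full_weights.mask = np.ma.getmaskarray(data)

    # normalise so the unmasked weights of each time slice sum to one
    slice_totals = np.ma.sum(np.ma.sum(full_weights, axis=-1), axis=-1)

    return full_weights / slice_totals[:, None, None]  # mask_and_normalise_weights
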
def make_timeseries(
    suffix="relax",
    doQC=False,
    doQC1it=False,
    doQC2it=False,
    doQC3it=False,
    doBC=False,
    doBCtotal=False,
    doBChgt=False,
    doBCscn=False,
):
    # def make_timeseries(suffix = "relax", doQC = False, doBC = False):
    # end
    """
    Make the timeseries - plots and netCDF files

    :param str suffix: "relax" or "strict" criteria
    :param bool doQC: incorporate the QC flags or not
# KATE modified
    :param bool doQC1it: incorporate the first iteration QC flags or not
    :param bool doQC2it: incorporate the second iteration QC flags or not
    :param bool doQC3it: incorporate the third iteration QC flags or not
# end
    :param bool doBC: work on the bias corrected data
# KATE modified
    :param bool doBCtotal: work on the full bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data
# end

    :returns:
    """
    # KATE modified
    settings = set_paths_and_vars.set(
        doBC=doBC,
        doBCtotal=doBCtotal,
        doBChgt=doBChgt,
        doBCscn=doBCscn,
        doQC=doQC,
        doQC1it=doQC1it,
        doQC2it=doQC2it,
        doQC3it=doQC3it,
    )
    # settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)
    # end

    print "Do QC = {}".format(doQC)
    # KATE modified
    print "Do QC1it = {}".format(doQC1it)
    print "Do QC2it = {}".format(doQC2it)
    print "Do QC3it = {}".format(doQC3it)
    # end
    print "Do BC = {}".format(doBC)
    # KATE modified
    print "Do BCtotal = {}".format(doBCtotal)
    print "Do BChgt = {}".format(doBChgt)
    print "Do BCscn = {}".format(doBCscn)
    # end

    # monthly -> annual

    watermarkstring = (
        "/".join(os.getcwd().split("/")[4:])
        + "/"
        + os.path.basename(__file__)
        + "   "
        + dt.datetime.strftime(dt.datetime.now(), "%d-%b-%Y %H:%M")
    )

    # run on the actuals (which include anomalies from ERA) and the anomalies (calculated from obs-actuals, but also include the anomalies from ERA)
    # KATE modified to add new file name bit '_renorm19812010'
    for version in ["", "_renorm19812010_anomalies"]:
        # for version in ["", "_anomalies"]:
        # end
        if version == "":
            print "5x5 monthly Standard"
        elif "anomalies" in version:
            print "5x5 monthly Anomalies"

        for period in ["both", "day", "night"]:
            print period

            filename = "{}/{}_5x5_monthly{}_from_daily_{}_{}.nc".format(
                settings.DATA_LOCATION, settings.OUTROOT, version, period, suffix
            )

            print filename
            ncdf_file = ncdf.Dataset(filename, "r", format="NETCDF4")

            lat_centres = ncdf_file.variables["latitude"]
            lon_centres = ncdf_file.variables["longitude"]

            n_obs = utils.set_MetVar_attributes(
                "n_obs", "Number of Observations", "Number of Observations", 1, -1, np.dtype("int64"), 0
            )
            OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)
            OBS_ORDER += [n_obs]

            for v, var in enumerate(OBS_ORDER):
                print var.name

                var.data = ncdf_file.variables[var.name][:]

                # make annual and monthly timeseries

                mesh_lon, mesh_lat = np.meshgrid(lon_centres, lat_centres)
                cosines = np.cos(np.radians(mesh_lat))

                full_cosines = mask_and_normalise_weights(cosines, var.data)
                # masked weights now sum to one for each field

                if var.name == "n_obs":
                    weighted_data = var.data
                else:
                    weighted_data = var.data * full_cosines

                plot_values = np.zeros(weighted_data.shape[0])
                plot_times = []
                for y in range(weighted_data.shape[0]):

                    plot_values[y] = np.ma.sum(weighted_data[y])

                    plot_times += [dt.datetime(settings.START_YEAR + (y / 12), 1 + (y % 12), 1, 0, 0)]
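                    # Python 2 integer division: month index y maps to year offset y / 12 and calendar month 1 + (y % 12)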

                # plot the monthly data
                plt.clf()
                plt.plot(plot_times, plot_values, "r-", label="Monthly")

                var.mdata = plot_values
                monthly_times = plot_times

                # and annual
                plot_values = plot_values.reshape(-1, 12)
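                # now (n_years, 12): one row per year, ready for the annual mean/sum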

                if var.name != "n_obs":
                    plot_values = np.mean(plot_values, axis=1)
                    plot_times = [dt.datetime(settings.START_YEAR + y, 7, 1) for y in range(plot_values.shape[0])]
                    plt.plot(plot_times, plot_values, "b-", label="Annual")

                    plt.ylabel(var.units)

                else:
                    # if n_obs, then have second x-axis
                    plot_values = np.sum(plot_values, axis=1)
                    plot_times = [dt.datetime(settings.START_YEAR + y, 7, 1) for y in range(plot_values.shape[0])]

                    # finish off first axis
                    ax1 = plt.gca()
                    ax1.set_ylabel("Monthly", color="r")
                    for tl in ax1.get_yticklabels():
                        tl.set_color("r")

                    # add second axis
                    ax2 = ax1.twinx()
                    ax2.plot(plot_times, plot_values, "b-", label="Annual")
                    ax2.set_ylabel("Annual", color="b")
                    for tl in ax2.get_yticklabels():
                        tl.set_color("b")

                var.adata = plot_values
                annual_times = plot_times

                # and prettify the plot
                plt.title(" ".join([x.capitalize() for x in var.name.split("_")]))
                if var.name != "n_obs":
                    plt.legend()
                plt.figtext(0.01, 0.01, watermarkstring, size=6)

                plt.savefig(
                    "{}/{}_5x5_monthly{}_from_daily_{}_{}_ts.png".format(
                        settings.PLOT_LOCATION, settings.OUTROOT, version, period, var.name
                    )
                )

            # clean up
            ncdf_file.close()
            del weighted_data
            del full_cosines
            gc.collect()

            # write output files (annual and monthly)
            filename = "{}/{}_5x5_monthly{}_from_daily_{}_{}_ts_annual.nc".format(
                settings.DATA_LOCATION, settings.OUTROOT, version, period, suffix
            )

            if os.path.exists(filename):
                os.remove(filename)
            write_ncdf_ts(annual_times, OBS_ORDER, filename, annual=True, do_zip=True)

            filename = "{}/{}_5x5_monthly{}_from_daily_{}_{}_ts_monthly.nc".format(
                settings.DATA_LOCATION, settings.OUTROOT, version, period, suffix
            )

            if os.path.exists(filename):
                os.remove(filename)
            write_ncdf_ts(monthly_times, OBS_ORDER, filename, monthly=True, do_zip=True)

            # clean up
            del plot_values
            del plot_times
            del OBS_ORDER
            gc.collect()

    # not activated at present
    pentads = False
    if pentads:
        # pentad -> annual
        OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)

        for v, var in enumerate(OBS_ORDER):
            print var.name

            period = "both"  # not set in this block - would otherwise be left over from the monthly loop above
            filename = "{}/{}_1x1_pentads_from_3hrly_{}_{}_{}.nc".format(
                settings.DATA_LOCATION, settings.OUTROOT, var.name, period, suffix
            )

            ncdf_file = ncdf.Dataset(filename, "r", format="NETCDF4")

            lat_centres = ncdf_file.variables["latitude"]
            lon_centres = ncdf_file.variables["longitude"]

            data_shape = ncdf_file.variables[var.name][:].shape

            # pentads
            mesh_lon, mesh_lat = np.meshgrid(lon_centres, lat_centres)
            cosines = np.cos(np.radians(mesh_lat))

            plot_values = np.zeros(data_shape[0])
            plot_times = []
            year = copy.deepcopy(settings.START_YEAR)

            for ts in range(data_shape[0]):

                data = ncdf_file.variables[var.name][ts]

                full_cosines = np.ma.array(cosines)
                full_cosines.mask = data.mask
                full_cosines = full_cosines / np.sum(full_cosines)
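                # inline equivalent of mask_and_normalise_weights for a single pentad field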

                weighted_data = data * full_cosines

                plot_values[ts] = np.ma.sum(weighted_data)

                if calendar.isleap(year) and ((ts + 1) * 5) % 365 > 60:
                    # account for 6 day pentad in leap years
                    plot_times += [dt.datetime(year, 1, 1, 0, 0) + dt.timedelta(days=((ts + 1) * 5) % 365 + 1)]
                else:
                    plot_times += [dt.datetime(year, 1, 1, 0, 0) + dt.timedelta(days=((ts + 1) * 5) % 365)]

                print year, ts, plot_times[-1]

                if ((ts + 1) * 5) % 365 == 0:
                    year += 1

            plt.clf()
            plt.plot(plot_times, plot_values, "r-")
            plt.title(var.name)
            plt.ylabel(var.units)

            # annual

            plot_values = plot_values.reshape(-1, 73)
            plot_values = np.mean(plot_values, axis=1)

            plt.plot(plot_times[36::73], plot_values, "b-")

            plt.savefig("{}/{}_pentads_all.png".format(settings.PLOT_LOCATION, var.name))

            raw_input("check")

    return  # make_timeseries


def combine_files(suffix = "relax", pentads = False, do3hr = False, months = False, daily = False, start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, period = "both", 
                  doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False):
#def combine_files(suffix = "relax", pentads = False, do3hr = False, months = False, daily = False, start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, period = "both", doQC = False, doBC = False):
# end
    '''
    Combine the files, first the pentads 1x1, then the monthlies 5x5

    :param str suffix: "relax" or "strict" criteria
    :param bool pentads: run on pentads
    :param bool do3hr: run on pentads created from 3hrly data (if False then run on those from daily)
    :param bool months: run on 5x5 monthly data
    :param bool daily: run on monthlies created direct from dailies (if False then run on those from 1x1 monthlies)
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param int start_month: start month to process
    :param int end_month: end month to process
    :param str period: which period to do day/night/both?
    :param bool doQC: incorporate the QC flags or not
# KATE modified
    :param bool doQC1it: incorporate the 1st iteration QC flags or not
    :param bool doQC2it: incorporate the 2nd iteration QC flags or not
    :param bool doQC3it: incorporate the 3rd iteration QC flags or not
# end
    :param bool doBC: work on the bias corrected data
# KATE modified
    :param bool doBCtotal: work on the bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data
# end

    :returns:
    '''

# KATE modified
    settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it)
    #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)
# end
    # pentads
    if pentads:

        OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier = False)
        # KW make OBS_ORDER only the actual variables - remove anomalies
        NEWOBS_ORDER = []
        for v, var in enumerate(OBS_ORDER):
            if "anomalies" not in var.name:
                NEWOBS_ORDER.append(var)
        del OBS_ORDER
        OBS_ORDER = np.copy(NEWOBS_ORDER)
        del NEWOBS_ORDER     

        # set up the grids
        DELTA=1
        grid_lats = np.arange(-90+DELTA, 90+DELTA, DELTA)
        grid_lons = np.arange(-180+DELTA, 180+DELTA, DELTA)
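        # 1x1 boxes: 180 latitude values (-89 to 90) and 360 longitude values (-179 to 180)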

        Nyears = end_year - start_year + 1

        # read in each variable - memory issues
        for v, var in enumerate(OBS_ORDER):

            print var.name

            all_pentads = np.ma.zeros((1, Nyears, 73, len(grid_lats), len(grid_lons)))
            all_pentads.mask = np.ones((1, Nyears, 73, len(grid_lats), len(grid_lons)))
            all_pentads.fill_value = settings.mdi

            n_obs = np.zeros((Nyears, 73, len(grid_lats), len(grid_lons)))
            n_grids = np.zeros((Nyears, 73, len(grid_lats), len(grid_lons)))


            for y, year in enumerate(np.arange(start_year, end_year + 1)):

                if do3hr:
                    filename = settings.DATA_LOCATION + "{}_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)
                else:
                    filename = settings.DATA_LOCATION + "{}_1x1_pentad_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)

                ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4')

                time = ncdf_file.variables["time"]

                try:
                    assert time.long_name == "time since 1/1/{} in hours".format(year)

                except AssertionError:
                    print "time units are not as expected."
                    print "    expected time since 1/1/{} in hours".format(year)
                    print "    got {}".format(time.long_name)
                    sys.exit()

                all_pentads[0, y, :, :, :] = ncdf_file.variables[var.name][:]

                n_obs[y, :, :, :] = ncdf_file.variables["n_obs"][:]
                n_grids[y, :, :, :] = ncdf_file.variables["n_grids"][:]

                print year

                if y == 0 and period == "both":
                    lat_centres = ncdf_file.variables["latitude"]
# KATE modified - this results in lats that go from 92.5 to -82.5 or 90.5 to -88.5 so I've switched the + for a -
                    latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2.
                    #latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2.
# end
                    lon_centres = ncdf_file.variables["longitude"]
                    longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.

                ncdf_file.close()

            all_pentads = all_pentads.reshape(1, -1, len(grid_lats), len(grid_lons))
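            # collapse (1, Nyears, 73, lat, lon) to (1, Nyears*73, lat, lon) - one continuous pentad axis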

            # sort the times
            times = utils.TimeVar("time", "time since 1/1/{} in pentads".format(start_year), "pentads", "time")
            times.data = np.arange(all_pentads.shape[1])

            # and write file
            if do3hr:
                out_filename = settings.DATA_LOCATION + "{}_1x1_pentads_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, var.name, period, suffix)
            else:
                out_filename = settings.DATA_LOCATION + "{}_1x1_pentads_{}_{}_{}.nc".format(settings.OUTROOT, var.name, period, suffix)

            if period == "both":
                utils.netcdf_write(out_filename, all_pentads, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "P", single = var)
            else:
                utils.netcdf_write(out_filename, all_pentads, n_grids, n_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P", single = var)


        # Reset the data holding arrays and objects

        del OBS_ORDER
        gc.collect()

    if months:

        OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier = False)

        #*****************************
        # monthlies
        for y, year in enumerate(np.arange(start_year, end_year + 1)): 
            print year

            for month in np.arange(start_month, end_month + 1):
                print "   {}".format(month)

                if daily:
                    filename = settings.DATA_LOCATION + "{}_5x5_monthly_from_daily_{}{:02d}_{}_{}.nc".format(settings.OUTROOT, year, month, period, suffix)
                else:
                    filename = settings.DATA_LOCATION + "{}_5x5_monthly_{}{:02d}_{}_{}.nc".format(settings.OUTROOT, year, month, period, suffix)

                ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4')

                time = ncdf_file.variables["time"]

                try:
                    assert time.long_name == "time since 1/{}/{} in hours".format(month, year)

                except AssertionError:
                    print "time units are not as expected."
                    print "    expected time since 1/{}/{} in hours".format(month, year)
                    print "    got {}".format(time.long_name)
                    sys.exit()

                for v, var in enumerate(OBS_ORDER):

                    nc_var = ncdf_file.variables[var.name]

                    try:
                        var.data = utils.ma_append(var.data, nc_var[:], axis = 0)

                        if v == 0:
                            n_obs = utils.ma_append(n_obs, ncdf_file.variables["n_obs"][:], axis = 0)
                            n_grids = utils.ma_append(n_grids, ncdf_file.variables["n_grids"][:], axis = 0)

                    except AttributeError:
                        var.data = nc_var[:]
                        var.data.fill_value = nc_var.missing_value

                        if v == 0:
                            n_obs = ncdf_file.variables["n_obs"][:]
                            n_grids = ncdf_file.variables["n_grids"][:]


                if y == 0 and month == start_month and period == "both":
                    lat_centres = ncdf_file.variables["latitude"]
                    latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2.

                    lon_centres = ncdf_file.variables["longitude"]
                    longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.

# KATE modified - added an extra loop so that we can flip the latitudes for day and night too
                if y == 0 and month == start_month and period != "both":
                    lat_centres = ncdf_file.variables["latitude"]
                    # THIS IS - RATHER THAN + READY TO FLIP THE LATS
                    latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2.

                    lon_centres = ncdf_file.variables["longitude"]
                    longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.
# end                    
                ncdf_file.close()
            
        # write out into big array for netCDF file
        all_data = np.ma.zeros((len(OBS_ORDER), var.data.shape[0], var.data.shape[1], var.data.shape[2]))
        all_data.mask = np.zeros((len(OBS_ORDER), var.data.shape[0], var.data.shape[1], var.data.shape[2]))

        for v, var in enumerate(OBS_ORDER):
            all_data[v, :, :, :] = var.data

# KATE modified - switching the latitudes on day and night data for consistency with both
        if period == "day" or period == "night":
            # invert latitudes
            latitudes = latitudes[::-1]
            all_data = all_data[:,:,::-1,:] # variable, time, latitude, longitude
# end

        all_data.fill_value = var.data.fill_value

        # extra stuff for writing
# KATE modified - no longer need grid5 as we're using latitudes and longitudes
        #DELTA=5
        #grid5_lats = np.arange(-90+DELTA, 90+DELTA, DELTA)
        #grid5_lons = np.arange(-180+DELTA, 180+DELTA, DELTA)
# end
# KATE modified - START_YEAR not defined, should be start_year
        times = utils.TimeVar("time", "time since 1/1/{} in months".format(start_year), "months", "time")
        #times = utils.TimeVar("time", "time since 1/1/{} in months".format(START_YEAR), "months", "time")
# end
        times.data = np.arange(var.data.shape[0])

        # and write file
        if daily:
            out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_from_daily_{}_{}.nc".format(period, suffix)
        else:
            out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_{}_{}.nc".format(period, suffix)

# KATE modified - now always using latitudes and longitudes
        utils.netcdf_write(out_filename, all_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "Y")
        #if period == "both":
        #    utils.netcdf_write(out_filename, all_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "Y")
        #else:
        #    utils.netcdf_write(out_filename, all_data, n_grids, n_obs, OBS_ORDER, grid5_lats, grid5_lons, times, frequency = "Y")
# end
        

    return # combine_files


def do_merge(fileroot, mdi, suffix = "relax", clims = False, doMedian = False):
    '''
    Merge the _day and _night files

    Do a np.ma.mean or median for the data and a sum for the n_obs and n_grids

    Output with a _both suffix

    :param str fileroot: root for filenames
    :param flt mdi: missing data indicator
    :param str suffix: "relax" or "strict" criteria
    :param bool clims: if climatologies then don't try and process anomalies
    :param bool doMedian: use the median instead of the mean when merging day and night
    '''

    OBS_ORDER = utils.make_MetVars(mdi, multiplier = False)

    if clims:
        # KW make OBS_ORDER only the actual variables - remove anomalies
        NEWOBS_ORDER = []
        for v, var in enumerate(OBS_ORDER):
            if "anomalies" not in var.name:
                NEWOBS_ORDER.append(var)
        del OBS_ORDER
        OBS_ORDER = np.copy(NEWOBS_ORDER)
        del NEWOBS_ORDER     


    # spin through both periods
    for p, period in enumerate(["day", "night"]):
        print period
        
        # go through the variables
        for v, var in enumerate(OBS_ORDER):

            print "   {}".format(var.name)

            ncdf_file = ncdf.Dataset("{}_{}_{}.nc".format(fileroot, period, suffix),'r', format='NETCDF4')

            if v == 0 and p == 0:

                shape = list(ncdf_file.variables[var.name][:].shape)
                shape.insert(0, len(OBS_ORDER)+2) # add all the variables
                shape.insert(0, 2) # insert extra dimension to allow day + night
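                # all_data: (period, variable, time, lat, lon) with the last two
                # variable slots reserved for n_grids and n_obs (filled below)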

                all_data = np.ma.zeros(shape)

                all_data[p, v] = ncdf_file.variables[var.name][:]

                # get lats/lons of box centres
                lat_centres = ncdf_file.variables["latitude"]
# KATE modified - this results in lats that go from 92.5 to -82.5 so I've switched the + for a -
                latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2.
                #latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2.
# end
                lon_centres = ncdf_file.variables["longitude"]
                longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.

                # get times - make a dummy object and then populate attributes
                times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(1, 1973), "hours", "time")

                times.long_name = ncdf_file.variables["time"].long_name
                times.standard_name = ncdf_file.variables["time"].standard_name
                times.units = ncdf_file.variables["time"].units

                times.data = ncdf_file.variables["time"][:]

            else:
                all_data[p, v] = ncdf_file.variables[var.name][:]

        # and get n_obs and n_grids
        all_data[p, -2] = ncdf_file.variables["n_grids"][:]
        all_data[p, -1] = ncdf_file.variables["n_obs"][:]

    # invert latitudes
    latitudes = latitudes[::-1]
    all_data = all_data[:,:,:,::-1,:]

    # got all the info, now merge
    if doMedian:
        merged_data = utils.bn_median(all_data[:, :len(OBS_ORDER)], axis = 0)
    else:
        merged_data = np.ma.mean(all_data[:, :len(OBS_ORDER)], axis = 0)

    # and process the grids and observations (split off here so have incorporated latitude inversion)
    n_grids = np.ma.sum(all_data[:, -2], axis = 0)
    n_obs = np.ma.sum(all_data[:, -1], axis = 0)
    n_obs.fill_value = -1
    n_grids.fill_value = -1

    # write the output file
    utils.netcdf_write("{}_{}_{}.nc".format(fileroot, "both", suffix), merged_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "P")

    # test distribution of obs with grid boxes
    outfile = open("{}_{}_{}.txt".format(fileroot.split("/")[-1], "both", suffix), "w")
    utils.boxes_with_n_obs(outfile, n_obs, merged_data[0], "")


    return # do_merge


def do_gridding(suffix = "relax", start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, 
                doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, doSST_SLP = False, 
                doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False, doUncert = False):
#def do_gridding(suffix = "relax", start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, start_month = 1, end_month = 12, doQC = False, doSST_SLP = False, doBC = False, doUncert = False):
# end
    '''
    Do the gridding, first to 3hrly 1x1, then to daily 1x1 and finally monthly 1x1 and 5x5

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param int start_month: start month to process
    :param int end_month: end month to process
    :param bool doQC: incorporate the QC flags or not
    :param bool doQC1it: incorporate the first iteration (no buddy) QC flags or not
    :param bool doQC2it: incorporate the second iteration (no buddy) QC flags or not
    :param bool doQC3it: incorporate the third iteration (buddy) QC flags or not
    :param bool doSST_SLP: process additional variables or not
    :param bool doBC: work on the bias corrected data
    :param bool doBCtotal: work on the full bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data
    :param bool doUncert: work on files with uncertainty information (not currently used)

    :returns:
    '''
# KATE modified    
    settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it)
    #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)
# end


# KATE modified  - added other BC options  
#    if doBC:
    if doBC | doBCtotal | doBChgt | doBCscn:
# end
        fields = mds.TheDelimitersExt # extended (BC)
    else:
        fields = mds.TheDelimitersStd # Standard

# KATE modified  - added other BC options  
#    OBS_ORDER = utils.make_MetVars(settings.mdi, doSST_SLP = doSST_SLP, multiplier = True, doBC = doBC) # ensure that convert from raw format at writing stage with multiplier
    OBS_ORDER = utils.make_MetVars(settings.mdi, doSST_SLP = doSST_SLP, multiplier = True, doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn) # ensure that convert from raw format at writing stage with multiplier
# end

    # KW switching between 4 ('_strict') for climatology build and 2 for anomaly build ('_relax') - added subscripts to files
    if suffix == "relax":
        N_OBS_DAY = 2 # KW ok for anomalies but this was meant to be 4 for dailies_all? and 2 for dailies_night/day?
        N_OBS_FRAC_MONTH = 0.3

    elif suffix == "strict":
        N_OBS_DAY = 4
        N_OBS_FRAC_MONTH = 0.3

    else:
        print "unknown suffix {} - expected 'relax' or 'strict'".format(suffix)
        sys.exit(1)


    # flags to check on and values to allow through
# KATE modified
    if doQC1it | doQC2it:
        these_flags = {"ATclim":0,"ATrep":0,"DPTclim":0,"DPTssat":0,"DPTrep":0,"DPTrepsat":0}
    else:
        these_flags = {"ATbud":0, "ATclim":0,"ATrep":0,"DPTbud":0,"DPTclim":0,"DPTssat":0,"DPTrep":0,"DPTrepsat":0}    
    #these_flags = {"ATbud":0, "ATclim":0,"ATrep":0,"DPTbud":0,"DPTclim":0,"DPTssat":0,"DPTrep":0,"DPTrepsat":0}
# end

    # spin through years and months to read files
    for year in np.arange(start_year, end_year + 1): 

        for month in np.arange(start_month, end_month + 1):

            times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(month, year), "hours", "time")

            grid_hours = np.arange(0, 24 * calendar.monthrange(year, month)[1], DELTA_HOUR)

            times.data = grid_hours

            # process the monthly file
# KATE modified  - added other BC options  
#            if doBC:
            if doBC | doBCtotal | doBChgt | doBCscn:
# end
                filename = "new_suite_{}{:02d}_{}_extended.txt".format(year, month, settings.OUTROOT)
            else:
                filename = "new_suite_{}{:02d}_{}.txt".format(year, month, settings.OUTROOT)

# KATE modified  - added other BC options  
#            raw_platform_data, raw_obs, raw_meta, raw_qc = utils.read_qc_data(filename, settings.ICOADS_LOCATION, fields, doBC = doBC)
            raw_platform_data, raw_obs, raw_meta, raw_qc = utils.read_qc_data(filename, settings.ICOADS_LOCATION, fields, doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn)
# end

            # extract observation details
            lats, lons, years, months, days, hours = utils.process_platform_obs(raw_platform_data)

            # test dates *KW - SHOULDN'T NEED THIS - ONLY OBS PASSING DATE CHECK ARE INCLUDED*
            #  *RD* - hasn't run yet but will leave it in just in case of future use.
            if not utils.check_date(years, year, "years", filename):
                sys.exit(1)
            if not utils.check_date(months, month, "months", filename):
                sys.exit(1)

# KATE modified - seems to be an error with missing global name plots so have changed to settings.plots
            # Choose this one to only output once per decade
            #if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
            # Choose this one to output a plot for each month
            if settings.plots:
            #if plots and (year in [1973, 1983, 1993, 2003, 2013]):
# end
                # plot the distribution of hours

                plt.clf()
                plt.hist(hours, np.arange(-100,2500,100))
                plt.ylabel("Number of observations")
                plt.xlabel("Hours")
                plt.xticks(np.arange(-300, 2700, 300))
                plt.savefig(settings.PLOT_LOCATION + "obs_distribution_{}{:02d}_{}.png".format(year, month, suffix))


                # only for a few of the variables
                for variable in OBS_ORDER:
                    if variable.name in ["marine_air_temperature", "dew_point_temperature", "specific_humidity", "relative_humidity", "marine_air_temperature_anomalies", "dew_point_temperature_anomalies", "specific_humidity_anomalies", "relative_humidity_anomalies"]:

                        #plot_qc_diagnostics.values_vs_lat(variable, lats, raw_obs[:, variable.column], raw_qc, these_flags, settings.PLOT_LOCATION + "qc_actuals_{}_{}{:02d}_{}.png".format(variable.name, year, month, suffix), multiplier = variable.multiplier, doBC = doBC)
# KATE modified  - added other BC options
                        plot_qc_diagnostics.values_vs_lat_dist(variable, lats, raw_obs[:, variable.column], raw_qc, these_flags,
                                settings.PLOT_LOCATION + "qc_actuals_{}_{}{:02d}_{}.png".format(variable.name, year, month, suffix),
                                multiplier = variable.multiplier,
                                doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn)
# end

            # QC sub-selection

# KATE modified - added QC iterations but also think this needs to include the bias corrected versions because the QC flags need to be applied to those too.
# Not sure what was happening previously with the doBC run - any masking to QC'd obs?
            if doQC | doQC1it | doQC2it | doQC3it | doBC | doBCtotal | doBChgt | doBCscn:
            #if doQC:
# end
                print "Using {} as flags".format(these_flags)
# KATE modified - BC options
#                mask = utils.process_qc_flags(raw_qc, these_flags, doBC = doBC)
                mask = utils.process_qc_flags(raw_qc, these_flags, doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn)
# end
                print "All Obs: ", len(mask)
                print "Good Obs: ", len(mask[np.where(mask == 0)])
                print "Bad Obs: ", len(mask[np.where(mask == 1)])
                #pdb.set_trace()

                # apply the single per-observation mask to every variable column
                complete_mask = np.zeros(raw_obs.shape)
                for i in range(raw_obs.shape[1]):
                    complete_mask[:,i] = mask
                clean_data = np.ma.masked_array(raw_obs, mask = complete_mask)
# end
            else:
                print "No QC flags selected"
                clean_data = np.ma.masked_array(raw_obs, mask = np.zeros(raw_obs.shape))


            # discretise hours
            hours = utils.make_index(hours, DELTA_HOUR, multiplier = 100)

            # get the hours since start of month
            hours_since = ((days - 1) * 24) + (hours * DELTA_HOUR)
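            # e.g. with DELTA_HOUR = 3, an ob on day 2 in the 09Z slot (hours == 3) maps to 24 + 9 = 33 hours into the month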

            # discretise lats/lons
            lat_index = utils.make_index(lats, DELTA_LAT, multiplier = 100)
            lon_index = utils.make_index(lons, DELTA_LON, multiplier = 100)

            lat_index += ((len(grid_lats)-1)/2) # and as -ve indices are unhelpful, roll by offsetting by most westward
            lon_index += ((len(grid_lons)-1)/2) #    or most southerly so that (0,0) is (-90,-180)

            # NOTE - ALWAYS GIVING TOP-RIGHT OF BOX TO GIVE < HARD LIMIT (as opposed to <=)
            # do the gridding
            # extract the full grid, number of obs, and day/night flag
# KATE MEDIAN WATCH This is hard coded to doMedian (rather than settings.doMedian) - OK WITH MEDIAN HERE!!!
# KATE modified - to add settings.doMedian instead of just doMedian which seems to be consistent with the other bits and BC options
            raw_month_grid, raw_month_n_obs, this_month_period = utils.grid_1by1_cam(clean_data, raw_qc, hours_since, lat_index, lon_index,
                      grid_hours, grid_lats, grid_lons, OBS_ORDER, settings.mdi, doMedian = settings.doMedian,
                      doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn)
            #raw_month_grid, raw_month_n_obs, this_month_period = utils.grid_1by1_cam(clean_data, raw_qc, hours_since, lat_index, lon_index, grid_hours, grid_lats, grid_lons, OBS_ORDER, settings.mdi, doMedian = True, doBC = doBC)
# end
            print "successfully read data into 1x1 3hrly grids"

            # create matching array size
            this_month_period = np.tile(this_month_period, (len(OBS_ORDER),1,1,1))

            for period in ["all", "day", "night"]:

                if period == "day":
                    this_month_grid = np.ma.masked_where(this_month_period == 1, raw_month_grid)
                    this_month_obs = np.ma.masked_where(this_month_period[0] == 1, raw_month_n_obs) # and take first slice to re-match the array size
                elif period == "night":
                    this_month_grid = np.ma.masked_where(this_month_period == 0, raw_month_grid)
                    this_month_obs = np.ma.masked_where(this_month_period[0] == 0, raw_month_n_obs) # and take first slice to re-match the array size
                else:
                    this_month_grid = copy.deepcopy(raw_month_grid)
                    this_month_obs = copy.deepcopy(raw_month_n_obs)
                    
# KATE modified
                # If SwitchOutput == 1 then we're in test mode - output interim files!!!
                if (SwitchOutput == 1):
                    # have one month of gridded data.
                    out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_3hr_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)

                    utils.netcdf_write(out_filename, this_month_grid, np.zeros(this_month_obs.shape), this_month_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "H")
                ## have one month of gridded data.
                #out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_3hr_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)

                #utils.netcdf_write(out_filename, this_month_grid, np.zeros(this_month_obs.shape), this_month_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "H")
# end
                # now average over time
                # Dailies
                daily_hours = grid_hours.reshape(-1, 24/DELTA_HOUR)

                shape = this_month_grid.shape
                this_month_grid = this_month_grid.reshape(shape[0], -1, 24/DELTA_HOUR, shape[2], shape[3])
                this_month_obs = this_month_obs.reshape(-1, 24/DELTA_HOUR, shape[2], shape[3])
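                # time axis is now (day, 24/DELTA_HOUR slots) so that axis 2 collapses each day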

# KATE MEDIAN WATCH - settings.doMedian is generally set to True - I think we may want the MEAN HERE!!!
# KATE modified - to hard wire in MEAN here
                daily_grid = np.ma.mean(this_month_grid, axis = 2)
                #if settings.doMedian:
                #    daily_grid = np.ma.median(this_month_grid, axis = 2)
                #else:
                #    daily_grid = np.ma.mean(this_month_grid, axis = 2)
# end
                daily_grid.fill_value = settings.mdi

                # filter on number of observations/day
                n_hrs_per_day = np.ma.count(this_month_grid, axis = 2) 
                n_obs_per_day = np.ma.sum(this_month_obs, axis = 1) 

                if period == "all":
                    bad_locs = np.where(n_hrs_per_day < N_OBS_DAY) # at least 2 of possible 8 3-hourly values (6hrly data *KW OR AT LEAST 4 3HRLY OBS PRESENT*)
                else:
                    bad_locs = np.where(n_hrs_per_day < np.floor(N_OBS_DAY / 2.)) # at least 1 of possible 8 3-hourly values (6hrly data *KW OR AT LEAST 4 3HRLY OBS PRESENT*)              
                daily_grid.mask[bad_locs] = True

# KATE modified - added SwitchOutput to if loop
                if (SwitchOutput == 1) and settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
                #if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
# end
                    # plot the distribution of hours

                    plt.clf()
                    plt.hist(n_hrs_per_day.reshape(-1), bins = np.arange(-1,10), align = "left", log = True, rwidth=0.5)
                    if period == "all":
                        plt.axvline(x = N_OBS_DAY-0.5, color = "r")
                    else:
                        plt.axvline(x = np.floor(N_OBS_DAY / 2.)-0.5, color = "r")       

                    plt.title("Number of 1x1-3hrly in each 1x1-daily grid box")
                    plt.xlabel("Number of 3-hrly observations (max = 8)")
                    plt.ylabel("Frequency (log scale)")
                    plt.savefig(settings.PLOT_LOCATION + "n_grids_1x1_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix))

                    plt.clf()
                    plt.hist(n_obs_per_day.reshape(-1), bins = np.arange(-5,100,5),  log = True, rwidth=0.5)                 
                    plt.title("Total number of raw observations in each 1x1 daily grid box")
                    plt.xlabel("Number of raw observations")
                    plt.ylabel("Frequency (log scale)")
                    plt.savefig(settings.PLOT_LOCATION + "n_obs_1x1_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix))

                # clear up memory
                del this_month_grid
                del this_month_obs
                gc.collect()

# KATE modified
                # If SwitchOutput == 1 then we're in test mode - output interim files!!!
                if (SwitchOutput == 1):
                    # write dailies file
                    times.data = daily_hours[:,0]
                    out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_daily_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)

                    utils.netcdf_write(out_filename, daily_grid, n_hrs_per_day[0], n_obs_per_day, OBS_ORDER, grid_lats, grid_lons, times, frequency = "D")
                #times.data = daily_hours[:,0]
                #out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_daily_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)

                #utils.netcdf_write(out_filename, daily_grid, n_hrs_per_day[0], n_obs_per_day, OBS_ORDER, grid_lats, grid_lons, times, frequency = "D")
# end
# end
                # Monthlies
                times.data = daily_hours[0,0]

# KATE modified - commenting out as we don't need this anymore
#                if settings.doMedian:
#                    monthly_grid = np.ma.median(daily_grid, axis = 1)
#                else:
#                    monthly_grid = np.ma.mean(daily_grid, axis = 1)
#
#                monthly_grid.fill_value = settings.mdi
#
#                # filter on number of observations/month
#                n_grids_per_month = np.ma.count(daily_grid, axis = 1) 
#                bad_locs = np.where(n_grids_per_month < calendar.monthrange(year, month)[1] * N_OBS_FRAC_MONTH) # 30% of possible daily values
#                monthly_grid.mask[bad_locs] = True
#
#                # number of raw observations
#                n_obs_per_month = np.ma.sum(n_obs_per_day, axis = 0)
#
#                if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
#                    # plot the distribution of days
#
#                    plt.clf()
#                    plt.hist(n_obs_per_month.reshape(-1), bins = np.arange(-10,500,10),  log = True, rwidth=0.5)
#                    plt.title("Total number of raw observations in each 1x1 monthly grid box")
#                    plt.xlabel("Number of raw observations")
#                    plt.ylabel("Frequency (log scale)")
#                    plt.savefig(settings.PLOT_LOCATION + "n_obs_1x1_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix))
#
#                    plt.clf()
#                    plt.hist(n_grids_per_month[0].reshape(-1), bins = np.arange(-2,40,2), align = "left",  log = True, rwidth=0.5)
#                    plt.axvline(x = calendar.monthrange(year, month)[1] * N_OBS_FRAC_MONTH, color="r")
#                    plt.title("Total number of 1x1 daily grids in each 1x1 monthly grid")
#                    plt.xlabel("Number of 1x1 daily grids")
#                    plt.ylabel("Frequency (log scale)")
#                    plt.savefig(settings.PLOT_LOCATION + "n_grids_1x1_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix))
#
#                # write monthly 1x1 file
#                out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_monthly_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)
#                utils.netcdf_write(out_filename, monthly_grid, n_grids_per_month[0], n_obs_per_month, OBS_ORDER, grid_lats, grid_lons, times, frequency = "M")
#            
#                # now to re-grid to coarser resolution
#                # KW # Here we may want to use the mean because its a large area but could be sparsely
#                #             populated with quite different climatologies so we want 
#                # the influence of the outliers (we've done our best to ensure these are good values) 
#
#                # go from monthly 1x1 to monthly 5x5 - retained as limited overhead
#                monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, grid5_lats, grid5_lons = utils.grid_5by5(monthly_grid, n_obs_per_month, grid_lats, grid_lons, doMedian = settings.doMedian, daily = False)
#                out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)
#
#                utils.netcdf_write(out_filename, monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, OBS_ORDER, grid5_lats, grid5_lons, times, frequency = "M")
#
#                if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
#                    # plot the distribution of days
#
#                    plt.clf()
#                    plt.hist(monthly_5by5_n_obs.reshape(-1), bins = np.arange(0,100,5), log = True, rwidth=0.5)
#                    plt.title("Total number of raw observations in each 5x5 monthly grid box")
#                    plt.xlabel("Number of raw observations")
#                    plt.ylabel("Frequency (log scale)")
#                    plt.savefig(settings.PLOT_LOCATION + "n_obs_5x5_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix))
#
#                    plt.clf()
#                    plt.hist(monthly_5by5_n_grids.reshape(-1), bins = np.arange(-2,30,2), align = "left", log = True, rwidth=0.5)
#                    plt.axvline(x = 1, color="r")
#                    plt.title("Total number of 1x1 monthly grids in each 5x5 monthly grid")
#                    plt.xlabel("Number of 1x1 monthly grids")
#                    plt.ylabel("Frequency (log scale)")
#                    plt.savefig(settings.PLOT_LOCATION + "n_grids_5x5_monthly_{}{:02d}_{}_{}.png".format(year, month, period, suffix))
#
#                # clear up memory
#                del monthly_grid
#                del monthly_5by5
#                del monthly_5by5_n_grids
#                del monthly_5by5_n_obs
#                del n_grids_per_month
#                del n_obs_per_month
#                del n_hrs_per_day
#                gc.collect()
# end
                # go direct from daily 1x1 to monthly 5x5
# KATE MEDIAN WATCH - settings.doMedian is generally set to True - I think we may want the MEAN HERE!!!
# KATE modified - to hard wire in MEAN here
                monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, grid5_lats, grid5_lons = utils.grid_5by5(daily_grid, n_obs_per_day, grid_lats, grid_lons, doMedian = False, daily = True)
                #monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, grid5_lats, grid5_lons = utils.grid_5by5(daily_grid, n_obs_per_day, grid_lats, grid_lons, doMedian = settings.doMedian, daily = True)
# end
                out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_from_daily_{}{:02d}_{}_{}.nc".format(year, month, period, suffix)
 
                utils.netcdf_write(out_filename, monthly_5by5, monthly_5by5_n_grids, monthly_5by5_n_obs, OBS_ORDER, grid5_lats, grid5_lons, times, frequency = "M")

                

                if settings.plots and (year in [1973, 1983, 1993, 2003, 2013]):
                    # plot the distribution of days

                    plt.clf()
                    plt.hist(monthly_5by5_n_obs.reshape(-1), bins = np.arange(-10,1000,10),  log = True, rwidth=0.5)
                    plt.title("Total number of raw observations in each 5x5 monthly grid box")
                    plt.xlabel("Number of raw observations")
                    plt.ylabel("Frequency (log scale)")
                    plt.savefig(settings.PLOT_LOCATION + "n_obs_5x5_monthly_from_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix))


                    plt.clf()
                    plt.hist(monthly_5by5_n_grids.reshape(-1), bins = np.arange(-5,100,5), align = "left", log = True, rwidth=0.5)
                    plt.axvline(x = (0.3 * daily_grid.shape[0]), color="r")
                    plt.title("Total number of 1x1 daily grids in each 5x5 monthly grid")
                    plt.xlabel("Number of 1x1 daily grids")
                    plt.ylabel("Frequency (log scale)")

                    plt.savefig(settings.PLOT_LOCATION + "n_grids_5x5_monthly_from_daily_{}{:02d}_{}_{}.png".format(year, month, period, suffix))


                del daily_grid
                del monthly_5by5
                del n_obs_per_day
                del monthly_5by5_n_grids
                del monthly_5by5_n_obs
                gc.collect()

    return # do_gridding
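

#************************************************************************
if __name__ == "__main__":

    # Minimal command-line driver - a sketch only.  argparse is imported at
    # the top but unused in this section; the flag names below are
    # illustrative assumptions, not necessarily those of the original suite.
    parser = argparse.ArgumentParser()
    parser.add_argument('--suffix', dest='suffix', action='store', default="relax",
                        help='"relax" or "strict" completeness criteria, default = "relax"')
    parser.add_argument('--start_year', dest='start_year', action='store', default=defaults.START_YEAR,
                        help='start year to process, default = {}'.format(defaults.START_YEAR))
    parser.add_argument('--end_year', dest='end_year', action='store', default=defaults.END_YEAR,
                        help='end year to process, default = {}'.format(defaults.END_YEAR))
    parser.add_argument('--doQC', dest='doQC', action='store_true', default=False,
                        help='incorporate the QC flags, default = False')
    parser.add_argument('--doBC', dest='doBC', action='store_true', default=False,
                        help='work on the bias corrected data, default = False')
    args = parser.parse_args()

    do_gridding(suffix=str(args.suffix), start_year=int(args.start_year), end_year=int(args.end_year),
                doQC=args.doQC, doBC=args.doBC)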