Exemple #1
0
def main(argv):
    """
    Run the high-resolution SST quality control for the months of one job.

    Command-line options are parsed with argparse from ``sys.argv``; the ``argv`` parameter is
    accepted but not used by this function:

    ``-config``     name of the configuration file (default ``configuration.txt``)
    ``-jobs``       name of the JSON job file (default ``jobs.json``)
    ``-job_index``  job number; entry ``job_index - 1`` of ``jobs['jobs']`` is used, so the
                    default of 0 indexes with -1 (the last entry if ``jobs['jobs']`` is a list)
    ``-tracking``   additionally write per-ID output for reports passing the drifter filter

    The selected job supplies ``year1``, ``month1``, ``year2`` and ``month2``. For every
    year/month yielded by ``qc.year_month_gen`` for that range, the pipe-separated files
    ``<ICOADS_dir>YYYY-MM.psv`` are read for the months generated between the preceding and the
    following month, and three QC stages are run:

    basic QC - proceeds one observation at a time (``perform_base_sst_qc`` and the
    ``month_match`` flag).

    track check - works on all the observations from a single platform ID
    (``track_check`` and ``find_repeated_values``).

    buddy check - works on the whole Deck (``bayesian_buddy_check`` and ``mds_buddy_check``).

    Results are written with ``write_qc`` into the directory returned by ``bf.safe_make_dir``.

    :param argv: unused; kept so existing callers continue to work
    :raises ValueError: if the parameter file lists no ``SST``/``mean`` entry under
        ``hires_climatologies``
    """

    print('########################')
    print('Running make_and_full_qc')
    print('########################')

    parser = argparse.ArgumentParser(
        description='Marine QC system, main program')
    parser.add_argument('-config',
                        type=str,
                        default='configuration.txt',
                        help='name of config file')
    parser.add_argument('-tracking',
                        action='store_true',
                        help='perform tracking QC')
    parser.add_argument('-jobs',
                        type=str,
                        default='jobs.json',
                        help='name of job file')
    parser.add_argument('-job_index', type=int, default=0, help='job index')

    args = parser.parse_args()

    inputfile = args.config
    jobfile = args.jobs
    jobindex = args.job_index - 1
    tracking = args.tracking

    with open(jobfile) as fp:
        jobs = json.load(fp)

    job = jobs['jobs'][jobindex]
    year1 = job['year1']
    year2 = job['year2']
    month1 = job['month1']
    month2 = job['month2']
    # Not used further in this function; the lookups are kept so that a job
    # file without these keys still fails here with a KeyError.
    input_schema = jobs['schema']
    code_tables = jobs['code_tables']

    print('Input file is {}'.format(inputfile))
    print('Running from {} {} to {} {}'.format(month1, year1, month2, year2))
    print('')

    config = ConfigParser.ConfigParser()
    config.read(inputfile)
    icoads_dir = config.get('Directories', 'ICOADS_dir')
    out_dir = config.get('Directories', 'out_dir')
    bad_id_file = config.get('Files', 'IDs_to_exclude')
    version = config.get('Icoads', 'icoads_version')

    print('ICOADS directory = {}'.format(icoads_dir))
    print('ICOADS version = {}'.format(version))
    print('Output to {}'.format(out_dir))
    print('List of bad IDs = {}'.format(bad_id_file))
    print('Parameter file = {}'.format(config.get('Files', 'parameter_file')))
    print('')

    ids_to_exclude = bf.process_bad_id_file(bad_id_file)

    # read in the standard deviation fields used by the two buddy checks
    sst_pentad_stdev = clim.Climatology.from_filename(
        config.get('Climatologies', 'Old_SST_stdev_climatology'), 'sst')

    sst_stdev_1 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_box_to_buddy_avg'), 'sst')
    sst_stdev_2 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_ob_to_box_avg'), 'sst')
    sst_stdev_3 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_avg_sampling'), 'sst')

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    # locate the high resolution SST climatology file; if several entries
    # match, the last one listed is used
    sst_climatology_file = None
    for entry in parameters['hires_climatologies']:
        if entry[0] == 'SST' and entry[1] == 'mean':
            sst_climatology_file = entry[2]
            print("hires sst climatology file {}".format(sst_climatology_file))

    if sst_climatology_file is None:
        raise ValueError(
            "no ['SST', 'mean', <file>] entry found under "
            "'hires_climatologies' in the parameter file")

    climlib = ex.ClimatologyLibrary()
    climlib.add_field(
        'SST', 'mean',
        clim.Climatology.from_filename(sst_climatology_file, 'temperature'))

    column_names = [
        'YR', 'MO', 'DY', 'HR', 'LAT', 'LON', 'DS', 'VS', 'ID', 'AT', 'SST',
        'DPT', 'DCK', 'SLP', 'SID', 'PT', 'UID', 'W', 'D', 'IRF', 'bad_data',
        'outfile'
    ]

    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        print("{} {}".format(year, month))

        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        reps = ex.Deck()
        count = 0

        for readyear, readmonth in qc.year_month_gen(last_year, last_month,
                                                     next_year, next_month):

            print("{} {}".format(readyear, readmonth))

            # plain string concatenation: ICOADS_dir must already end with a
            # path separator (or be a filename prefix)
            filename = icoads_dir + '{:4d}-{:02d}.psv'.format(
                readyear, readmonth)
            imma_obj = pd.read_csv(filename,
                                   sep='|',
                                   header=None,
                                   names=column_names,
                                   low_memory=False)

            # Replace IDs consisting of a single blank with the empty string.
            # Assigning the result back (rather than a chained
            # ``inplace=True``) keeps this effective when pandas
            # Copy-on-Write is enabled.
            imma_obj['ID'] = imma_obj['ID'].replace(' ', '')
            imma_obj = imma_obj.sort_values(['YR', 'MO', 'DY', 'HR', 'ID'],
                                            axis=0,
                                            ascending=True)
            imma_obj = imma_obj.reset_index(drop=True)

            for idx in imma_obj.index:
                # start from a fresh record for every row and map missing
                # values to None
                rec = IMMA()
                for k, v in imma_obj.loc[idx, ].to_dict().items():
                    rec.data[k] = to_none(v)

                if (rec.data['ID'] not in ids_to_exclude
                        and rec.data['YR'] == readyear
                        and rec.data['MO'] == readmonth
                        and rec.data['DY'] is not None):

                    rep = ex.MarineReportQC(rec)

                    # Everything below must only run for an accepted row:
                    # otherwise ``rep`` is either undefined or still the
                    # previous report, which would be appended a second time.
                    rep_clim = climlib.get_field('SST', 'mean').get_value(
                        rep.lat(), rep.lon(), rep.getvar('MO'),
                        rep.getvar('DY'))
                    rep.add_climate_variable('SST', rep_clim)

                    rep.perform_base_sst_qc(parameters)
                    rep.set_qc(
                        'POS', 'month_match',
                        qc.month_match(year, month, rep.getvar('YR'),
                                       rep.getvar('MO')))

                    reps.append(rep)
                    count += 1

        print("Read {} ICOADS records".format(count))

        # filter the obs into passes and fails of basic positional QC
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)

        reps.add_filter(filt)

        # track check the passes one ship at a time
        count_ships = 0
        for one_ship in reps.get_one_platform_at_a_time():
            # corrections applied can move reports between months and are
            # currently applied after reading IMMA, so re-sort before checking
            one_ship.sort()
            one_ship.track_check(parameters['track_check'])
            one_ship.find_repeated_values(parameters['find_repeated_values'],
                                          intype='SST')
            count_ships += 1

        print("Track checked {} ships".format(count_ships))

        # SST buddy check
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'is780', 0)
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk', 0)
        filt.add_qc_filter('SST', 'noval', 0)
        filt.add_qc_filter('SST', 'freez', 0)
        filt.add_qc_filter('SST', 'clim', 0)
        filt.add_qc_filter('SST', 'nonorm', 0)

        reps.add_filter(filt)

        reps.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3,
                                  parameters)
        reps.mds_buddy_check('SST', sst_pentad_stdev,
                             parameters['mds_buddy_check'])

        extdir = bf.safe_make_dir(out_dir, year, month)

        varnames_to_print = {
            'SST': [
                'bud', 'clim', 'nonorm', 'freez', 'noval', 'nbud', 'bbud',
                'rep', 'spike', 'hardlimit'
            ]
        }

        reps.write_qc('hires_' + parameters['runid'], extdir, year, month,
                      varnames_to_print)

        if tracking:
            # set QC for output by ID - buoys only and passes base SST QC
            filt = ex.QC_filter()
            filt.add_qc_filter('POS', 'month_match', 1)
            filt.add_qc_filter('POS', 'isdrifter', 1)

            reps.add_filter(filt)

            # the context manager closes the ID list even if a write fails
            with open(extdir + '/ID_file.txt', 'w') as idfile:
                for one_ship in reps.get_one_platform_at_a_time():

                    if len(one_ship) > 0:
                        thisid = one_ship.getrep(0).getvar('ID')
                        if thisid is not None:
                            idfile.write(thisid + ',' +
                                         ex.safe_filename(thisid) + '\n')
                            one_ship.write_qc('hires_' + parameters['runid'],
                                              extdir, year, month,
                                              varnames_to_print)

        # release this month's deck before the next month is read
        del reps
Exemple #2
0
def main(argv):
    """
    Run the marine quality control on gzipped ICOADS IMMA files for a range of months.

    Command-line options are parsed with argparse from ``sys.argv``; the ``argv`` parameter is
    accepted but not used by this function:

    ``-config``            name of the configuration file (default ``configuration.txt``)
    ``-year1``/``-month1`` first year and month to process (defaults 1850 and 1)
    ``-year2``/``-month2`` final year and month to process (defaults 1850 and 1)
    ``-tracking``          attach OSTIA background values to each report (read years from
                           1985 on) and write per-ID output for the drifter filter

    The climatologies listed under ``climatologies`` in the parameter file are loaded first.
    For every year/month yielded by ``qc.year_month_gen`` for the requested range, the ICOADS
    files for the months generated between the preceding and the following month are read
    and three QC stages are run:

    basic QC - proceeds one observation at a time (``perform_base_qc`` and the
    ``month_match`` flag).

    track check - works on all the observations from a single platform ID (track, IQUAM,
    spike, saturated-run, rounded-value and repeated-value checks).

    buddy check - works on the whole Deck (``bayesian_buddy_check`` and ``mds_buddy_check``).

    :param argv: unused; kept so existing callers continue to work
    """

    print('########################')
    print('Running make_and_full_qc')
    print('########################')

    parser = argparse.ArgumentParser(
        description='Marine QC system, main program')
    parser.add_argument('-config',
                        type=str,
                        default='configuration.txt',
                        help='name of config file')
    parser.add_argument('-year1',
                        type=int,
                        default=1850,
                        help='First year for processing')
    parser.add_argument('-year2',
                        type=int,
                        default=1850,
                        help='Final year for processing')
    parser.add_argument('-month1',
                        type=int,
                        default=1,
                        help='First month for processing')
    parser.add_argument('-month2',
                        type=int,
                        default=1,
                        help='Final month for processing')
    parser.add_argument('-tracking',
                        action='store_true',
                        help='perform tracking QC')
    args = parser.parse_args()

    inputfile = args.config
    year1 = args.year1
    year2 = args.year2
    month1 = args.month1
    month2 = args.month2
    tracking = args.tracking

    print("running on ICOADS, this is not a test!")

    print('Input file is {}'.format(inputfile))
    print('Running from {} {} to {} {}'.format(month1, year1, month2, year2))
    print('')

    config = ConfigParser.ConfigParser()
    config.read(inputfile)
    # ICOADS_dir is only reported below; the input file names are built from
    # parameters['icoads_dir'] and parameters['icoads_filenames'].
    icoads_dir = config.get('Directories', 'ICOADS_dir')
    out_dir = config.get('Directories', 'out_dir')
    bad_id_file = config.get('Files', 'IDs_to_exclude')
    version = config.get('Icoads', 'icoads_version')

    print('ICOADS directory = {}'.format(icoads_dir))
    print('ICOADS version = {}'.format(version))
    print('Output to {}'.format(out_dir))
    print('List of bad IDs = {}'.format(bad_id_file))
    print('Parameter file = {}'.format(config.get('Files', 'parameter_file')))
    print('')

    ids_to_exclude = bf.process_bad_id_file(bad_id_file)

    # read in the standard deviation fields used by the two buddy checks
    sst_pentad_stdev = clim.Climatology.from_filename(
        config.get('Climatologies', 'Old_SST_stdev_climatology'), 'sst')

    sst_stdev_1 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_box_to_buddy_avg'), 'sst')
    sst_stdev_2 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_ob_to_box_avg'), 'sst')
    sst_stdev_3 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_avg_sampling'), 'sst')

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    print("Reading climatologies from parameter file")
    climlib = ex.ClimatologyLibrary()
    for entry in parameters['climatologies']:
        print("{} {}".format(entry[0], entry[1]))
        climlib.add_field(entry[0], entry[1],
                          clim.Climatology.from_filename(entry[2], entry[3]))

    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        print("{} {}".format(year, month))

        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        reps = ex.Deck()
        count = 0

        for readyear, readmonth in qc.year_month_gen(last_year, last_month,
                                                     next_year, next_month):

            print("{} {}".format(readyear, readmonth))

            # Forget the last day seen whenever a new month is started, so a
            # day number shared by the end of one file and the start of the
            # next cannot leave the previous month's OSTIA field in place.
            lastday = -99

            ostia_bg_var = None
            if tracking:
                ostia_bg_var = clim.Climatology.from_filename(
                    config.get('Climatologies',
                               qc.season(readmonth) + '_ostia_background'),
                    'bg_var')

            filename = bf.icoads_filename_from_stub(
                parameters['icoads_dir'], parameters['icoads_filenames'],
                readyear, readmonth)
            try:
                icoads_file = gzip.open(filename, "r")
            except IOError:
                print("no ICOADS file for {} {}".format(readyear, readmonth))
                continue

            # the context manager closes the file even if QC of a record fails
            with icoads_file:
                for line in icoads_file:

                    # start from a fresh record for every line
                    rec = IMMA()
                    try:
                        rec.readstr(line)
                        readob = True
                    except Exception:
                        # best effort: report and skip lines that cannot be
                        # parsed, but let KeyboardInterrupt/SystemExit through
                        readob = False
                        print("Rejected ob {}".format(line))

                    # test readob first so the fields of a record that failed
                    # to parse are never inspected
                    if (readob and rec.data['ID'] not in ids_to_exclude
                            and rec.data['YR'] == readyear
                            and rec.data['MO'] == readmonth):

                        rep = ex.MarineReportQC(rec)

                        # if day has changed then read in OSTIA field if
                        # available and append SST and sea-ice fraction to
                        # the observation metadata
                        if tracking and readyear >= 1985 and rep.getvar(
                                'DY') is not None:
                            if rep.getvar('DY') != lastday:
                                lastday = rep.getvar('DY')
                                # the background file is looked up for the
                                # day returned by qc.yesterday
                                y_year, y_month, y_day = qc.yesterday(
                                    readyear, readmonth, lastday)

                                ofname = bf.get_background_filename(
                                    parameters['background_dir'],
                                    parameters['background_filenames'],
                                    y_year, y_month, y_day)

                                climlib.add_field(
                                    'OSTIA', 'background',
                                    clim.Climatology.from_filename(
                                        ofname, 'analysed_sst'))
                                climlib.add_field(
                                    'OSTIA', 'ice',
                                    clim.Climatology.from_filename(
                                        ofname, 'sea_ice_fraction'))

                            rep_clim = climlib.get_field(
                                'OSTIA', 'background').get_value_ostia(
                                    rep.lat(), rep.lon())
                            if rep_clim is not None:
                                # presumably Kelvin to Celsius - TODO confirm
                                rep_clim -= 273.15

                            rep.setext('OSTIA', rep_clim)
                            rep.setext(
                                'ICE',
                                climlib.get_field(
                                    'OSTIA', 'ice').get_value_ostia(
                                        rep.lat(), rep.lon()))
                            rep.setext(
                                'BGVAR',
                                ostia_bg_var.get_value_mds_style(
                                    rep.lat(), rep.lon(), rep.getvar('MO'),
                                    rep.getvar('DY')))

                        for varname in ['SST']:
                            rep_clim = climlib.get_field(
                                varname, 'mean').get_value_mds_style(
                                    rep.lat(), rep.lon(), rep.getvar('MO'),
                                    rep.getvar('DY'))
                            rep.add_climate_variable(varname, rep_clim)

                        rep.perform_base_qc(parameters)
                        rep.set_qc(
                            'POS', 'month_match',
                            qc.month_match(year, month, rep.getvar('YR'),
                                           rep.getvar('MO')))

                        reps.append(rep)
                        count += 1

        print("Read {} ICOADS records".format(count))

        # filter the obs into passes and fails of basic positional QC
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)

        reps.add_filter(filt)

        # track check the passes one ship at a time
        count_ships = 0
        for one_ship in reps.get_one_platform_at_a_time():

            one_ship.track_check(parameters['track_check'])
            one_ship.iquam_track_check(parameters['IQUAM_track_check'])
            one_ship.spike_check(parameters['IQUAM_spike_check'])
            one_ship.find_saturated_runs(parameters['saturated_runs'])
            one_ship.find_multiple_rounded_values(
                parameters['multiple_rounded_values'])

            for varname in ['SST']:
                one_ship.find_repeated_values(
                    parameters['find_repeated_values'], intype=varname)

            count_ships += 1

        print("Track checked {} ships".format(count_ships))

        # SST buddy check
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'is780', 0)
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk', 0)
        filt.add_qc_filter('SST', 'noval', 0)
        filt.add_qc_filter('SST', 'freez', 0)
        filt.add_qc_filter('SST', 'clim', 0)
        filt.add_qc_filter('SST', 'nonorm', 0)

        reps.add_filter(filt)

        reps.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3,
                                  parameters)
        reps.mds_buddy_check('SST', sst_pentad_stdev,
                             parameters['mds_buddy_check'])

        extdir = bf.safe_make_dir(out_dir, year, month)

        # NOTE(review): output is only written inside the tracking branch;
        # without -tracking no results are written here - confirm intended.
        if tracking:
            # set QC for output by ID - buoys only and passes base SST QC
            filt = ex.QC_filter()
            filt.add_qc_filter('POS', 'month_match', 1)
            filt.add_qc_filter('POS', 'isdrifter', 1)

            reps.add_filter(filt)

            # the context manager closes the ID list even if a write fails
            with open(extdir + '/ID_file.txt', 'w') as idfile:
                for one_ship in reps.get_one_platform_at_a_time():

                    if len(one_ship) > 0:
                        thisid = one_ship.getrep(0).getvar('ID')
                        if thisid is not None:
                            idfile.write(thisid + ',' +
                                         ex.safe_filename(thisid) + '\n')
                            one_ship.write_output(parameters['runid'], extdir,
                                                  year, month)

        # release this month's deck before the next month is read
        del reps
Exemple #3
0
def main(argv):
    """
    This program reads in data from ICOADS.3.0.0/ICOADS.3.0.1 and applies quality control processes to it, flagging data
    as good or bad according to a set of different criteria. Optionally it will replace drifting buoy SST data in
    ICOADS.3.0.1 with drifter data taken from the GDBC portal.

    The first step of the process is to read in various SST and MAT climatologies from file. These are 1degree latitude
    by 1 degree longitude by 73 pentad fields in NetCDF format.

    The program then loops over all specified years and months reads in the data needed to QC that month and then
    does the QC. There are three stages in the QC

    basic QC - this proceeds one observation at a time. Checks are relatively simple and detect gross errors

    track check - this works on Voyages consisting of all the observations from a single ship (or at least a single ID)
    and identifies observations which make for an implausible ship track

    buddy check - this works on Decks which are large collections of observations and compares observations to their
    neighbours
    """

    print('########################')
    print('Running make_and_full_qc')
    print('########################')

    parser = argparse.ArgumentParser(
        description='Marine QC system, main program')
    parser.add_argument('-config',
                        type=str,
                        default='configuration.txt',
                        help='name of config file')
    parser.add_argument('-tracking',
                        action='store_true',
                        help='perform tracking QC')
    parser.add_argument('-jobs',
                        type=str,
                        default='jobs.json',
                        help='name of job file')
    parser.add_argument('-job_index', type=int, default=0, help='job index')

    args = parser.parse_args()

    inputfile = args.config
    jobfile = args.jobs
    jobindex = args.job_index - 1
    tracking = args.tracking

    with open(jobfile) as fp:
        jobs = json.load(fp)

    year1 = jobs['jobs'][jobindex]['year1']
    year2 = jobs['jobs'][jobindex]['year2']
    month1 = jobs['jobs'][jobindex]['month1']
    month2 = jobs['jobs'][jobindex]['month2']

    verbose = True  # need set to read as arg in future

    print("running on ICOADS, this is not a test!")

    print('Input file is {}'.format(inputfile))
    print('Running from {} {} to {} {}'.format(month1, year1, month2, year2))
    print('')

    config = configparser.ConfigParser()
    config.read(inputfile)
    icoads_dir = config.get('Directories', 'ICOADS_dir')
    out_dir = config.get('Directories', 'out_dir')
    bad_id_file = config.get('Files', 'IDs_to_exclude')
    version = config.get('Icoads', 'icoads_version')

    print('ICOADS directory = {}'.format(icoads_dir))
    print('ICOADS version = {}'.format(version))
    print('Output to {}'.format(out_dir))
    print('List of bad IDs = {}'.format(bad_id_file))
    print('Parameter file = {}'.format(config.get('Files', 'parameter_file')))
    print('')

    ids_to_exclude = bf.process_bad_id_file(bad_id_file)

    # read in climatology files
    sst_pentad_stdev = clim.Climatology.from_filename(
        config.get('Climatologies', 'Old_SST_stdev_climatology'), 'sst')

    sst_stdev_1 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_box_to_buddy_avg'), 'sst')
    sst_stdev_2 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_ob_to_box_avg'), 'sst')
    sst_stdev_3 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_avg_sampling'), 'sst')

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    print("Reading climatologies from parameter file")
    climlib = ex.ClimatologyLibrary()
    for entry in parameters['climatologies']:
        print("{} {}".format(entry[0], entry[1]))
        climlib.add_field(entry[0], entry[1],
                          clim.Climatology.from_filename(entry[2], entry[3]))

    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        print("INFO({}): {} {}".format(
            datetime.now().time().isoformat(timespec='milliseconds'), year,
            month))

        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        reps = ex.Deck()
        count = 0
        lastday = -99

        for readyear, readmonth in qc.year_month_gen(last_year, last_month,
                                                     next_year, next_month):

            print("INFO({}): {} {}".format(
                datetime.now().time().isoformat(timespec='milliseconds'),
                readyear, readmonth))

            ostia_bg_var = None
            if tracking:
                ostia_bg_var = clim.Climatology.from_filename(
                    config.get('Climatologies',
                               qc.season(readmonth) + '_ostia_background'),
                    'bg_var')

            filename = icoads_dir + '{:4d}-{:02d}.psv'.format(
                readyear, readmonth)

            imma_obj = pd.read_csv(filename,
                                   sep='|',
                                   header=None,
                                   names=[
                                       'YR', 'MO', 'DY', 'HR', 'LAT', 'LON',
                                       'DS', 'VS', 'ID', 'AT', 'SST', 'DPT',
                                       'DCK', 'SLP', 'SID', 'PT', 'UID', 'W',
                                       'D', 'IRF', 'bad_data', 'outfile'
                                   ],
                                   low_memory=False)

            # replace ' ' in ID field with '' (corrections introduce bug)
            imma_obj['ID'].replace(' ', '', inplace=True)
            imma_obj = imma_obj.sort_values(['YR', 'MO', 'DY', 'HR', 'ID'],
                                            axis=0,
                                            ascending=True)
            imma_obj = imma_obj.reset_index(drop=True)

            data_index = imma_obj.index

            rec = IMMA()
            print('INFO({}): Data read, applying first QC'.format(
                datetime.now().time().isoformat(timespec='milliseconds')))
            dyb_count = 0
            for idx in data_index:
                # set missing values to None
                for k, v in imma_obj.loc[idx, ].to_dict().items():
                    rec.data[k] = to_none(v)
                readob = True
                if (not (rec.data['ID'] in ids_to_exclude) and readob
                        and rec.data['YR'] == readyear
                        and rec.data['MO'] == readmonth
                        and rec.data['DY'] is not None):

                    rep = ex.MarineReportQC(rec)
                    del rec

                    rep.setvar('AT2', rep.getvar('AT'))

                    # if day has changed then read in OSTIA field if available and append SST and sea-ice fraction
                    # to the observation metadata
                    if tracking and readyear >= 1985 and rep.getvar(
                            'DY') is not None:
                        if rep.getvar('DY') != lastday:
                            lastday = rep.getvar('DY')
                            y_year, y_month, y_day = qc.yesterday(
                                readyear, readmonth, lastday)

                            #                            ofname = ostia_filename(ostia_dir, y_year, y_month, y_day)
                            ofname = bf.get_background_filename(
                                parameters['background_dir'],
                                parameters['background_filenames'], y_year,
                                y_month, y_day)

                            climlib.add_field(
                                'OSTIA', 'background',
                                clim.Climatology.from_filename(
                                    ofname, 'analysed_sst'))
                            climlib.add_field(
                                'OSTIA', 'ice',
                                clim.Climatology.from_filename(
                                    ofname, 'sea_ice_fraction'))

                        rep_clim = climlib.get_field(
                            'OSTIA', 'background').get_value_ostia(
                                rep.lat(), rep.lon())
                        if rep_clim is not None:
                            rep_clim -= 273.15

                        rep.setext('OSTIA', rep_clim)
                        rep.setext(
                            'ICE',
                            climlib.get_field('OSTIA', 'ice').get_value_ostia(
                                rep.lat(), rep.lon()))
                        rep.setext(
                            'BGVAR',
                            ostia_bg_var.get_value_mds_style(
                                rep.lat(), rep.lon(), rep.getvar('MO'),
                                rep.getvar('DY')))

                    for varname in ['SST', 'AT']:
                        rep_clim = climlib.get_field(
                            varname, 'mean').get_value_mds_style(
                                rep.lat(), rep.lon(), rep.getvar('MO'),
                                rep.getvar('DY'))
                        rep.add_climate_variable(varname, rep_clim)

                    for varname in ['SLP2', 'SHU', 'CRH', 'CWB', 'DPD']:
                        rep_clim = climlib.get_field(varname,
                                                     'mean').get_value(
                                                         rep.lat(), rep.lon(),
                                                         rep.getvar('MO'),
                                                         rep.getvar('DY'))
                        rep.add_climate_variable(varname, rep_clim)

                    for varname in ['DPT', 'AT2', 'SLP']:
                        rep_clim = climlib.get_field(varname,
                                                     'mean').get_value(
                                                         rep.lat(), rep.lon(),
                                                         rep.getvar('MO'),
                                                         rep.getvar('DY'))
                        rep_stdev = climlib.get_field(varname,
                                                      'stdev').get_value(
                                                          rep.lat(), rep.lon(),
                                                          rep.getvar('MO'),
                                                          rep.getvar('DY'))
                        rep.add_climate_variable(varname, rep_clim, rep_stdev)

                    rep.calculate_humidity_variables(
                        ['SHU', 'VAP', 'CRH', 'CWB', 'DPD'])

                    rep.perform_base_qc(parameters)
                    rep.set_qc(
                        'POS', 'month_match',
                        qc.month_match(year, month, rep.getvar('YR'),
                                       rep.getvar('MO')))

                    reps.append(rep)
                    count += 1

                rec = IMMA()
                dyb_count += 1
                if dyb_count % 1000 == 0:
                    print('INFO({}): {} out of {} processed'.format(
                        datetime.now().time().isoformat(
                            timespec='milliseconds'), dyb_count,
                        imma_obj.index.size))

                # icoads_file.close()

    print("INFO({}): Read {} ICOADS records".format(
        datetime.now().time().isoformat(timespec='milliseconds'), count))

    # filter the obs into passes and fails of basic positional QC
    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)

    reps.add_filter(filt)

    if verbose:
        print('INFO ({}) .... Track checking individual ships'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))

        # track check the passes one ship at a time
    count_ships = 0
    for one_ship in reps.get_one_platform_at_a_time():
        one_ship.track_check(parameters['track_check'])
        one_ship.iquam_track_check(parameters['IQUAM_track_check'])
        one_ship.spike_check(parameters['IQUAM_spike_check'])
        one_ship.find_saturated_runs(parameters['saturated_runs'])
        one_ship.find_multiple_rounded_values(
            parameters['multiple_rounded_values'])

        for varname in ['SST', 'AT', 'AT2', 'DPT', 'SLP']:
            one_ship.find_repeated_values(parameters['find_repeated_values'],
                                          intype=varname)

        count_ships += 1

    print("Track checked {} ships".format(count_ships))

    if verbose:
        print('INFO ({}) .... Applying buddy checks'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))
    if verbose:
        print('INFO ({}) ........ SST'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))
        # SST buddy check
    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'is780', 0)
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)
    filt.add_qc_filter('POS', 'trk', 0)
    filt.add_qc_filter('SST', 'noval', 0)
    filt.add_qc_filter('SST', 'freez', 0)
    filt.add_qc_filter('SST', 'clim', 0)
    filt.add_qc_filter('SST', 'nonorm', 0)

    reps.add_filter(filt)

    reps.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3,
                              parameters)
    reps.mds_buddy_check('SST', sst_pentad_stdev,
                         parameters['mds_buddy_check'])

    if verbose:
        print('INFO ({}) ........ NMAT'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))
        # NMAT buddy check
    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'isship', 1)  # only do ships mat_blacklist
    filt.add_qc_filter('AT', 'mat_blacklist', 0)
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)
    filt.add_qc_filter('POS', 'trk', 0)
    filt.add_qc_filter('POS', 'day', 0)
    filt.add_qc_filter('AT', 'noval', 0)
    filt.add_qc_filter('AT', 'clim', 0)
    filt.add_qc_filter('AT', 'nonorm', 0)

    reps.add_filter(filt)

    reps.bayesian_buddy_check('AT', sst_stdev_1, sst_stdev_2, sst_stdev_3,
                              parameters)
    reps.mds_buddy_check('AT', sst_pentad_stdev, parameters['mds_buddy_check'])

    # DPT buddy check #NB no day check for this one
    filt = ex.QC_filter()
    filt.add_qc_filter('DPT', 'hum_blacklist', 0)
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)
    filt.add_qc_filter('POS', 'trk', 0)
    filt.add_qc_filter('DPT', 'noval', 0)
    filt.add_qc_filter('DPT', 'clim', 0)
    filt.add_qc_filter('DPT', 'nonorm', 0)

    reps.add_filter(filt)

    reps.mds_buddy_check('DPT', climlib.get_field('DPT', 'stdev'),
                         parameters['mds_buddy_check'])

    if verbose:
        print('INFO ({}) ........ SLP'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))
        # SLP buddy check
    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)
    filt.add_qc_filter('POS', 'trk', 0)
    filt.add_qc_filter('SLP', 'noval', 0)
    filt.add_qc_filter('SLP', 'clim', 0)
    filt.add_qc_filter('SLP', 'nonorm', 0)

    reps.add_filter(filt)

    reps.mds_buddy_check('SLP', climlib.get_field('SLP', 'stdev'),
                         parameters['slp_buddy_check'])

    extdir = bf.safe_make_dir(out_dir, year, month)
    reps.write_output(parameters['runid'], extdir, year, month)

    if tracking:

        if verbose:
            print('INFO ({}) .... Tracking'.format(
                datetime.now().time().isoformat(timespec='milliseconds')))

            # set QC for output by ID - buoys only and passes base SST QC
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'month_match', 1)
        filt.add_qc_filter('POS', 'isdrifter', 1)

        reps.add_filter(filt)

        idfile = open(extdir + '/ID_file.txt', 'w')
        for one_ship in reps.get_one_platform_at_a_time():

            if len(one_ship) > 0:
                thisid = one_ship.getrep(0).getvar('ID')
                if thisid is not None:
                    idfile.write(thisid + ',' + ex.safe_filename(thisid) +
                                 '\n')
                    one_ship.write_output(parameters['runid'], extdir, year,
                                          month)
        idfile.close()

    del reps
def main(argv):
    '''
    Run the high resolution SST quality control for a range of months.

    Command line options (-config, -year1, -year2, -month1, -month2, -test) are
    parsed with argparse. Directories, the ID exclusion list, the ICOADS version,
    the SST standard deviation climatologies and the JSON parameter file are all
    taken from the configuration file; the SST mean climatology is read from a
    path hard-coded below.

    For every target year/month the ICOADS files for the previous, the target and
    the following month are read. Each record that parses, whose ID is not on the
    exclusion list and whose YR/MO match the file being read, is turned into a
    MarineReportQC, given an SST climatological value and passed through the base
    SST QC. The resulting deck then goes through three further stages:

    track check - one platform (ID) at a time, together with a repeated value
    check on SST

    buddy check - Bayesian and MDS buddy checks on SST

    output - the SST QC flags are written out under the run ID prefixed 'hires_'

    :param argv: not used; argparse reads the options from sys.argv directly
    '''

    # Single-argument parenthesised prints give the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('########################')
    print('Running make_and_full_qc')
    print('########################')

    parser = argparse.ArgumentParser(description='Marine QC system, main program')
    parser.add_argument('-config', type=str, default='configuration.txt', help='name of config file')
    parser.add_argument('-year1', type=int, default=1850, help='First year for processing')
    parser.add_argument('-year2', type=int, default=1850, help='Final year for processing')
    parser.add_argument('-month1', type=int, default=1, help='First month for processing')
    parser.add_argument('-month2', type=int, default=1, help='Final month for processing')
    # -test is still accepted on the command line but its value is not used here
    parser.add_argument('-test', action='store_true', help='run test suite')
    args = parser.parse_args()

    inputfile = args.config
    year1 = args.year1
    year2 = args.year2
    month1 = args.month1
    month2 = args.month2

    print('Input file is  {}'.format(inputfile))
    print('Running from  {} {}  to  {} {}'.format(month1, year1, month2, year2))
    print('')

    config = ConfigParser.ConfigParser()
    config.read(inputfile)

    # NOTE(review): unlike every other input this path is not read from the
    # configuration file
    sst_climatology_file = '/project/mds/HADISST2/OIv2_clim_MDS_6190_0.25x0.25xdaily_365.nc'

    icoads_dir = config.get('Directories', 'ICOADS_dir')
    out_dir = config.get('Directories', 'out_dir')
    bad_id_file = config.get('Files', 'IDs_to_exclude')
    version = config.get('Icoads', 'icoads_version')

    print('ICOADS directory = {}'.format(icoads_dir))
    print('ICOADS version = {}'.format(version))
    print('List of bad IDs = {}'.format(bad_id_file))
    print('')

    ids_to_exclude = process_bad_id_file(bad_id_file)

    # read in climatology files
    sst_pentad_stdev = clim.Climatology.from_filename(
        config.get('Climatologies', 'Old_SST_stdev_climatology'), 'sst')

    sst_stdev_1 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_box_to_buddy_avg'), 'sst')
    sst_stdev_2 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_ob_to_box_avg'), 'sst')
    sst_stdev_3 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_avg_sampling'), 'sst')

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    climlib = ex.ClimatologyLibrary()
    climlib.add_field('SST', 'mean',
                      clim.Climatology.from_filename(sst_climatology_file, 'temperature'))

    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        print('{} {}'.format(year, month))

        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        reps = ex.Deck()
        count = 0

        # read the target month plus one month either side of it
        for readyear, readmonth in qc.year_month_gen(last_year,
                                                     last_month,
                                                     next_year,
                                                     next_month):

            print('{} {}'.format(readyear, readmonth))

            filename = icoads_filename(icoads_dir, readyear,
                                       readmonth, version)

            try:
                icoads_file = gzip.open(filename, "r")
            except IOError:
                print("no ICOADS file  {}  for  {} {}".format(filename, readyear, readmonth))
                continue

            # try/finally so the file is closed even if a record blows up
            try:
                for line in icoads_file:

                    # a fresh record for every line, so nothing is carried over
                    # from the previous observation
                    rec = IMMA()

                    try:
                        rec.readstr(line)
                    except Exception:
                        # An unreadable record is skipped before rec.data is
                        # consulted: a failed parse may not have set 'ID'.
                        print("Rejected ob {}".format(line))
                        continue

                    # keep the ob only if its ID is not on the exclusion list and
                    # its date matches the file it was read from
                    if (rec.data['ID'] not in ids_to_exclude and
                            rec.data['YR'] == readyear and
                            rec.data['MO'] == readmonth):

                        rep = ex.MarineReportQC(rec)
                        rep_clim = climlib.get_field('SST', 'mean').get_value(
                            rep.lat(), rep.lon(), rep.getvar('MO'), rep.getvar('DY'))
                        rep.add_climate_variable('SST', rep_clim)
                        rep.perform_base_sst_qc(parameters)
                        reps.append(rep)
                        count += 1
            finally:
                icoads_file.close()

        print("Read  {}  ICOADS records".format(count))

        # filter the obs into passes and fails of basic positional QC
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)

        reps.add_filter(filt)

        # track check the passes one ship at a time
        count_ships = 0
        for one_ship in reps.get_one_platform_at_a_time():

            one_ship.track_check(parameters['track_check'])
            one_ship.find_repeated_values(parameters['find_repeated_values'], intype='SST')
            count_ships += 1

        print("Track checked  {}  ships".format(count_ships))

        # SST buddy check
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'is780', 0)
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk', 0)
        filt.add_qc_filter('SST', 'noval', 0)
        filt.add_qc_filter('SST', 'freez', 0)
        filt.add_qc_filter('SST', 'clim', 0)
        filt.add_qc_filter('SST', 'nonorm', 0)

        reps.add_filter(filt)

        reps.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3, parameters)
        reps.mds_buddy_check('SST', sst_pentad_stdev, parameters['mds_buddy_check'])

        varnames_to_print = {'SST': ['bud', 'clim', 'nonorm', 'freez', 'noval',
                                     'nbud', 'bbud', 'rep', 'spike', 'hardlimit']}

        reps.write_qc('hires_' + parameters['runid'], out_dir, year, month, varnames_to_print)

        # drop the deck before the next month is read in
        del reps
def main(argv):
    '''
    The buddy check compares observations to other nearby observations. If the observation differs
    substantially from the neighbour-average, the observation will be rejected.

    Options are parsed from argv with getopt: -i/--ifile names the configuration
    file (default 'configuration.txt'), --year1 and --year2 give the first and
    final year to process and -h prints the usage line. Both years must be
    supplied.

    For every month from January of year1 to December of year2, and for SST and
    MAT in turn, the reports that pass the prerequisite QC flags are read from the
    data base, the MDS buddy check is run on them and the resulting buddy flag is
    written back for the reports belonging to the target month. Changes are
    committed once per month.

    :param argv: list of command line arguments, without the program name
    '''

    # Single-argument parenthesised prints give the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('###################')
    print('Running buddy_check')
    print('###################')

    inputfile = 'configuration.txt'
    # there is no default year range; both ends must come from the command line
    year1 = None
    year2 = None

    usage = ('Usage Make_DB.py -i <configuration_file> '
             '--year1 <start year> --year2 <end year>')

    try:
        opts, _ = getopt.getopt(argv, "hi:",
                                ["ifile=",
                                 "year1=",
                                 "year2="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            # show the options this program actually accepts
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-x", "--year1"):
            try:
                year1 = int(arg)
            except ValueError:
                sys.exit("Failed: year1 not an integer")
        elif opt in ("-y", "--year2"):
            try:
                year2 = int(arg)
            except ValueError:
                sys.exit("Failed: year2 not an integer")

    # stop with a clear message rather than failing later on an unset variable
    if year1 is None or year2 is None:
        sys.exit("Failed: --year1 and --year2 must both be supplied")

    print('Input file is  {}'.format(inputfile))
    print('Running from  {}  to  {}'.format(year1, year2))
    print('')

    config = qc.get_config(inputfile)

    sst_climatology_file = config['SST_climatology']
    nmat_climatology_file = config['MAT_climatology']
    icoads_dir = config['ICOADS_dir']
    sst_stdev_climatology_file = config['Old_SST_stdev_climatology']
    data_base_host = config['data_base_host']
    data_base_name = config['data_base_name']

    print('Data base host = {}'.format(data_base_host))
    print('Data base name = {}'.format(data_base_name))
    print('SST climatology = {}'.format(sst_climatology_file))
    print('NMAT climatology = {}'.format(nmat_climatology_file))
    print('ICOADS directory = {}'.format(icoads_dir))
    print('')

    # read in the pentad climatology of standard deviations
    climatology = Dataset(sst_stdev_climatology_file)
    sst_pentad_stdev = climatology.variables['sst'][:]

    # For each checked variable: the extra QC flags a report must pass before it
    # is buddy checked, the QC table to update and the name of the buddy flag
    # (which is also the name of the report attribute holding the outcome).
    check_settings = {
        'SST': (['no_sst',
                 'sst_below_freezing',
                 'no_sst_normal',
                 'sst_climatology_fail'],
                'sst_qc',
                'sst_buddy_fail'),
        'MAT': (['no_mat',
                 'no_mat_normal',
                 'mat_climatology_fail'],
                'mat_qc',
                'mat_buddy_fail'),
    }

    connection = MySQLdb.connect(host=data_base_host,
                                 user='******',
                                 db=data_base_name)
    # try/finally so the connection is released even if a month fails
    try:
        cursor = connection.cursor()   # read
        cursor2 = connection.cursor()  # write

        for years, months in qc.year_month_gen(year1, 1, year2, 12):

            # want to get a month either side of the
            # target month, which may be in different years
            last_year, last_month = qc.last_month_was(years, months)
            next_year, next_month = qc.next_month_is(years, months)

            print('{} {}'.format(years, months))

            first_year = min([last_year, years, next_year])
            final_year = max([last_year, years, next_year])

            # keep the years to query within 1850-2014
            if first_year < 1850:
                first_year = 1850
            if final_year > 2014:
                final_year = 2014

            # first and last julian days are +- approximately one month
            month_lengths = qc.month_lengths(years)
            jul1 = qc.jul_day(years, months, 1) - 25
            jul2 = qc.jul_day(years, months, month_lengths[months - 1]) + 25

            for check_variable in ['SST', 'MAT']:

                prerequisite_flags, qc_table, buddy_flag = check_settings[check_variable]

                reps = []
                for yyy in range(first_year, final_year + 1):

                    qcfilter = db.Quality_Control_Filter()
                    qcfilter.jul1 = jul1
                    qcfilter.jul2 = jul2
                    qcfilter.set_multiple_qc_flags_to_pass(['bad_position',
                                                            'bad_date',
                                                            'blacklist'])
                    # hand over a copy so the lookup table is never shared
                    qcfilter.set_multiple_qc_flags_to_pass(list(prerequisite_flags))

                    sql_request = db.build_sql_query(yyy, qcfilter)

                    cursor.execute(sql_request)
                    numrows = cursor.rowcount

                    for _ in range(numrows):
                        rows = cursor.fetchone()
                        reps.append(qc.MarineReport.report_from_array(rows))

                print("{}  observations read in".format(len(reps)))

                # Do the buddy check
                # NOTE(review): the SST pentad standard deviations are used for
                # the MAT check as well - confirm this is intended
                qcs = qc_buddy_check.mds_buddy_check(reps,
                                                     sst_pentad_stdev,
                                                     check_variable)

                # put updated QC flags into data base, target month only
                for rep in reps:
                    if rep.month == months:
                        db.update_db_qc_single_flag(rep,
                                                    getattr(rep, buddy_flag),
                                                    qc_table,
                                                    buddy_flag,
                                                    years,
                                                    cursor2)

                print("Of " + str(len(qcs)) + " observations " +
                      str(np.sum(qcs)) + " failed " + check_variable +
                      " buddy check")

            connection.commit()  # Each month
            # db.report_qc_counts(cursor, years, months)
    finally:
        connection.close()

    print("All Done :)")
def main(argv):
    '''
    This program builds the marine data base which will be used to store the subset of ICOADS used in QC and 
    other data processing. The current version reads in IMMA1 data from ICOADS.2.5.1 and the UID is used as the 
    primary key for the data base so that it can be easily matched to individual obs if need be.
    
    #KW added para
    The database is now just a set of ascii files for each year/month. Later it may be the SQL database.

    The first step of the process is to read in the SST and MAT climatologies from file. These are 1degree latitude 
    by 1 degree longitude by 73 pentad fields in NetCDF format. The data are read into numpy arrays.

    Next a connection is made to the data base, which may or may not already exist. If it does not exist, a database 
    will be created.
    
    The program then loops over all years and months and DROPs existing tables for each year if they already exist and 
    then recreates them. It then loops over all months in the year, opens the appropriate IMMA file and reads in 
    the data one observation at a time.
    '''
    
    print '########################'
    print 'Running make_and_full_qc'
    print '########################'
    
    inputfile = 'configuration.txt'
    month1 = 1
    month2 = 1
    year1 = 1880
    year2 = 1880
# KW Querying second instance of inputfile - I have commented this out for now    
#    inputfile = 'configuration_local.txt'
    
    try:
        opts, args = getopt.getopt(argv, "hi:", 
                                   ["ifile=", 
                                    "year1=", 
                                    "year2=",
                                    "month1=",
                                    "month2="])
    except getopt.GetoptError:
# KW changed Make_DB.py to make_and_full_qc.py
        print 'Usage make_and_full_qc.py -i <configuration_file> '+\
        '--year1 <start year> --year2 <end year> '+\
        '--month1 <start month> --month2 <end month>'
        sys.exit(2)

    inputfile, year1, year2, month1, month2 = qc.get_arguments(opts)

    print 'Input file is ', inputfile
    print 'Running from ', year1, ' to ', year2
    print ''

    config = qc.get_config(inputfile)

# KW Added a 'switch' to tell the code whether to run in HadISDH only (HadISDHSwitch == True) mode or 
# full mode (HadISDHSwitch == False)
    HadISDHSwitch = config['HadISDHSwitch']

    sst_climatology_file  = config['SST_climatology'] 
    nmat_climatology_file = config['MAT_climatology'] 
# KW Added climatology files for the humidity variables 
    at_climatology_file  = config['AT_climatology']
    dpt_climatology_file  = config['DPT_climatology']
    shu_climatology_file  = config['SHU_climatology']
    vap_climatology_file  = config['VAP_climatology']
    crh_climatology_file  = config['CRH_climatology']
    cwb_climatology_file  = config['CWB_climatology']
    dpd_climatology_file  = config['DPD_climatology']
# KW Added climatology file for the SLP which is needed if no SLP ob exists, or if it has failed qc - or if we choose to derive humidity using climatological P (which we have)
    slp_climatology_file  = config['SLP_climatology']
    icoads_dir            = config['ICOADS_dir'] 
#KW Added the 'recent' ICOADS dir for files 2015+
    recent_icoads_dir            = config['RECENT_ICOADS_dir'] 
    bad_id_file           = config['IDs_to_exclude']
# KW added an item for the database dir to write out the QC'd ascii data to - hijacking SQL data_base_dir for now
    data_base_dir	  = config['data_base_dir']
# KW added an item as a suffix for the output file name to note which iteration we're on
    output_suffix         = config['output_suffix']    

# KW Noting this is set to read the OLD SST stdevs - nothing reads in the newer OSTIA one yet.       
    sst_stdev_climatology_file  = config['Old_SST_stdev_climatology']
    
    sst_stdev_1_file = config['SST_buddy_one_box_to_buddy_avg']
    sst_stdev_2_file = config['SST_buddy_one_ob_to_box_avg']
    sst_stdev_3_file = config['SST_buddy_avg_sampling']

# KW added standard deviation files for AT and DPT - for MDSKate_buddy_check
    at_stdev_climatology_file  = config['AT_stdev_climatology']
    dpt_stdev_climatology_file  = config['DPT_stdev_climatology']
    
# KW Added a look for hardwired limits passed through the config file or set to None
    if ('HardLimits' in config): 
	HardLimit = np.float(config['HardLimits'])
    else:
        HardLimit = None	   
    print "This is the provided HardLimit: ",HardLimit
    #pdb.set_trace()

    print 'SST climatology =', sst_climatology_file
    print 'NMAT climatology =', nmat_climatology_file
# KW Added climatology files for the humidity variables 
    print 'DPT climatology =', dpt_climatology_file
    print 'SHU climatology =', shu_climatology_file
    print 'VAP climatology =', vap_climatology_file
    print 'CRH climatology =', crh_climatology_file
    print 'CWB climatology =', cwb_climatology_file
    print 'DPD climatology =', dpd_climatology_file
## KW Added climatology files for SLP for calculation of humidity variables if no good quality SLP ob exists
    print 'SLP climatology =', slp_climatology_file
    print 'ICOADS directory =', icoads_dir
# KW added 'recent' icoads dir
    print 'RECENT ICOADS directory =', recent_icoads_dir
    print 'List of bad IDs =', bad_id_file 
# KW added an item for the database dir to write out the QC'd ascii data to - hijacking SQL data_base_dir for now
    print 'QCd Database directory =', data_base_dir 
    print 'QCd File Suffix =', output_suffix 
    print ''

    ids_to_exclude = process_bad_id_file(bad_id_file)

#read in climatology files
    climsst = read_climatology(sst_climatology_file, 'sst')
    climnmat = read_climatology(nmat_climatology_file, 'nmat')
# KW Added climatology read in files for the humidity variables
    climat = read_climatology(at_climatology_file, 't2m_clims')
    climdpt = read_climatology(dpt_climatology_file, 'td2m_clims')
    climshu = read_climatology(shu_climatology_file, 'q2m_clims')
    climvap = read_climatology(vap_climatology_file, 'e2m_clims')
    climcrh = read_climatology(crh_climatology_file, 'rh2m_clims')
    climcwb = read_climatology(cwb_climatology_file, 'tw2m_clims')
    climdpd = read_climatology(dpd_climatology_file, 'dpd2m_clims')
## KW Added climatology read in files for SLP for calculating humidity variabls if no SLP value exists
    climslp = read_climatology(slp_climatology_file, 'p2m_clims')

# KW Note that if this points to OLD_SST_stdev_climatology then it is a 73,180,360 array whereas the SST_stdev_climatology file is just 180,360
    sst_pentad_stdev = read_climatology(sst_stdev_climatology_file, 'sst')
    
    sst_stdev_1 = read_climatology(sst_stdev_1_file, 'sst')
    sst_stdev_2 = read_climatology(sst_stdev_2_file, 'sst')
    sst_stdev_3 = read_climatology(sst_stdev_3_file, 'sst')

# KW added standard deviation files for AT and DPT - for MDSKate_buddy_check
    at_pentad_stdev = read_climatology(at_stdev_climatology_file, 't2m_stdevs')
    dpt_pentad_stdev = read_climatology(dpt_stdev_climatology_file, 'td2m_stdevs')
    
    print 'Read climatology files'

    tim00 = time.time()

    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        tim0 = time.time()

        print year, month

        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        if last_year < 1850:
            last_year = 1850 # KW don't understand why last year forced to be 1850 yet
            last_month = 1

        print last_year, last_month, next_year, next_month

        reps = ex.Deck()
        count = 0

# KW This takes a long time to read in each year/month and process
# For every candidate year/month the year/month before and after are also read in
# Can we store the candidate year/month and following year/month for the next loop?
# Hopefully there will be enough memory on spice
# HOWEVER - IF WE RUN MANY YEARS IN PARALELL THEN OK TO READ IN EACH TIME

        for readyear, readmonth in qc.year_month_gen(last_year, 
                                                     last_month, 
                                                     next_year, 
                                                     next_month):

            print readyear, readmonth

            syr = str(readyear)
            smn = "%02d" % (readmonth)

# KW THIS BIT IS FOR 2.5.0/1    
#            filename = icoads_dir+'/R2.5.1.'+syr+'.'+smn+'.gz'
# KW FOUND A BUG - changed 'year' to 'readyear' below because it was trying to 
# read R2.5.2.2007.12.gz because 'year'=2008, 'month'=1
# KW Now added a catch for 'recent' years - at present this is anything from 2015 onwards - data only available in IMMA (not IMMA2) format - no UID!
#            if ((readyear > 2007) & (readyear < 2015)):
#                filename = icoads_dir+'/R2.5.2.'+syr+'.'+smn+'.gz'
#            if (readyear >= 2015):
#                filename = recent_icoads_dir+'/IMMA.'+syr+'.'+smn+'.gz'
# KW THIS BIT IS FOR 3.0.0/1
            filename = icoads_dir+'/IMMA1_R3.0.0_'+syr+'-'+smn+'.gz'
            if (readyear >= 2015):
                filename = recent_icoads_dir+'/IMMA1_R3.0.1_'+syr+'-'+smn+'.gz'
    
            icoads_file = gzip.open(filename,"r")

# KW Noted that this creates an object of whole month of IMMA data separated into all available parameters from all available attachments
# The rec.read bit later could be speeded up by ignoring the attachments we are not interested in in the first place?    
# The rec object has a .data dictionary of all variables (see IMMA2.py for variable IDs/keys
            rec = IMMA()
   
            EOF = False
    
            while not(EOF):

#need to wrap the read in a exception catching thingy 
#becasuse there are some IMMA records which contain control 
#characters
                try:
                    result = rec.read(icoads_file)
                    if result == None:
                        EOF = True
                        # KW are we sure this isn't doing anything silly later when rec is overwritten with a new rec - could
			# this overwrite ids_to_exclude[0]?
			rec.data['ID'] = ids_to_exclude[0]
                except:
                    rec.data['ID'] = ids_to_exclude[0]


                if not(rec.data['ID'] in ids_to_exclude):

#strip everything out of the IMMA record except what we # KW (Kate Robert and John)# need
# KW this should work for both IMMA and IMMA1 e.g. C4 (IMMA) and C7 (IMMA1) use same 'key's so it 'should' find
# them because both are encoded in IMMA2.py
		    keys = []
                    for key in rec.data:
                        keys.append(key)
                    for key in keys:
# KW Added quite a few things in here - assume these don't have to be all from attachment 0 because UID isn't
# Assume they don't have to be in a particular order either
# I've put them in the order they appear in the attachments
# See: RequiredIMMAColumnsforHadISDH.xlsx
# Only a few of these will be written out but they are useful in the QC and bias adjustment process
# May remove some of these later if they are not useful - to save time/memory
#                        if not(key in ['YR','MO','DY','HR','LAT','LON',
#                                       'SST','AT','DCK','ID','PT','SI',
#                                       'SIM','DS','VS','SLP','UID','SID']):
                        if not(key in ['YR','MO','DY','HR','LAT','LON',
				       'DS','VS','II','ID','C1',
				       'DI','D','WI','W','VI','VV','SLP',
				       'IT','AT','WBTI','WBT','DPTI','DPT','SI','SST',
				       'DCK','SID','PT','DUPS',
				       'COR','TOB','TOT','EOT','TOH','EOH',
				       'SIM','LOV','HOP','HOT','HOB','HOA','SMF',
				       'UID']):
                            if key in rec.data: del rec.data[key]
# KW So I've noticed that if one of the listed keys above isn't in the ob then a data['key'] isn't
# set up (makes sense!) so when I come to print them later it all goes to pot
# So, I loop through the non-core0 keys here to add blank keys where they are missing
# KW Added 'UID' to this list because it is not present in the RECENT_ICOADS (2015+)
		    for inkey in ['DUPS','COR','TOB','TOT','EOT',
		                  'TOH','EOH','SIM','LOV','HOP','HOT','HOB','HOA','SMF','UID']:
		        if not(inkey in keys):
			    #print("Missing key: ",inkey)
			    rec.data[inkey] = None
			    					
                    rep = ex.MarineReport(rec)
                    del rec

#************HadISDH ONLY*******************************
# KW Added a catch here to check the platform type and whether there is both a T (AT) and DPT  present.
# Only keep the ob if it is from a ship (0,1,2,3,4,5) or moored platform/buoy (6,8,9,10,15) and has 
# AT and DPT present.
# This may not be desirable for a full run but should save time/memory for HadISDH
# If HadISDHSwitch == True then the ob needs to pass the test else all obs are processed
# No QC performed yet so cannot call get_qc - qc.value_check returns 0 if present and 1 if noval
# Previously I had also pulled through PT=14 but this can be a coastal or island station - so not what we want.
# KW Oct 2016 - I've now decided that future runs shoudl NOT include any platforms. We don't have height
# info and they can vary from <10 to >200m so its just too screwy
#		    if (not (HadISDHSwitch)) | ((rep.data['PT']  in [0,1,2,3,4,5,6,8,9,10,15]) & 
		    if (not (HadISDHSwitch)) | ((rep.data['PT']  in [0,1,2,3,4,5,6,8]) & 
		                                (qc.value_check(rep.getvar('AT')) == 0) & 
						(qc.value_check(rep.getvar('DPT')) == 0)):

# KW TESTED: WORKS IF VALUES ARE BLANK AT LEAST
# KW CHECK THAT THIS KICKS OUT OBS WITH REPORTED MISSING VALUES (e.g. -99.9 or 99.9) FOR AT or DPT		    
#*******************************************************

# KW Call my rep.setvar routine that I built into the MarineReport in Extended_IMMA.py
# Use this to add blank var containers for the humidity variables that are calculated 
# later
                        rep.setvar(['SHU','VAP','CRH','CWB','DPD'])

# KW Get climatologies for slp to calculate humidity values if no good quality qc ob exists
                        rep_slp_clim = get_clim(rep, climslp)
			#print('SLP: ',rep_slp_clim)
			#if (count == 10):
			#    pdb.set_trace()
                        rep.add_climate_variable('SLP', rep_slp_clim)

# KW Calculate humidity variables here - so we can then kick out anything really silly e.g. RH>150
# Very silly values can cause longer line lengths at output which is an extra problem for post processing
# For the longer term these could be set to missing but we just want to focus on 'good' humidity obs for now
# Use my new routine as part of the Extended_IMMA MarineReport class rep.calcvar() 
# This routine returns values as None if there is no climslp or if RH is < 0 or > 150.
                        rep.calcvar(['SHU','VAP','CRH','CWB','DPD'])
			
# Now we have the checker for very silly values - which will just break the loop
# No RH - means that there is either an AT or DPT missing
# RH must be between 0 and 150
# AT must be between -80 and 65
# DPT must be between -80 and 65
# SHU must be greater than 0.0
# Inadvertently, this kicks out any ob for which no climatology is available - the ones that would later fail pos or date checks.
# Later on we may change this to just set the humidity values to missing rather than delete the ob. SST might be ok after all.
                        if (rep.getvar('CRH') == None):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
                        if ((rep.getvar('CRH') <= 0.0) | (rep.getvar('CRH') > 150.0)):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
                        if ((rep.getvar('AT') < -80.) | (rep.getvar('AT') > 65.)):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
                        if ((rep.getvar('DPT') < -80.) | (rep.getvar('DPT') > 65.)):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
                        if (rep.getvar('SHU') <= 0.0):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
					
# Get climatologies for all variables (for outlier test and anomaly creation [done in buddy check and for final print out] - if AT or DPT are missing (None) then do not carry on processing that variable
# If we're using OBSclims then there are missing data which will be returned as None (NOT A STRING!!!)
# KW Added bit to find and store climatological stdev for AT and DPT - for outlier test 
                        rep_sst_clim = get_clim(rep, climsst)
                        rep.add_climate_variable('SST', rep_sst_clim)

# KW Set to read in ERA (or OBS+ERA) clim file for AT (not NMAT)
#                        rep_mat_clim = get_clim(rep, climnmat)
                        rep_mat_clim = get_clim(rep, climat)
                        rep_mat_stdev = get_clim(rep, at_pentad_stdev)
			#print(rep_mat_clim,rep_mat_stdev)
			#pdb.set_trace()
## KW added to test clim value pulled out
#			print(rep.getvar('UID'),rep.getvar('AT'),rep_mat_clim,rep.getnorm('AT'))			
#			print(rep.getvar('UID'),rep.getvar('AT'),rep_mat_stdev,rep.getstdev('AT'))			
#			if (count == 10):
#			    pdb.set_trace() 
## KW This seems to be pulling out the correct climatological value 		    
                        if ((rep_mat_clim == None) | (rep_mat_stdev == None)):
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('AT', rep_mat_clim)
                            rep.add_stdev_variable('AT', rep_mat_stdev)

                        rep_dpt_clim = get_clim(rep, climdpt)
                        rep_dpt_stdev = get_clim(rep, dpt_pentad_stdev)
                        if ((rep_dpt_clim == None) | (rep_dpt_stdev == None)):
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('DPT', rep_dpt_clim)
                            rep.add_stdev_variable('DPT', rep_dpt_stdev)

                        rep_shu_clim = get_clim(rep, climshu)
                        if (rep_shu_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('SHU', rep_shu_clim)

			rep_vap_clim = get_clim(rep, climvap)
                        if (rep_vap_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('VAP', rep_vap_clim)

		        rep_crh_clim = get_clim(rep, climcrh)
                        if (rep_crh_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('CRH', rep_crh_clim)

			rep_cwb_clim = get_clim(rep, climcwb)
                        if (rep_cwb_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('CWB', rep_cwb_clim)

			rep_dpd_clim = get_clim(rep, climdpd)
                        if (rep_dpd_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('DPD', rep_dpd_clim)
					
#Deck 701 has a whole bunch of otherwise good obs with missing Hours.
#Set to 0000UTC and recalculate the ob time
                        if (rep.getvar('DCK') == 701 and 
                            rep.getvar('YR') < 1860 and 
                            rep.getvar('HR') == None):
                            rep.data['HR'] = 0
                            rep.calculate_dt()

# KW Added a HardLimit variable that has to be passed to the base_qc_report
                        #rep = base_qc_report(rep)
                        rep = base_qc_report(rep,HardLimit)

#			print(rep.getvar('ID'),rep.getvar('AT'),rep.getvar('DPT'),rep.getvar('SHU'),rep.getvar('CRH'),rep.getvar('VAP'))
#                        pdb.set_trace()

                        reps.append(rep)
                        count += 1

                rec = IMMA()

            icoads_file.close()

        tim1 = time.time()
        print count, " obs read and base QC ", tim1-tim0
        
#filter the obs into passes and fails of basic positional QC        
# KW NOtes that this uses the month before and after to apply track check - and so actually spends time applying
# track check to the month before and month after too, which will then be ignored and redone later, with its following month
# Is there scope to save effort here by only checking the candidate month while still passing the surrounding months for info
        reps.sort()
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'date',   0)
        filt.add_qc_filter('POS', 'pos',    0)
        filt.add_qc_filter('POS', 'blklst', 0)
        passes, reps = filt.split_reports(reps)
        passes.sort()

        tim2 = time.time()
        print "obs filtered and sorted in ", tim2-tim1, len(reps)+len(passes)

# KW So in here we could put some kind of parsing loop to say that if you are looping through more than one month
# then you could save the candidate and previous month

# KW ALSO NOW ONLY CARRY ON WITH THOSE OBS THAT PASS BASE QC (date, pos, blacklist)
# KW commented out the following:
##all fails pass track check 
#        reps.set_qc('POS', 'trk', 0)
#        reps.set_qc('POS', 'few', 0)
#        reps.set_qc('SST', 'rep', 0)
#        reps.set_qc('AT',  'rep', 0)
## KW Added for DPT
#        reps.set_qc('DPT',  'rep', 0)
#	reps.set_qc('DPT', 'repsat', 0)
# KW End of commenting out
# KW now clear and reset reps so that it gets overwritten and filled with only passes
        del reps
	reps = ex.Deck()

#track check the passes one ship at a time
        for one_ship in passes.get_one_ship_at_a_time():
            one_ship.track_check()
# KW I don't think we need to spend time doing this for SST so have commented out
#            one_ship.find_repeated_values(threshold=0.7, intype='SST')
# KW FOr AT and DPT this procedure now also looks at the proportion of obs in a track (>20 obs - same as rep value check) that have .0 precision
# Where >=50% obs end in .0 the ATround or DPTround flag is set to 1
            one_ship.find_repeated_values(threshold=0.7, intype='AT')
# KW Added for DPT
# KW For DPT this QC procedure now also searches for persistant streaks of 100% RH (AT == DPT) and flags repsat
            one_ship.find_repeated_values(threshold=0.7, intype='DPT')

            for rep in one_ship.rep_feed():
                rep.reset_ext()
                reps.append(rep)

        del passes

        reps.sort()

        tim3 = time.time()
        print "obs track checked in ", tim3-tim2, len(reps)

#*******************************
# KW Commented out for now to save time on debug
##SST buddy check
## KW NOtes that this uses the month before and after to apply track check - and so actually spends time applying
## track check to the month before and month after too, which will then be ignored and redone later, with its following month
## Is there scope to save effort here by only checking the candidate month while still passing the surrounding months for info
#        filt = ex.QC_filter()
#        filt.add_qc_filter('POS', 'date',   0)
#        filt.add_qc_filter('POS', 'pos',    0)
#        filt.add_qc_filter('POS', 'blklst', 0)
#        filt.add_qc_filter('POS', 'trk',    0)
#        filt.add_qc_filter('SST', 'noval',  0)
#        filt.add_qc_filter('SST', 'freez',  0)
#        filt.add_qc_filter('SST', 'clim',   0)
#        filt.add_qc_filter('SST', 'nonorm', 0)
#
## KW Notes splitting marine obs into passes and fails
#        passes, reps = filt.split_reports(reps)
#
## KW Thinks this only buddy checks those obs that pass the filter of QC above
#        passes.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3)
#        passes.mds_buddy_check('SST', sst_pentad_stdev)
#
#******************************************
## KW Thinks all fails obs that do not pass teh QC filter above are not buddy checked - they are set to 0
## which means pass but should not be used later because they fail one of the other basic checks
#        reps.set_qc('SST', 'bbud', 0)
#        reps.set_qc('SST', 'bud',  0)

#****************************************
# KW Commented out to save time
#        for i in range(0, len(passes)):
#            rep = passes.pop(0)
#            reps.append(rep)
#
#        del passes
#
#        reps.sort()
#****************************************
        tim4 = time.time()
        print "obs SST buddy checked in ", tim4-tim3, len(reps)

#NMAT buddy check
# KW NOtes that this uses the month before and after to apply track check - and so actually spends time applying
# track check to the month before and month after too, which will then be ignored and redone later, with its following month
# Is there scope to save effort here by only checking the candidate month while still passing the surrounding months for info?
# For now I've made mdsKATE_buddy_check which only applies actual check to candidate month and year. It also uses actual pentad
# for that time of year rather than the average pentad stdev.
        filt = ex.QC_filter()
## KW Commented out date/pos/blklst as these have already been filtered out
#        filt.add_qc_filter('POS', 'date',   0)
#        filt.add_qc_filter('POS', 'pos',    0)
#        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk',    0)
# KW commented out because we want to try to use all obs for AT and SPT
#        filt.add_qc_filter('POS', 'day',    0)
# KW Commented out because we've already filtered so that only present obs are retained
#        filt.add_qc_filter('AT',  'noval',  0)
        filt.add_qc_filter('AT',  'clim',   0)
        filt.add_qc_filter('AT',  'nonorm', 0)
# KW Notes that 'reps' are those obs that have failed one of the tests in the filter above
        passes, reps = filt.split_reports(reps)

# KW Notes that passes is an object containing a months worth of marine obs that pass (flag=0) for all above filters
# Both the bayesian buddy check and the mds buddy check test for distance to neighbours in space and time and flag
# with a 1 where it is too great/fails.
# KW NOT GOING TO APPLY BAYESIAN BUDDY CHECK BECAUSE WE CAN'T USE IT FOR DPT AND 
# ITS EXPERIMENTAL???
#        passes.bayesian_buddy_check('AT', sst_stdev_1, sst_stdev_2, sst_stdev_3)
# KW Commented out original mds_buddy_check to use mdsKATE_buddy_check instead (like DPT) which uses the seasonal stdev
# rather than the average and only applies buddy check to candidate month
# ALSO = we now use clim T stdevs from ERA (will eventually be obs+ERA combo?)
#        passes.mds_buddy_check('AT', sst_pentad_stdev)
# KW Added a HardLimit variable that has to be passed to mdsKATE_buddy_check for the stdev multiplier
        passes.mdsKATE_buddy_check('AT',  at_pentad_stdev, year, month, HardLimit)

# KW - all fails (reps) skip the buddy check because there is no point in spending further time
# buddy checking them (same reasoning as for the track checks). Their AT 'bud' flag is set to 8 below, not 0.
# KW NOT GOING TO APPLY BAYESIAN BUDDY CHECK BECAUSE WE CAN'T USE IT FOR DPT AND 
# ITS EXPERIMENTAL???
#        reps.set_qc('AT', 'bbud', 8)
        reps.set_qc('AT', 'bud', 8)

        for i in range(0, len(passes)):
            rep = passes.pop(0)
            reps.append(rep)

        del passes

        reps.sort()

        tim5 = time.time()
        print "obs MAT buddy checked in ", tim5-tim4, len(reps)

# Don't think we need to set - if its not set it will be 9!
## KW Added buddy check for DPT - NOT RUNNING BAYESIAN BECAUSE WE DON'T HAVE APPROPRIATE DATA - SET FLAG TO 8!
#        reps.set_qc('DPT', 'bbud', 8)

#DPT buddy check
# KW NOtes that this uses the month before and after to apply track check - and so actually spends time applying
# track check to the month before and month after too, which will then be ignored and redone later, with its following month
# Is there scope to save effort here by only checking the candidate month while still passing the surrounding months for info
        filt = ex.QC_filter()
# KW commented out date, pos, blklst because we've already got rid of those that fail these
#        filt.add_qc_filter('POS', 'date',   0)
#        filt.add_qc_filter('POS', 'pos',    0)
#        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk',    0)
# KW Commented out day because we want to try to use all obs for DPT and AT
#        filt.add_qc_filter('POS', 'day',    0) # Hmmm so only checking the nightime obs
# KW Commented out because we've already filtered so that only present obs are retained
#        filt.add_qc_filter('DPT',  'noval',  0)
        filt.add_qc_filter('DPT',  'clim',   0)
# KW commented out nonorm because there will always be a norm (if using ERA or combo ERA+obs)
#        filt.add_qc_filter('DPT',  'nonorm', 0) # KW could change this to ERANorm when we have actual climatologies from data - more useful because there always will be a norm from ERA
# KW Notes that 'reps' are those obs that have failed one of the tests in the filter above
        passes, reps = filt.split_reports(reps)

# KW Notes that passes is an object containing a months worth of marine obs that pass (flag=0) for all above filters
# Both the bayesian buddy check and the mds buddy check test for distance to neighbours in space and time and flag
# with a 1 where it is too great/fails.
#        passes.bayesian_buddy_check('DPT', sst_stdev_1, sst_stdev_2, sst_stdev_3)
#        passes.mds_buddy_check('DPT', dpt_pentad_stdev)
# KW Added a HardLimit variable that has to be passed to mdsKATE_buddy_check for the stdev multiplier
# KW Using Kate's version of MDS buddy check now which has a stdev for each pentad and only checks candidate month
        passes.mdsKATE_buddy_check('DPT', dpt_pentad_stdev, year, month, HardLimit)

# KW - all fails (reps) skip the buddy check because there is no point in spending further time
# buddy checking them (same reasoning as for the track checks). Their DPT 'bud' flag is set to 8 below, not 0.
#        reps.set_qc('DPT', 'bbud', 8)
        reps.set_qc('DPT', 'bud', 8) # KW set as 8 for now

        for i in range(0, len(passes)):
            rep = passes.pop(0)
            reps.append(rep)

        del passes

        reps.sort()

        tim6 = time.time()
        print "obs DPT buddy checked in ", tim6-tim5, len(reps)

        syr = str(year)
        smn = "%02d" % (month)
# KW changed outfile from icoards_dir to data_base_dir so that it writes to a different place to where the original 
# data are stored - don't want to mess with John's working version.
        outfile = open(data_base_dir+'/new_suite_'+syr+smn+'_'+output_suffix+'.txt', 'w')
        for rep in reps.reps:
            if rep.data['YR'] == year and rep.data['MO'] == month:
                outfile.write(rep.print_report())
        outfile.close()

        del reps

        tim11 = time.time()
        print year, " so far in ", tim11-tim00
def main(argv):
    '''
    The new track check program. First the program gets a list of all unique IDs in the month 
    that is to be track checked. It then reads in data around that month: the month 
    you want to track check plus a margin before and after it. For each unique ID, the track 
    check is run.
    
    Track check comprises a set of related tests.
    
    This program checks positional data for individual ships and buoys for internal consistency; 
    checking reported positions against positions calculated using reported speeds and directions.
    
    The obs are sorted by call-sign then date. Obs can only be checked if they have a valid call-sign 
    that is unique to one ship or buoy, so obs with no call-sign or with the generic call-signs 'SHIP' 
    or 'PLAT' are passed unchecked. The call-sign '0102' was apparently shared by several ships, so obs 
    with this call-sign are also passed unchecked.

    Command line options (taken from argv):
        -h                  print the usage line and exit
        -i, --ifile FILE    configuration file (default 'configuration.txt')
        -x, --year1 YEAR    first year to process (required, integer)
        -y, --year2 YEAR    last year to process (required, integer)

    Every month from January of year1 to December of year2 is processed, and the
    QC flags are written back to the data base and committed once per month.

    Exits with status 2 (after printing the usage line) on an unrecognised option
    or when either year is missing, and with an error message when a year is not
    an integer.
    '''
    usage = ('Usage Make_DB.py -i <configuration_file>'
             ' --year1 <start year> --year2 <end year>')

    print('###################')
    print('Running New Track Check')
    print('###################')

    inputfile = 'configuration.txt'
    # Initialised so that a missing option is reported cleanly instead of
    # surfacing later as an unbound-variable error.
    year1 = None
    year2 = None

    try:
        # 'x:' and 'y:' make the -x/-y aliases handled below reachable.
        opts, _ = getopt.getopt(argv,
                                "hi:x:y:",
                                ["ifile=",
                                 "year1=",
                                 "year2="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-x", "--year1"):
            try:
                year1 = int(arg)
            except ValueError:
                sys.exit("Failed: year1 not an integer")
        elif opt in ("-y", "--year2"):
            try:
                year2 = int(arg)
            except ValueError:
                sys.exit("Failed: year2 not an integer")

    if year1 is None or year2 is None:
        print(usage)
        sys.exit(2)

    # Single-argument print(...) with % formatting gives the same output as
    # the Python 2 print statement and also parses under Python 3.
    print('Input file is  %s' % (inputfile,))
    print('Running from  %s  to  %s' % (year1, year2))
    print('')

    config = qc.get_config(inputfile)

    data_base_host = config['data_base_host']
    data_base_name = config['data_base_name']

    print('Data base host = %s' % (data_base_host,))
    print('Data base name = %s' % (data_base_name,))

    print('')

    connection = MySQLdb.connect(host=data_base_host,
                                 user='******',
                                 db=data_base_name)

    # Close the connection even if a month fails part way through.
    try:
        # need two cursors, one for reading and one for making QC changes
        cursor = connection.cursor()
        cursor2 = connection.cursor()

        for years, months in qc.year_month_gen(year1, 1, year2, 12):

            # want to get a month either side of the target month,
            # which may be in different years
            last_year, _ = qc.last_month_was(years, months)
            next_year, _ = qc.next_month_is(years, months)

            print('%s %s' % (years, months))

            t0 = time.time()

            first_year = min([last_year, years, next_year])
            final_year = max([last_year, years, next_year])

            # Clamp the years queried to 1850-1990.
            # NOTE(review): presumably the span of yearly data held in the
            # data base - confirm before running outside this range.
            if first_year < 1850:
                first_year = 1850
            if final_year > 1990:
                final_year = 1990

            # The extraction window runs from 10 before the julian day of the
            # first of the month to 10 after that of its last day.
            month_lengths = qc.month_lengths(years)
            jul1 = qc.jul_day(years, months, 1) - 10
            jul2 = qc.jul_day(years, months, month_lengths[months - 1]) + 10

            # Get all unique IDs for this month and fill a dictionary
            # with all the distinct ids that we want to QC as keys and an
            # empty Voyage for each key
            allids = db.get_unique_ids(cursor, years, months)
            reps = {}
            for idrows in allids:
                reps[idrows[0]] = qc.Voyage()

            t1 = time.time()
            print('got all IDs  %s' % (t1 - t0,))

            # extract all data for this month and the margin either side
            for yyy in range(first_year, final_year + 1):

                # Build filter for extracting data from data base and then
                # extract. In this case, we want observations between jul1
                # and jul2 which pass the base QC checks.
                qcfilter = db.Quality_Control_Filter()
                qcfilter.jul1 = jul1
                qcfilter.jul2 = jul2
                qcfilter.set_multiple_qc_flags_to_pass(['bad_position',
                                                        'bad_date',
                                                        'blacklist'])

                sql_request = db.build_sql_query(yyy, qcfilter)

                cursor.execute(sql_request)
                numrows = cursor.rowcount

                # put each ob into the dictionary if there is a key for it
                for _ in range(numrows):
                    rows = cursor.fetchone()
                    rep = qc.ExtendedMarineReport.report_from_array(rows)
                    if rep.id in reps:
                        reps[rep.id].add_report(rep)

            t2 = time.time()
            print('read all obs from DB %s' % (t2 - t1,))

            # loop over all the distinct callsigns, extract the obs
            # where the callsign matches and track check them
            for idrows in allids:
                matches = reps[idrows[0]]
                matches.sort()

                # run improved track check with spherical geometry etc.
                # The return value is not used here; the flags written out
                # below are read from attributes of each report.
                qc_new_track_check.mds_full_track_check(matches)
                matches.find_repeated_values()

                for rep in matches.reps:
                    if rep.month == months:
                        db.update_db_qc_single_flag(rep, rep.bad_track,
                                                    'extra_qc',
                                                    'bayesian_track_check',
                                                    years, cursor2)
                        db.update_db_qc_single_flag(rep, rep.repeated_value,
                                                    'extra_qc',
                                                    'repeated_value',
                                                    years, cursor2)

                split_matches = qc.split_generic_callsign(matches)

                for split in split_matches:
                    qcs = qc_new_track_check.mds_full_track_check(split)

                    # update QC in the data base but only for the target month
                    for i, rep in enumerate(split.reps):
                        if rep.month == months:
                            db.update_db_qc_single_flag(rep,
                                                        qcs[i],
                                                        'extra_qc',
                                                        'new_track_check',
                                                        years,
                                                        cursor2)
                            db.update_db_qc_single_flag(rep,
                                                        rep.fewsome_check,
                                                        'base_qc',
                                                        'fewsome_check',
                                                        years,
                                                        cursor2)

            # one commit per processed month
            connection.commit()

            t3 = time.time()
            print('done  %s' % (t3 - t2,))

            #db.report_qc_counts(cursor, years, months)
    finally:
        connection.close()

    print("All Done :)")