Example 1
def hfr(station, var_list, full=False, plots=False, diagnostics=False):
    """
    Run through the variables and pass to the High Flag Rate Check

    :param Station station: Station Object for the station
    :param list var_list: list of variables to test
    :param bool full: run a full update (unused here)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output

    :returns: int : number of variables on which these flags have been set
    """
    vars_set = []  # Keep track of where these flags are set.

    for var in var_list:

        obs_var = getattr(station, var)

        flags, any_set = high_flag_rate(obs_var,
                                        plots=plots,
                                        diagnostics=diagnostics)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if any_set:
            vars_set += [var]

    # Now double check the list of variables where "H" flags have been set.
    #  If one of a synergistic pair is flagged, then flag the other (wind
    #  speed/direction, sea/station level pressure).
    # Using exclusive or.  This only passes if one is True and the other is False.
    if ("sea_level_pressure" in vars_set) is not ("station_level_pressure"
                                                  in vars_set):

        if "sea_level_pressure" in vars_set:
            set_synergistic_flags(station, "station_level_pressure")
        elif "station_level_pressure" in vars_set:
            set_synergistic_flags(station, "sea_level_pressure")

    if ("wind_speed" in vars_set) is not ("wind_direction" in vars_set):

        if "wind_speed" in vars_set:
            set_synergistic_flags(station, "wind_direction")
        elif "wind_direction" in vars_set:
            set_synergistic_flags(station, "wind_speed")

    # Synergistically flagged variables aren't added to vars_set, so each
    #  variable is counted once and this return is correct.
    return len(vars_set)  # hfr
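
The synergistic-pair logic above leans on "is not" acting as an exclusive-or over membership tests, which works because "in" returns the bool singletons True/False. A minimal, self-contained sketch of the pattern:

# Exclusive-or via "is not": True only when exactly one name is present.
vars_set = ["sea_level_pressure"]

only_one = (("sea_level_pressure" in vars_set)
            is not ("station_level_pressure" in vars_set))
print(only_one)  # True -- flag the partner variable

vars_set += ["station_level_pressure"]
only_one = (("sea_level_pressure" in vars_set)
            is not ("station_level_pressure" in vars_set))
print(only_one)  # False -- both flagged already, nothing to do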
Example 2
def identify_multiple_values(obs_var,
                             times,
                             config_file,
                             plots=False,
                             diagnostics=False):
    """
    Flag instances where differing values are reported at the same timestamp.

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # TODO check works with missing data (compressed?)
    # TODO monthly?

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    time_diffs = np.ma.diff(times) / np.timedelta64(1,
                                                    "m")  # presuming minutes
    value_diffs = np.ma.diff(obs_var.data)

    multiple_obs_at_time, = np.where(time_diffs == 0)
    #    if diagnostics:
    #        print("number of identical timestamps {}".format(multiple_obs_at_time.shape[0]))

    suspect_locs, = np.ma.where(value_diffs[multiple_obs_at_time] != 0)

    # set the first of the obs, then the second which make the diff
    flags[multiple_obs_at_time[suspect_locs]] = "T"
    flags[multiple_obs_at_time[suspect_locs] + 1] = "T"

    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Timestamp {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # identify_multiple_values
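
The core of the check: a zero time difference between consecutive records marks a duplicated timestamp, and a non-zero value difference at that position marks both readings as suspect. A toy illustration on made-up data (plain arrays, not the library's MetVar objects):

import numpy as np

# two readings share the 60-minute timestamp but disagree in value
times = np.array([0, 60, 60, 120, 180], dtype="timedelta64[m]")
values = np.ma.masked_array([10.0, 11.0, 13.0, 12.0, 12.5])

time_diffs = np.ma.diff(times) / np.timedelta64(1, "m")
value_diffs = np.ma.diff(values)

multiple_obs_at_time, = np.where(time_diffs == 0)
suspect, = np.ma.where(value_diffs[multiple_obs_at_time] != 0)

flags = np.array(["" for i in range(values.shape[0])])
flags[multiple_obs_at_time[suspect]] = "T"      # first of the pair
flags[multiple_obs_at_time[suspect] + 1] = "T"  # second of the pair
print(flags)  # ['' 'T' 'T' '' '']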
Example 3
def mcu(station, var_list, full=False, plots=False, diagnostics=False):
    """
    Run through the variables and pass to monthly clean up

    :param Station station: Station Object for the station
    :param list var_list: list of variables to test
    :param bool full: run a full update (unused here)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    for var in var_list:

        obs_var = getattr(station, var)

        flags = clean_up(obs_var, station, plots=plots, diagnostics=diagnostics)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    return  # mcu
Example 4
def set_synergistic_flags(station, var):
    """
    Set the flags on a synergistic variable.

    :param Station station: Station Object for the station
    :param str var: name of variable
    """
    obs_var = getattr(station, var)

    new_flags = np.array(["" for i in range(obs_var.data.shape[0])])
    old_flags = obs_var.flags
    obs_locs, = np.where(obs_var.data.mask == False)

    if obs_locs.shape[0] > 10 * utils.DATA_COUNT_THRESHOLD:
        # require sufficient observations to make a flagged fraction useful.

        # As synergistically flagged, add to all flags.
        new_flags[obs_locs] = "H"

    obs_var.flags = utils.insert_flags(obs_var.flags, new_flags)

    return  # set_synergistic_flags
Example 5
def wrc(station, var_list, full=False, plots=False, diagnostics=False):
    """
    Run through the variables and pass to the World Record Check.

    :param Station station: Station Object for the station
    :param list var_list: list of variables to test
    :param bool full: run a full update (unused here)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    for var in var_list:

        obs_var = getattr(station, var)

        flags = record_check(obs_var,
                             station.continent,
                             plots=plots,
                             diagnostics=diagnostics)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    return  # wrc
Example 6
def lc(station, var_list, full=False, plots=False, diagnostics=False):
    """
    Run through the variables and pass to the Logic Checks

    :param Station station: Station Object for the station
    :param list var_list: list of variables to test
    :param bool full: run a full update (unused here)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    # https://github.com/glamod/glamod-dm/blob/master/glamod-parser/glamod/parser/filters/observations_table.py
    # database parser has these, for future reference

    # station level (from inventory listing, not for each timestamp)
    return_code = 0
    if station.lat < -90 or station.lat > 90:
        write_logic_error(station,
                          "Bad latitude: {}".format(station.lat),
                          diagnostics=diagnostics)
        if diagnostics:
            print("Bad latitude: {}".format(station.lat))
        return_code = -1

    if station.lon < -180 or station.lon > 180:
        write_logic_error(station,
                          "Bad longtitude: {}".format(station.lon),
                          diagnostics=diagnostics)
        if diagnostics:
            print("Bad longtitude: {}".format(station.lon))
        return_code = -1

    if station.lon == 0 and station.lat == 0:
        write_logic_error(
            station,
            "Bad longtitude & latitude combination: lon={}, lat={}".format(
                station.lon, station.lat),
            diagnostics=diagnostics)
        if diagnostics:
            print("Bad longtitude/latitude: {} & {}".format(
                station.lon, station.lat))
        return_code = -1

    # Missing elevation acceptable - removed this for the moment (7 November 2019, RJHD)
    #       missing could be -999, -999.9, -999.999 or even 9999.0 etc hence using string comparison
    if (station.elev < -432.65 or station.elev > 8850.):
        if str(station.elev)[:4] not in ["-999", "9999"]:
            write_logic_error(station,
                              "Bad elevation: {}".format(station.elev),
                              diagnostics=diagnostics)
            if diagnostics:
                print("Bad elevation: {}".format(station.elev))
            return_code = -1
        else:
            if diagnostics:
                print("Missing elevation, but not flagged: {}".format(
                    station.elev))

    if station.times.iloc[0] < dt.datetime(1650, 1, 1):
        write_logic_error(station,
                          "Bad start time: {}".format(station.times.iloc[0]),
                          diagnostics=diagnostics)
        if diagnostics:
            print("Bad start time: {}".format(station.times.iloc[0]))
        return_code = -1

    elif station.times.iloc[-1] > dt.datetime.now():
        write_logic_error(station,
                          "Bad end time: {}".format(station.times.iloc[-1]),
                          diagnostics=diagnostics)
        if diagnostics:
            print("Bad end time: {}".format(station.times.iloc[-1]))
        return_code = -1

    # observation level
    for var in var_list:

        obs_var = getattr(station, var)

        flags = logic_check(obs_var, plots=plots, diagnostics=diagnostics)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    return return_code  # lc
Example 7
def identify_spikes(obs_var,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Use config_file to read in critical values, and then assess to find spikes

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # TODO check works with missing data (compressed?)
    # TODO monthly?

    masked_times = np.ma.masked_array(times, mask=obs_var.data.mask)

    time_diffs = np.ma.diff(masked_times) / np.timedelta64(
        1, "m")  # presuming minutes
    value_diffs = np.ma.diff(obs_var.data)

    if len(value_diffs.mask.shape) == 0:
        # single mask value, replace with array of True/False's
        if value_diffs.mask:
            value_diffs.mask = np.ones(value_diffs.shape)
        else:
            value_diffs.mask = np.zeros(value_diffs.shape)

    # get thresholds for each unique time differences
    unique_diffs = np.unique(time_diffs.compressed())

    # retrieve the critical values
    critical_values = {}
    for t_diff in unique_diffs:
        try:
            c_value = utils.read_qc_config(config_file,
                                           "SPIKE-{}".format(obs_var.name),
                                           "{}".format(t_diff))
            critical_values[t_diff] = float(c_value)
        except KeyError:
            # no critical value for this time difference
            pass

    # if none have been read, calculate them now in case that is why none were found
    if len(critical_values) == 0:
        get_critical_values(obs_var,
                            times,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)

        # and try again
        for t_diff in unique_diffs:
            try:
                c_value = utils.read_qc_config(config_file,
                                               "SPIKE-{}".format(obs_var.name),
                                               "{}".format(t_diff))
                critical_values[t_diff] = float(c_value)
            except KeyError:
                # no critical value for this time difference
                pass

    # pre select for each time difference that can be tested
    for t_diff in unique_diffs:
        if t_diff == 0:
            # not a spike or jump, but 2 values at the same time.
            #  should be zero value difference, so fitting histogram not going to work
            #  handled in separate test
            continue

        # new blank flag array
        flags = np.array(["" for i in range(obs_var.data.shape[0])])

        t_locs, = np.where(time_diffs == t_diff)

        try:
            c_locs, = np.where(
                np.abs(value_diffs[t_locs]) > critical_values[t_diff])
        except KeyError:
            # no critical value for this time difference
            continue  # to next loop

        # TODO - sort spikes at very beginning or very end of sequence,
        #    when don't have a departure from/return to a normal level

        # potential spikes
        for ps, possible_in_spike in enumerate(t_locs[c_locs]):
            is_spike = False

            spike_len = 1
            while spike_len <= MAX_SPIKE_LENGTH:
                # test for each possible length to see if identified
                try:
                    out_spike_t_diff = time_diffs[possible_in_spike +
                                                  spike_len]
                    possible_out_spike = value_diffs[possible_in_spike +
                                                     spike_len]
                except IndexError:
                    # got to end of data run, can't test final value at the moment
                    break

                # need to test mask/unmasked using array rather than values extracted above
                #    as if values unmasked, then no mask attribute to test!
                if time_diffs.mask[possible_in_spike + spike_len] == False and \
                        value_diffs.mask[possible_in_spike + spike_len] == False:
                    try:
                        # find critical value for time-difference of way out of spike
                        out_critical_value = critical_values[out_spike_t_diff]
                    except KeyError:
                        # don't have a value for this time difference, so use the maximum of all as a proxy
                        out_critical_value = max(critical_values.values())
                else:
                    # time or value difference masked
                    out_critical_value = max(critical_values.values())

                if np.abs(possible_out_spike) > out_critical_value:
                    # check that the signs are opposite
                    if np.sign(value_diffs[possible_in_spike]) != np.sign(
                            value_diffs[possible_in_spike + spike_len]):
                        is_spike = True
                        break

                spike_len += 1

            if is_spike and spike_len >= 1:
                # test within spike differences (choosing correct time difference)
                within = 1
                while within < spike_len:
                    within_t_diff = time_diffs[possible_in_spike + within]
                    if time_diffs.mask[possible_in_spike + within] == False:
                        try:
                            within_critical_value = critical_values[
                                within_t_diff]
                            if value_diffs[
                                    possible_in_spike +
                                    within] > within_critical_value / 2.:
                                is_spike = False
                        except KeyError:
                            # don't have a value for this time difference, so use the maximum of all as a proxy
                            within_critical_value = max(
                                critical_values.values())
                    else:
                        # time difference masked
                        within_critical_value = max(critical_values.values())

                    if value_diffs.mask[possible_in_spike + within] == False:
                        if value_diffs[possible_in_spike +
                                       within] > within_critical_value / 2.:
                            is_spike = False
                    else:
                        # if masked then no data, so can't say if it's not a spike
                        pass

                    within += 1

            if is_spike:
                # test either side (either before or after is too big)
                try:
                    before_t_diff = time_diffs[possible_in_spike - 1]
                    if time_diffs.mask[possible_in_spike - 1] == False:
                        before_critical_value = critical_values[before_t_diff]
                    else:
                        # time difference masked
                        before_critical_value = max(critical_values.values())
                except KeyError:
                    # don't have a value for this time difference, so use the maximum of all as a proxy
                    before_critical_value = max(critical_values.values())
                except IndexError:
                    # off the front of the data array
                    before_critical_value = max(critical_values.values())

                try:
                    after_t_diff = time_diffs[possible_in_spike + spike_len +
                                              1]
                    if time_diffs.mask[possible_in_spike + spike_len +
                                       1] == False:
                        after_critical_value = critical_values[after_t_diff]
                    else:
                        # time difference masked
                        after_critical_value = max(critical_values.values())
                except KeyError:
                    # don't have a value for this time difference, so use the maximum of all as a proxy
                    after_critical_value = max(critical_values.values())
                except IndexError:
                    # off the back of the data array
                    after_critical_value = max(critical_values.values())

                try:
                    if value_diffs.mask[possible_in_spike - 1] == False:
                        if value_diffs[possible_in_spike -
                                       1] > before_critical_value / 2.:
                            # before spike fails test
                            is_spike = False

                except IndexError:
                    # off the front of the data array
                    pass

                try:
                    if value_diffs.mask[possible_in_spike + spike_len +
                                        1] == False:
                        if value_diffs[possible_in_spike + spike_len +
                                       1] > after_critical_value / 2.:
                            # after spike fails test
                            is_spike = False
                except IndexError:
                    # off the back of the data array
                    pass

            # if the spike is still set, set the flags
            if is_spike:
                # "+1" because of difference arrays
                flags[possible_in_spike + 1:possible_in_spike + 1 +
                      spike_len] = "S"

                # diagnostic plots
                if plots:
                    plot_spike(times, obs_var, possible_in_spike + 1,
                               spike_len)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:

            print("Spike {}".format(obs_var.name))
            print("   Time Difference: {} minutes".format(t_diff))
            print("      Cumulative number of flags set: {}".format(
                len(np.where(flags != "")[0])))

    return  # identify_spikes
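
Stripped of the masking and configuration handling, the spike criterion reduces to: a first difference beyond the critical value, answered within MAX_SPIKE_LENGTH steps by a comparably large difference of the opposite sign. A self-contained sketch assuming a single fixed time step and an invented critical value (the real ones come from the config file):

import numpy as np

MAX_SPIKE_LENGTH = 3  # assumed, mirroring the module constant
CRITICAL = 5.0        # assumed critical value for this time step

data = np.array([10.0, 10.2, 25.0, 24.8, 10.1, 10.3])  # 2-point spike
diffs = np.diff(data)

flags = np.array(["" for i in range(data.shape[0])])
for i, d in enumerate(diffs):
    if abs(d) <= CRITICAL:
        continue  # no jump into a possible spike here
    for length in range(1, MAX_SPIKE_LENGTH + 1):
        if i + length >= len(diffs):
            break  # ran off the end of the record
        out = diffs[i + length]
        # the jump back out: comparable size, opposite sign
        if abs(out) > CRITICAL and np.sign(out) != np.sign(d):
            flags[i + 1:i + 1 + length] = "S"  # "+1" offsets the diff array
            break
print(flags)  # ['' '' 'S' 'S' '' '']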
Example 8
def neighbour_outlier(target_station,
                      initial_neighbours,
                      variable,
                      diagnostics=False,
                      plots=False,
                      full=False):
    """
    Works on a single station and variable.  Reads in the neighbours' data and finds locations where sufficiently many neighbours are sufficiently different.

    :param Station target_station: station to run on 
    :param array initial_neighbours: input neighbours (ID, distance) pairs
    :param str variable: obs variable being run on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values.
    """
    station_list = utils.get_station_list()

    # check there are sufficient neighbours
    n_neighbours = len(np.where(initial_neighbours[:, 0] != "-")[0]) - 1
    if n_neighbours < utils.MIN_NEIGHBOURS:
        print("{} has insufficient neighbours ({}<{})".format(
            target_station.id, n_neighbours, utils.MIN_NEIGHBOURS))

    else:
        #*************************
        # extract target observations
        obs_var = getattr(target_station, variable)
        flags = np.array(["" for i in range(obs_var.data.shape[0])
                          ]).astype("<U10")

        #*************************
        # read in the neighbour (buddy) data
        all_buddy_data = np.ma.zeros(
            [len(initial_neighbours[:, 0]),
             len(target_station.times)])
        all_buddy_data.mask = np.ones(all_buddy_data.shape)

        for bid, buddy_id in enumerate(initial_neighbours[:, 0]):
            if buddy_id == target_station.id:
                # first entry is self
                continue
            if buddy_id == "-":
                # end of the list of buddies
                break

            if diagnostics:
                print("{}/{} {}".format(bid, len(initial_neighbours[:, 0]),
                                        buddy_id))

            # set up station object to hold information
            buddy_idx, = np.where(station_list.id == buddy_id)
            buddy = utils.Station(buddy_id, station_list.iloc[buddy_idx].latitude.values[0], \
                                      station_list.iloc[buddy_idx].longitude.values[0], station_list.iloc[buddy_idx].elevation.values[0])

            try:
                buddy, buddy_df = io.read_station(os.path.join(
                    setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(buddy_id)),
                                                  buddy,
                                                  read_flags=True)

                buddy_var = getattr(buddy, variable)

                # apply flags
                flag_locs, = np.where(buddy_var.flags != "")
                buddy_var.data.mask[flag_locs] = True

            except OSError as e:
                # file missing, move on to next in sequence
                io.write_error(
                    target_station,
                    "File Missing (Buddy, {}) - {}".format(variable, buddy_id))
                continue
            except ValueError as e:
                # some issue in the raw file
                io.write_error(target_station,
                               "Error in input file (Buddy, {}) - {}".format(
                                   variable, buddy_id),
                               error=str(e))
                continue

            # match the timestamps of target_station and copy over
            match = np.in1d(target_station.times, buddy.times)
            match_back = np.in1d(buddy.times, target_station.times)

            if True in match and True in match_back:
                # only copy over where timestamps overlap at all!
                all_buddy_data[bid, match] = buddy_var.data[match_back]

        if diagnostics:
            print("All buddies read in")

        #*************************
        # find differences
        differences = all_buddy_data - obs_var.data

        #*************************
        # find spread of differences on monthly basis (with minimum value)
        spreads = np.ma.zeros(differences.shape)

        for month in range(1, 13):

            month_locs = np.where(target_station.months == month)

            for bid, buddy in enumerate(differences):

                if len(differences[bid, month_locs].compressed()
                       ) > utils.DATA_COUNT_THRESHOLD:

                    this_spread = utils.spread(differences[bid, month_locs])
                    spreads[bid, month_locs] = max(this_spread, MIN_SPREAD)

                else:
                    spreads[bid, month_locs] = MIN_SPREAD

        spreads.mask = np.copy(differences.mask)

        # store which entries may be sufficient to flag
        dubious = np.ma.zeros(differences.shape)
        dubious.mask = np.copy(differences.mask)

        #*************************
        # adjust for storms
        if variable in ["sea_level_pressure", "station_level_pressure"]:
            distant, = np.where(initial_neighbours[:, 1].astype(int) > 100)
            if len(distant) > 0:
                # find positive and negative differences across neighbours
                positive = np.ma.where(
                    differences[distant] > spreads[distant] * SPREAD_LIMIT)
                negative = np.ma.where(
                    differences[distant] < -spreads[distant] * SPREAD_LIMIT)

                # spin through each neighbour
                for dn, dist_neigh in enumerate(distant):

                    pos, = np.where(positive[0] == dn)
                    neg, = np.where(negative[0] == dn)

                    if len(neg) > 0:
                        ratio = len(neg) / (len(pos) + len(neg))
                        if ratio > 0.667:
                            # majority negative, only flag the positives [definitely not storms]
                            dubious[dist_neigh, positive[1][pos]] = 1

            else:
                # all stations close by so storms shouldn't affect, include all
                # note where differences exceed the spread
                dubious_locs = np.ma.where(
                    np.ma.abs(differences) > spreads * SPREAD_LIMIT)
                dubious[dubious_locs] = 1

        else:
            #*************************
            # note where differences exceed the spread [all non pressure variables]
            dubious_locs = np.ma.where(
                np.ma.abs(differences) > spreads * SPREAD_LIMIT)
            dubious[dubious_locs] = 1

        if diagnostics:
            print("cross checks complete - assessing all outcomes")
        #*************************
        # sum across neighbours
        neighbour_count = np.ma.count(differences, axis=0)
        dubious_count = np.ma.sum(dubious, axis=0)

        # flag if large enough fraction (>0.66)
        sufficient, = np.ma.where(dubious_count > 0.66 * neighbour_count)
        flags[sufficient] = "N"

        if plots:
            for flag in sufficient:
                plot_neighbour_flags(target_station.times, flag, obs_var,
                                     all_buddy_data)

        # append flags to object
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:

            print("Neighbour Outlier {}".format(obs_var.name))
            print("   Cumulative number of flags set: {}".format(
                len(np.where(flags != "")[0])))

    return  # neighbour_outlier
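
The final vote flags a timestamp with "N" when more than two-thirds of the neighbours reporting at that time disagree by more than SPREAD_LIMIT spreads. A toy sketch of just that step (invented numbers; the SPREAD_LIMIT value is an assumption):

import numpy as np

SPREAD_LIMIT = 5.0  # assumed

# rows are neighbours, columns are timestamps; one value is missing
differences = np.ma.masked_invalid([[0.1, 9.0, 0.2],
                                    [0.3, 8.5, np.nan],
                                    [0.2, 7.9, 0.1]])
spreads = np.ma.ones(differences.shape)  # stand-in monthly spreads

dubious = np.ma.zeros(differences.shape)
dubious[np.ma.where(np.ma.abs(differences) > spreads * SPREAD_LIMIT)] = 1

neighbour_count = np.ma.count(differences, axis=0)  # reporting neighbours
dubious_count = np.ma.sum(dubious, axis=0)

flags = np.array(["" for i in range(differences.shape[1])])
sufficient, = np.ma.where(dubious_count > 0.66 * neighbour_count)
flags[sufficient] = "N"
print(flags)  # ['' 'N' '']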
Example 9
def monthly_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use distribution to identify asymmetries.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])
    all_years = np.unique(station.years)

    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var,
                                              station,
                                              month,
                                              diagnostics=diagnostics)

        # read in the scaling
        try:
            climatology = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-clim".format(month)))
            spread = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_monthly_scaling(obs_var,
                                 station,
                                 config_file,
                                 diagnostics=diagnostics)
            climatology = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-clim".format(month)))
            spread = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-spread".format(month)))

        if climatology == utils.MDI and spread == utils.MDI:
            # these weren't calculable, move on
            continue

        standardised_months = (month_averages - climatology) / spread

        bins = utils.create_bins(standardised_months, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(standardised_months, bins)

        # flag months with very large offsets
        bad, = np.where(np.abs(standardised_months) >= LARGE_LIMIT)
        # now follow flag locations back up through the process
        for bad_month_id in bad:
            # year ID for this set of calendar months
            locs, = np.where(
                np.logical_and(station.months == month,
                               station.years == all_years[bad_month_id]))
            flags[locs] = "D"

        # walk distribution from centre to find asymmetry
        sort_order = standardised_months.argsort()
        mid_point = len(standardised_months) // 2
        good = True
        step = 1
        bad = []
        while good:

            if standardised_months[sort_order][
                    mid_point -
                    step] != standardised_months[sort_order][mid_point + step]:

                suspect_months = [np.abs(standardised_months[sort_order][mid_point - step]), \
                                      np.abs(standardised_months[sort_order][mid_point + step])]

                if min(suspect_months) != 0:
                    # not all clustered at origin

                    if max(suspect_months) / min(suspect_months) >= 2. and min(
                            suspect_months) >= 1.5:
                        # at least 1.5x spread from centre and difference of two in location (longer tail)
                        # flag everything further from this bin for that tail
                        if suspect_months[0] == max(suspect_months):
                            # LHS has issue (remember that have removed the sign)
                            bad = sort_order[:mid_point - (
                                step -
                                1)]  # need -1 given array indexing standards
                        elif suspect_months[1] == max(suspect_months):
                            # RHS has issue
                            bad = sort_order[mid_point + step:]
                        good = False

            step += 1
            if (mid_point - step) < 0 or (
                    mid_point + step) == standardised_months.shape[0]:
                # reached end
                break

        # now follow flag locations back up through the process
        for bad_month_id in bad:
            # year ID for this set of calendar months
            locs, = np.where(
                np.logical_and(station.months == month,
                               station.years == all_years[bad_month_id]))
            flags[locs] = "D"

        if plots:
            import matplotlib.pyplot as plt

            plt.step(bins[1:], hist, color='k', where="pre")
            if len(bad) > 0:
                bad_hist, dummy = np.histogram(standardised_months[bad], bins)
                plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.ylabel("Number of Months")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Distribution (monthly) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # monthly_gap
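
The tail walk can be isolated from the monthly bookkeeping: step outwards from the median of the sorted standardised monthly means, and once one side sits at least 1.5 from the centre and at least twice as far out as its mirror, everything beyond it on that side is treated as the bad tail. A minimal sketch on invented values (same 2x and 1.5 thresholds as above):

import numpy as np

standardised = np.array([-1.8, -1.6, -0.2, 0.0, 0.3, 3.6, 4.0])

order = standardised.argsort()
mid = len(standardised) // 2

bad = []
for step in range(1, mid + 1):
    left = abs(standardised[order][mid - step])
    right = abs(standardised[order][mid + step])
    pair = sorted([left, right])
    if pair[0] != 0 and pair[1] / pair[0] >= 2.0 and pair[0] >= 1.5:
        if right == pair[1]:
            bad = order[mid + step:]        # long right-hand tail
        else:
            bad = order[:mid - (step - 1)]  # long left-hand tail
        break
print(bad)  # [5 6] -- the two detached high months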
Example 10
def repeating_value(obs_var,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    AKA straight string

    Use config file to read threshold values.  Then find strings which exceed the threshold.

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # remove calm periods for wind speeds when (a) calculating thresholds and (b) identifying streaks
    this_var = copy.deepcopy(obs_var)
    if obs_var.name == "wind_speed":
        calms, = np.ma.where(this_var.data == 0)
        this_var.data[calms] = utils.MDI
        this_var.data.mask[calms] = True

    flags = np.array(["" for i in range(this_var.data.shape[0])])
    compressed_flags = np.array(
        ["" for i in range(this_var.data.compressed().shape[0])])

    # retrieve the threshold and store in another dictionary
    threshold = {}
    try:
        th = utils.read_qc_config(config_file,
                                  "STREAK-{}".format(this_var.name),
                                  "Straight")
        threshold["Straight"] = float(th)
    except KeyError:
        # no threshold set
        print("Threshold missing in config file")
        get_repeating_string_threshold(this_var,
                                       config_file,
                                       plots=plots,
                                       diagnostics=diagnostics)
        th = utils.read_qc_config(config_file,
                                  "STREAK-{}".format(this_var.name),
                                  "Straight")
        threshold["Straight"] = float(th)

    # only process further if there is enough data
    if len(this_var.data.compressed()) > 1:
        repeated_string_lengths, grouped_diffs, strings = prepare_data_repeating_string(
            this_var, plots=plots, diagnostics=diagnostics)

        # above threshold
        bad, = np.where(repeated_string_lengths >= threshold["Straight"])

        # flag identified strings
        for string in bad:
            start = int(np.sum(grouped_diffs[:strings[string], 1]))
            end = start + int(grouped_diffs[strings[string], 1]) + 1

            compressed_flags[start:end] = "K"

            if plots:
                plot_streak(times, this_var, start, end)

        # undo compression and write into original object (the one with calm periods)
        flags[this_var.data.mask == False] = compressed_flags
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Repeated Strings {}".format(this_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # repeating_value
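
The streak search reduces to grouping first differences into runs of zeros: a run of n-1 zero differences is a string of n identical values. A sketch on toy data using itertools.groupby instead of the library's prepare_data_repeating_string, with an assumed threshold (the real one is read from the config file):

import itertools
import numpy as np

THRESHOLD = 4  # assumed; flag strings of 4 or more identical values

data = np.array([3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 6.0])
diffs = np.diff(data)

flags = np.array(["" for i in range(data.shape[0])])
start = 0
for is_repeat, group in itertools.groupby(diffs == 0):
    length = len(list(group))
    if is_repeat and length + 1 >= THRESHOLD:
        flags[start:start + length + 1] = "K"  # n-1 zero diffs = n values
    start += length
print(flags)  # ['K' 'K' 'K' 'K' 'K' '' '']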
Example 11
def variance_check(obs_var,
                   station,
                   config_file,
                   plots=False,
                   diagnostics=False,
                   winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    # get hourly climatology for each month
    for month in range(1, 13):
        month_locs, = np.where(station.months == month)

        variances = prepare_data(obs_var,
                                 station,
                                 month,
                                 diagnostics=diagnostics,
                                 winsorize=winsorize)

        try:
            average_variance = float(
                utils.read_qc_config(config_file,
                                     "VARIANCE-{}".format(obs_var.name),
                                     "{}-average".format(month)))
            variance_spread = float(
                utils.read_qc_config(config_file,
                                     "VARIANCE-{}".format(obs_var.name),
                                     "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            average_variance = float(
                utils.read_qc_config(config_file,
                                     "VARIANCE-{}".format(obs_var.name),
                                     "{}-average".format(month)))
            variance_spread = float(
                utils.read_qc_config(config_file,
                                     "VARIANCE-{}".format(obs_var.name),
                                     "{}-spread".format(month)))

        if average_variance == utils.MDI and variance_spread == utils.MDI:
            # couldn't be calculated, move on
            continue

        bad_years, = np.where(
            np.abs(variances - average_variance) /
            variance_spread > SPREAD_THRESHOLD)

        # prepare wind and pressure data in case needed to check for storms
        if obs_var.name in [
                "station_level_pressure", "sea_level_pressure", "wind_speed"
        ]:
            wind_monthly_data = station.wind_speed.data[month_locs]
            if obs_var.name in [
                    "station_level_pressure", "sea_level_pressure"
            ]:
                pressure_monthly_data = obs_var.data[month_locs]
            else:
                pressure_monthly_data = station.sea_level_pressure.data[
                    month_locs]

            if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                    len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                # need sufficient data to work with for storm check to work, else can't tell
                #    move on
                continue

            wind_average = utils.average(wind_monthly_data)
            wind_spread = utils.spread(wind_monthly_data)

            pressure_average = utils.average(pressure_monthly_data)
            pressure_spread = utils.spread(pressure_monthly_data)

        # go through each bad year for this month
        all_years = np.unique(station.years)
        for year in bad_years:

            # corresponding locations
            ym_locs, = np.where(
                np.logical_and(station.months == month,
                               station.years == all_years[year]))

            # if pressure or wind speed, need to do some further checking before applying flags
            if obs_var.name in [
                    "station_level_pressure", "sea_level_pressure",
                    "wind_speed"
            ]:

                # pull out the data
                wind_data = station.wind_speed.data[ym_locs]
                if obs_var.name in [
                        "station_level_pressure", "sea_level_pressure"
                ]:
                    pressure_data = obs_var.data[ym_locs]
                else:
                    pressure_data = station.sea_level_pressure.data[ym_locs]

                # need sufficient data to work with for storm check to work, else can't tell
                if len(pressure_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                        len(wind_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                    # move on
                    continue

                # find locations of high wind speeds and low pressures, cross match
                high_winds, = np.ma.where(
                    (wind_data - wind_average) / wind_spread > STORM_THRESHOLD)
                low_pressures, = np.ma.where(
                    (pressure_average - pressure_data) /
                    pressure_spread > STORM_THRESHOLD)

                match = np.in1d(high_winds, low_pressures)

                couldbe_storm = False
                if len(match) > 0:
                    # this could be a storm, either at tropical station (relatively constant pressure)
                    # or out of season in mid-latitudes.
                    couldbe_storm = True

                if obs_var.name in [
                        "station_level_pressure", "sea_level_pressure"
                ]:
                    diffs = np.ma.diff(pressure_data)
                elif obs_var.name == "wind_speed":
                    diffs = np.ma.diff(wind_data)

                # count up the largest number of sequential negative and positive differences
                negs, poss = 0, 0
                biggest_neg, biggest_pos = 0, 0

                for diff in diffs:

                    if diff > 0:
                        if negs > biggest_neg: biggest_neg = negs
                        negs = 0
                        poss += 1
                    else:
                        if poss > biggest_pos: biggest_pos = poss
                        poss = 0
                        negs += 1

                if (biggest_neg < 10) and (biggest_pos <
                                           10) and not couldbe_storm:
                    # insufficient to identify as a storm (HadISD values)
                    # leave flags set
                    pass
                else:
                    # could be a storm, so better to leave this month unflagged
                    # zero length array to flag
                    ym_locs = np.ma.array([])

            # copy over the flags, if any
            if len(ym_locs) != 0:
                # and set the flags
                flags[ym_locs] = "V"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            scaled_variances = ((variances - average_variance) /
                                variance_spread)
            bins = utils.create_bins(scaled_variances, 0.25, obs_var.name)
            hist, bin_edges = np.histogram(scaled_variances, bins)

            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Months")
            plt.xlabel("Scaled {} Variances".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(SPREAD_THRESHOLD, c="r")
            plt.axvline(-SPREAD_THRESHOLD, c="r")

            bad_hist, dummy = np.histogram(scaled_variances[bad_years], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Variance {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # variance_check
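
The storm heuristic above counts the longest consecutive runs of falling and rising values in the pressure (or wind) differences. A standalone sketch of that counter on a toy series; note this version also closes out a run that is still open at the end of the series, which the in-line loop above leaves uncounted:

import numpy as np

# steady fall then steady rise, as a passing storm might produce
diffs = np.diff([1000.0, 998.0, 995.0, 990.0, 992.0, 995.0, 999.0])

negs, poss = 0, 0
biggest_neg, biggest_pos = 0, 0
for diff in diffs:
    if diff > 0:
        biggest_neg = max(biggest_neg, negs)
        negs = 0
        poss += 1
    else:
        biggest_pos = max(biggest_pos, poss)
        poss = 0
        negs += 1
# pick up any run still open at the end
biggest_neg = max(biggest_neg, negs)
biggest_pos = max(biggest_pos, poss)
print(biggest_neg, biggest_pos)  # 3 3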
Example 12
def frequent_values(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Use config file to read frequent values.  Check each month to see if they appear.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    all_years = np.unique(station.years)

    # work through each month, and then year
    for month in range(1, 13):

        # read in bin-width and suspect bins for this month
        try:
            width = float(
                utils.read_qc_config(config_file,
                                     "FREQUENT-{}".format(obs_var.name),
                                     "width"))
            suspect_bins = utils.read_qc_config(config_file,
                                                "FREQUENT-{}".format(
                                                    obs_var.name),
                                                "{}".format(month),
                                                islist=True)
        except KeyError:
            print("Information missing in config file")
            identify_values(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            width = float(
                utils.read_qc_config(config_file,
                                     "FREQUENT-{}".format(obs_var.name),
                                     "width"))
            suspect_bins = utils.read_qc_config(config_file,
                                                "FREQUENT-{}".format(
                                                    obs_var.name),
                                                "{}".format(month),
                                                islist=True)

        # skip on if nothing to find
        if len(suspect_bins) == 0:
            continue

        # work through each year
        for year in all_years:
            locs, = np.where(
                np.logical_and(station.months == month, station.years == year))

            month_data = obs_var.data[locs]

            # skip if no data
            if np.ma.count(month_data) == 0:
                continue

            month_flags = np.array(["" for i in range(month_data.shape[0])])

            # adjust bin widths according to reporting accuracy
            resolution = utils.reporting_accuracy(month_data)

            if resolution <= 0.5:
                bins = utils.create_bins(month_data, 0.5, obs_var.name)
            else:
                bins = utils.create_bins(month_data, 1.0, obs_var.name)
            hist, bin_edges = np.histogram(month_data, bins)

            # Scan through the histogram
            #   check if a bin is the maximum of a local area ("ROLLING")
            for b, bar in enumerate(hist):
                if (b > ROLLING // 2) and (b <= (len(hist) - ROLLING // 2)):

                    target_bins = hist[b - (ROLLING // 2):b + (ROLLING // 2) +
                                       1]

                    # if sufficient obs, maximum and contains > 50% of data
                    if bar >= utils.DATA_COUNT_THRESHOLD:
                        if bar == target_bins.max():
                            if (bar / target_bins.sum()) > RATIO:
                                # this bin meets all the criteria
                                if bins[b] in suspect_bins:
                                    # find observations (month & year) to flag!
                                    flag_locs = np.where(
                                        np.logical_and(
                                            month_data >= bins[b],
                                            month_data < bins[b + 1]))
                                    month_flags[flag_locs] = "F"

            # copy flags for all years into main array
            flags[locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            bad_hist = np.copy(hist)
            for b, bar in enumerate(bad_hist):
                if bins[b] not in suspect_bins:
                    bad_hist[b] = 0

            plt.step(bins[1:], bad_hist, color='r', where="pre")
            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Frequent Values {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # frequent_values
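
The histogram scan can be sketched on its own: a bin is a candidate when it holds at least DATA_COUNT_THRESHOLD observations, is the maximum of its ROLLING-bin window, and contains more than RATIO of the window's counts. The constants below are assumptions chosen to match the shape of the code, not the module's settings:

import numpy as np

ROLLING = 7                 # assumed window width
RATIO = 0.5                 # assumed "> 50% of window" criterion
DATA_COUNT_THRESHOLD = 120  # assumed minimum bin population

hist = np.array([5, 8, 6, 7, 9, 500, 10, 6, 7, 5, 8])

for b, bar in enumerate(hist):
    if ROLLING // 2 < b <= len(hist) - ROLLING // 2:
        window = hist[b - ROLLING // 2:b + ROLLING // 2 + 1]
        if (bar >= DATA_COUNT_THRESHOLD
                and bar == window.max()
                and bar / window.sum() > RATIO):
            print("suspect bin index:", b)  # 5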
Example 13
def pressure_theory(sealp,
                    stnlp,
                    temperature,
                    times,
                    elevation,
                    plots=False,
                    diagnostics=False):
    """
    Flag locations where difference between recorded and calculated sea-level pressure 
    falls outside of bounds

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param MetVar temperature: temperature object
    :param array times: datetime array
    :param float elevation: station elevation (m)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(sealp.data.shape[0])])

    theoretical_value = calc_slp(stnlp.data, elevation, temperature.data)

    difference = sealp.data - theoretical_value

    bad_locs, = np.ma.where(np.ma.abs(difference) > THEORY_THRESHOLD)

    # diagnostic plots
    if plots:
        bins = np.arange(
            np.round(np.ma.min(difference)) - 1,
            np.round(np.ma.max(difference)) + 1, 0.1)
        import matplotlib.pyplot as plt
        plt.clf()
        plt.hist(difference.compressed(), bins=bins)
        plt.axvline(x=THEORY_THRESHOLD, ls="--", c="r")
        plt.axvline(x=-THEORY_THRESHOLD, ls="--", c="r")
        plt.xlim([bins[0] - 1, bins[-1] + 1])
        plt.ylabel("Observations")
        plt.xlabel("Difference (hPa)")
        plt.show()

    if len(bad_locs) != 0:
        flags[bad_locs] = "p"
        if diagnostics:
            print("Pressure".format(stnlp.name))
            print(
                "   Number of mismatches between recorded and theoretical SLPs {}"
                .format(len(bad_locs)))
        if plots:
            for bad in bad_locs:
                plot_pressure(sealp, stnlp, times, bad)

    def adjust_preexisting_locs(var, flags):
        # may have flags already set by a previous part of the test;
        # find these locations and blank the new flags there so they
        # aren't added again
        pre_exist = [i for i, item in enumerate(var.flags) if "p" in item]
        new_flags = np.copy(flags)  # copy, so the shared array isn't altered
        new_flags[pre_exist] = ""

        return new_flags

    # flag both, as it's not immediately clear where the issue lies
    stnlp.flags = utils.insert_flags(stnlp.flags,
                                     adjust_preexisting_locs(stnlp, flags))
    sealp.flags = utils.insert_flags(sealp.flags,
                                     adjust_preexisting_locs(sealp, flags))

    if diagnostics:

        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # pressure_theory
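
calc_slp itself is not shown in this excerpt. One common reduction it may resemble is the hypsometric form with a 6.5 K/km lapse rate, sketched below; this is an assumption about its implementation, not the library's code:

def calc_slp_sketch(stnlp_hpa, elevation_m, temperature_c):
    """Reduce station-level pressure (hPa) to sea level.

    Assumed hypsometric form, not necessarily the library's calc_slp."""
    t_k = temperature_c + 273.15
    return stnlp_hpa * (1.0 - (0.0065 * elevation_m) /
                        (t_k + 0.0065 * elevation_m)) ** -5.257

# a station 300 m up reading 980 hPa at 15 C reduces to about 1015 hPa
print(round(calc_slp_sketch(980.0, 300.0, 15.0), 1))  # 1015.4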
Example 14
def pressure_offset(sealp,
                    stnlp,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Flag locations where difference between station and sea-level pressure
    falls outside of bounds

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(sealp.data.shape[0])])

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        try:
            average = float(
                utils.read_qc_config(config_file, "PRESSURE", "average"))
            spread = float(
                utils.read_qc_config(config_file, "PRESSURE", "spread"))
        except KeyError:
            print("Information missing in config file")
            average = utils.average(difference)
            spread = utils.spread(difference)
            if spread < MIN_SPREAD:  # less than XhPa
                spread = MIN_SPREAD
            elif spread > MAX_SPREAD:  # more than XhPa
                spread = MAX_SPREAD

            utils.write_qc_config(config_file,
                                  "PRESSURE",
                                  "average",
                                  "{}".format(average),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file,
                                  "PRESSURE",
                                  "spread",
                                  "{}".format(spread),
                                  diagnostics=diagnostics)

        if np.abs(np.ma.mean(difference) -
                  np.ma.median(difference)) > THRESHOLD * spread:
            if diagnostics:
                print("Large difference between mean and median")
                print("Likely to have two populations of roughly equal size")
                print("Test won't work")
            pass
        else:
            high, = np.ma.where(difference > (average + (THRESHOLD * spread)))
            low, = np.ma.where(difference < (average - (THRESHOLD * spread)))

            # diagnostic plots
            if plots:
                bins = np.arange(
                    np.round(difference.min()) - 1,
                    np.round(difference.max()) + 1, 0.1)
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(difference.compressed(), bins=bins)
                plt.axvline(x=(average + (THRESHOLD * spread)), ls="--", c="r")
                plt.axvline(x=(average - (THRESHOLD * spread)), ls="--", c="r")
                plt.xlim([bins[0] - 1, bins[-1] + 1])
                plt.ylabel("Observations")
                plt.xlabel("Difference (hPa)")
                plt.show()

            if len(high) != 0:
                flags[high] = "p"
                if diagnostics:
                    print("Pressure".format(stnlp.name))
                    print("   Number of high differences {}".format(len(high)))
                if plots:
                    for bad in high:
                        plot_pressure(sealp, stnlp, times, bad)

            if len(low) != 0:
                flags[low] = "p"
                if diagnostics:
                    print("   Number of low differences {}".format(len(low)))
                if plots:
                    for bad in low:
                        plot_pressure(sealp, stnlp, times, bad)

            # only flag the station level pressure
            stnlp.flags = utils.insert_flags(stnlp.flags, flags)

    if diagnostics:

        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # pressure_offset
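
The mean-versus-median guard is worth unpacking: with two populations of pressure differences of roughly (but not exactly) equal size — for example after an unrecorded change in how station pressure was reported — the mean sits between the clusters while the median stays inside the larger one, so a large gap signals bimodality and the single-threshold test is skipped. A toy demonstration (THRESHOLD value assumed):

import numpy as np

THRESHOLD = 5.0  # assumed

rng = np.random.default_rng(0)
one_population = rng.normal(33.0, 0.5, 500)
two_populations = np.concatenate([rng.normal(0.0, 0.5, 300),
                                  rng.normal(33.0, 0.5, 200)])

for diffs in (one_population, two_populations):
    spread = 1.0  # stand-in for utils.spread(difference)
    gap = abs(diffs.mean() - np.median(diffs))
    print("skip test" if gap > THRESHOLD * spread else "run test")
# prints "run test" then "skip test"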
Example 15
def monthly_clim(obs_var,
                 station,
                 config_file,
                 logfile="",
                 plots=False,
                 diagnostics=False,
                 winsorize=True):
    """
    Use the distribution of normalised anomalies to find gaps and flag climatological outliers.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param str logfile: string for log file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        month_locs, = np.where(station.months == month)

        # note these are for the whole record, just this month is unmasked
        normalised_anomalies = prepare_data(obs_var,
                                            station,
                                            month,
                                            diagnostics=diagnostics,
                                            winsorize=winsorize)

        if len(normalised_anomalies.compressed()) >= utils.DATA_COUNT_THRESHOLD:

            bins = utils.create_bins(normalised_anomalies, BIN_WIDTH,
                                     obs_var.name)
            hist, bin_edges = np.histogram(normalised_anomalies.compressed(),
                                           bins)

            try:
                upper_threshold = float(
                    utils.read_qc_config(
                        config_file, "CLIMATOLOGICAL-{}".format(obs_var.name),
                        "{}-uthresh".format(month)))
                lower_threshold = float(
                    utils.read_qc_config(
                        config_file, "CLIMATOLOGICAL-{}".format(obs_var.name),
                        "{}-lthresh".format(month)))
            except KeyError:
                print("Information missing in config file")
                find_month_thresholds(obs_var,
                                      station,
                                      config_file,
                                      plots=plots,
                                      diagnostics=diagnostics)
                upper_threshold = float(
                    utils.read_qc_config(
                        config_file, "CLIMATOLOGICAL-{}".format(obs_var.name),
                        "{}-uthresh".format(month)))
                lower_threshold = float(
                    utils.read_qc_config(
                        config_file, "CLIMATOLOGICAL-{}".format(obs_var.name),
                        "{}-lthresh".format(month)))

            # now to find the gaps
            uppercount = len(
                np.where(normalised_anomalies > upper_threshold)[0])
            lowercount = len(
                np.where(normalised_anomalies < lower_threshold)[0])

            if uppercount > 0:
                gap_start = utils.find_gap(hist, bins, upper_threshold,
                                           GAP_SIZE)

                if gap_start != 0:
                    bad_locs, = np.ma.where(
                        normalised_anomalies >
                        gap_start)  # all years for one month

                    # normalised_anomalies are for the whole record, just this month is unmasked
                    flags[bad_locs] = "C"

            if lowercount > 0:
                gap_start = utils.find_gap(hist,
                                           bins,
                                           lower_threshold,
                                           GAP_SIZE,
                                           upwards=False)

                if gap_start != 0:
                    bad_locs, = np.ma.where(
                        normalised_anomalies <
                        gap_start)  # all years for one month

                    flags[bad_locs] = "C"

            # diagnostic plots
            if plots:
                import matplotlib.pyplot as plt
                plt.clf()
                plt.step(bins[1:], hist, color='k', where="pre")
                plt.yscale("log")

                plt.ylabel("Number of Observations")
                plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
                plt.title("{} - month {}".format(station.id, month))

                plt.ylim([0.1, max(hist) * 2])
                plt.axvline(upper_threshold, c="r")
                plt.axvline(lower_threshold, c="r")

                bad_locs, = np.where(flags[month_locs] == "C")
                bad_hist, dummy = np.histogram(
                    normalised_anomalies[month_locs][bad_locs], bins)
                plt.step(bins[1:], bad_hist, color='r', where="pre")

                plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Climatological {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # monthly_clim
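
utils.find_gap does the heavy lifting above; a simplified stand-in (assumed GAP_SIZE and BIN_WIDTH values, hand-made anomalies, all hypothetical) sketches the idea of walking up the histogram from the threshold until a run of empty bins appears, beyond which observations pick up the "C" flag.

import numpy as np

GAP_SIZE = 2     # assumed: consecutive empty bins that define a gap
BIN_WIDTH = 0.5  # assumed width for the scaled-anomaly bins

def find_gap_upwards(hist, bins, threshold):
    # simplified stand-in for utils.find_gap: walk up the bins from the
    #  threshold and return the left edge of the first run of GAP_SIZE
    #  empty bins (0 if no gap is found)
    start, = np.where(bins[:-1] >= threshold)
    empty = 0
    for b in range(start[0], len(hist)):
        if hist[b] == 0:
            empty += 1
            if empty == GAP_SIZE:
                return bins[b - empty + 1]
        else:
            empty = 0
    return 0

anomalies = np.array([-1.2, -0.4, 0.1, 0.3, 0.8, 1.1, 6.4, 6.6])
bins = np.arange(-5, 8 + BIN_WIDTH, BIN_WIDTH)
hist, _ = np.histogram(anomalies, bins)

gap_start = find_gap_upwards(hist, bins, threshold=4.0)
print(gap_start)                        # 4.0, left edge of the gap
print(np.where(anomalies > gap_start))  # the two outliers would be flagged
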
Example 16
def all_obs_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for month and find secondary populations in distribution.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var,
                                                station,
                                                month,
                                                config_file,
                                                full=False,
                                                diagnostics=diagnostics)

        if (len(normalised_anomalies.compressed()) == 1
                and normalised_anomalies[0] == utils.MDI):
            # no data to work with for this month, move on.
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies, bins)

        try:
            upper_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-uthresh".format(month)))
            lower_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-lthresh".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            upper_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-uthresh".format(month)))
            lower_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-lthresh".format(month)))

        if upper_threshold == utils.MDI and lower_threshold == utils.MDI:
            # these weren't able to be calculated, move on
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            continue

        # now to find the gaps
        uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
        lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

        month_locs, = np.where(
            station.months == month)  # append should keep year order
        if uppercount > 0:
            gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies >
                                        gap_start)  # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"
                flags[month_locs] = month_flags

        if lowercount > 0:
            gap_start = utils.find_gap(hist,
                                       bins,
                                       lower_threshold,
                                       GAP_SIZE,
                                       upwards=False)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies <
                                        gap_start)  # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"

                # TODO - can this bit be refactored?
                # for pressure data, see if the flagged obs correspond with high winds
                # could be a storm signal
                if obs_var.name in [
                        "station_level_pressure", "sea_level_pressure"
                ]:
                    wind_monthly_data = prepare_monthly_data(
                        station.wind_speed, station, month)
                    pressure_monthly_data = prepare_monthly_data(
                        obs_var, station, month)

                    if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                            len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                        # need sufficient data to work with for storm check to work, else can't tell
                        pass
                    else:

                        wind_monthly_average = utils.average(wind_monthly_data)
                        wind_monthly_spread = utils.spread(wind_monthly_data)

                        pressure_monthly_average = utils.average(
                            pressure_monthly_data)
                        pressure_monthly_spread = utils.spread(
                            pressure_monthly_data)

                        # already a single calendar month, so go through each year
                        all_years = np.unique(station.years)
                        for year in all_years:

                            # what's best - extract only when necessary but repeatedly if so, or always, but once
                            this_year_locs, = np.where(
                                station.years[month_locs] == year)

                            if "d" not in month_flags[this_year_locs]:
                                # skip if you get the chance
                                continue

                            wind_data = station.wind_speed.data[month_locs][
                                this_year_locs]
                            pressure_data = obs_var.data[month_locs][
                                this_year_locs]

                            storms, = np.ma.where(
                                np.logical_and(
                                    (((wind_data - wind_monthly_average) /
                                      wind_monthly_spread) > STORM_THRESHOLD),
                                    (((pressure_monthly_average - pressure_data
                                       ) / pressure_monthly_spread) >
                                     STORM_THRESHOLD)))

                            # more than one entry - check if separate events
                            if len(storms) >= 2:
                                # find where separation more than the usual obs separation
                                storm_1diffs = np.ma.diff(storms)
                                separations, = np.where(
                                    storm_1diffs > np.ma.median(
                                        np.ma.diff(wind_data)))

                                if len(separations) != 0:
                                    # multiple storm signals
                                    storm_start = 0
                                    storm_finish = separations[0] + 1
                                    first_storm = expand_around_storms(
                                        storms[storm_start:storm_finish],
                                        len(wind_data))
                                    final_storm_locs = copy.deepcopy(
                                        first_storm)

                                    for j in range(len(separations)):
                                        # then do the rest in a loop

                                        if j + 1 == len(separations):
                                            # final one
                                            this_storm = expand_around_storms(
                                                storms[separations[j] + 1:],
                                                len(wind_data))
                                        else:
                                            this_storm = expand_around_storms(
                                                storms[separations[j] +
                                                       1:separations[j + 1] +
                                                       1], len(wind_data))

                                        final_storm_locs = np.append(
                                            final_storm_locs, this_storm)

                                else:
                                    # locations separated at same interval as data
                                    final_storm_locs = expand_around_storms(
                                        storms, len(wind_data))

                            # single entry
                            elif len(storms) != 0:
                                # expand around the storm signal (rather than
                                #  just unflagging what could be the peak and
                                #  leaving the entry/exit flagged)
                                final_storm_locs = expand_around_storms(
                                    storms, len(wind_data))

                            # unset the flags: copy the year's slice out,
                            #  modify it and write it back, as chained fancy
                            #  indexing would assign into a temporary copy
                            if len(storms) > 0:
                                year_flags = month_flags[this_year_locs]
                                year_flags[final_storm_locs] = ""
                                month_flags[this_year_locs] = year_flags

                # having checked for storms now store final flags
                flags[month_locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])

            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")

            bad_locs, = np.where(flags[month_locs] == "d")
            bad_hist, dummy = np.histogram(normalised_anomalies[bad_locs],
                                           bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Distribution (all) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # all_obs_gap
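
One NumPy wrinkle worth noting in the storm unflagging above: indexing with an integer array returns a copy, so a chained assignment such as month_flags[this_year_locs][final_storm_locs] = "" is applied to a temporary and silently lost. A minimal sketch of the copy-out/write-back pattern (hypothetical flag values):

import numpy as np

flags = np.array(["d", "", "d", "d", ""])
year_locs, = np.where(flags == "d")  # e.g. one year's flagged positions
storm_locs = np.array([1, 2])        # positions within that slice to unset

# WRONG: flags[year_locs] is a copy, so the assignment is silently lost
flags[year_locs][storm_locs] = ""
print(flags)  # ['d' '' 'd' 'd' ''] -- unchanged

# RIGHT: copy the slice out, modify it, write it back
year_flags = flags[year_locs]
year_flags[storm_locs] = ""
flags[year_locs] = year_flags
print(flags)  # ['d' '' '' '' '']
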
Example 17
def diurnal_cycle_check(obs_var, station, config_file, plots=False, diagnostics=False, best_fit_diurnal=None, best_fit_uncertainty=None):
    """
    Use offset to find days where cycle doesn't match

    :param MetVar obs_var: Meteorological Variable object
    :param Station station: Station Object for the station
    :param str configfile: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    diurnal_offset = int(utils.read_qc_config(config_file, "DIURNAL-{}".format(obs_var.name), "peak"))

    # roll the 24-hour wheel so that the overall diurnal peak sits at position 11
    hours = np.roll(np.arange(24), 11 - diurnal_offset)

    if diurnal_offset != MISSING:

        if (best_fit_diurnal is None) and (best_fit_uncertainty is None):
            best_fit_diurnal, best_fit_uncertainty = prepare_data(station, obs_var)

        # find locations where the overall best fit does not match the daily fit
        potentially_spurious = np.ones(best_fit_diurnal.shape[0])*MISSING

        for d, (fit, uncertainty) in enumerate(zip(best_fit_diurnal, best_fit_uncertainty)):
            if fit != MISSING:
                min_range = 11 - uncertainty
                max_range = 11 + uncertainty
                
                offset_loc, = np.where(hours == fit)
                
                # find where the best fit falls outside the range for this particular day
                if offset_loc < min_range or offset_loc > max_range:
                    potentially_spurious[d] = 1
                else:
                    potentially_spurious[d] = 0

        # now check there are sufficient issues in running 30 day periods:
        #  any period of more than 30 days where the diurnal cycle deviates
        #  from the expected phase by more than this uncertainty, without
        #  three consecutive good or missing days or six consecutive days
        #  consisting of a mix of only good or missing values, is deemed
        #  dubious and the entire period of data (including all
        #  non-temperature elements) is flagged

        n_good = 0
        n_miss = 0
        n_not_bad = 0
        total_points = 0
        total_not_miss = 0
        bad_locs = np.zeros(best_fit_diurnal.shape[0])

        for d in range(best_fit_diurnal.shape[0]):

            if potentially_spurious[d] == 1:
                # if bad, just add one
                n_good = 0
                n_miss = 0
                n_not_bad = 0
                total_points += 1
                total_not_miss += 1

            else:
                # find a non-bad value - so check previous run
                #  if have reached limits on good/missing
                if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):
                    # sufficient good, missing or not-bad data to end the run
                    if total_points >= 30:
                        # if have collected enough others, then set flag
                        if float(total_not_miss)/total_points >= 0.5:
                            bad_locs[d - total_points : d] = 1
                    # reset counters
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points = 0 
                    total_not_miss = 0

                # and deal with this point
                total_points += 1
                if potentially_spurious[d] == 0:
                    # if good
                    n_good += 1
                    n_not_bad += 1
                    if n_miss != 0:
                        n_miss = 0
                    total_not_miss += 1

                elif potentially_spurious[d] == MISSING:
                    # if missing data
                    n_miss += 1
                    n_not_bad += 1
                    if n_good != 0:
                        n_good = 0

        # run through all days
        # find zero point of day counter in data preparation part
        day_counter_start = dt.datetime(np.unique(station.years)[0], np.unique(station.months)[0], np.unique(station.days)[0])

        # find the bad days in the times array
        bad_days, = np.where(bad_locs == 1)
        for day in bad_days:

            this_day = day_counter_start + dt.timedelta(days=int(day))

            locs, = np.where(np.logical_and.reduce((station.years == this_day.year, station.months == this_day.month, station.days == this_day.day)))

            flags[locs] = "U"

        # append flags to object
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:

            print("Diurnal Check {}".format(obs_var.name))
            print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    else:
        if diagnostics:
            print("Diurnal fit not found")

    return # diurnal_cycle_check
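
A minimal sketch of the window test above (synthetic daily fits, an assumed MISSING sentinel and hypothetical uncertainty values): the 24-hour wheel is rolled so the station's overall peak sits at position 11, and a day becomes potentially spurious when its own best-fit hour lands outside 11 +/- its uncertainty.

import numpy as np

MISSING = -999  # assumed sentinel, matching the module's usage

diurnal_offset = 15  # overall best-fit peak hour for the station
hours = np.roll(np.arange(24), 11 - diurnal_offset)

daily_fits = np.array([15, 16, 3, MISSING])  # per-day best-fit peak hour
uncertainties = np.array([1, 1, 1, 1])       # per-day fit uncertainty

for fit, unc in zip(daily_fits, uncertainties):
    if fit == MISSING:
        print("day skipped: no fit")
        continue
    offset_loc, = np.where(hours == fit)
    suspect = offset_loc[0] < 11 - unc or offset_loc[0] > 11 + unc
    print("fit {:3d} -> position {:2d}, suspect: {}".format(
        fit, offset_loc[0], suspect))
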
Example 18
def flag_clusters(obs_var, station, plots=False, diagnostics=False):
    """
    Go through the clusters of data and flag if meet requirements

    :param MetVar obs_var: meteorological variable object
    :param Station station: Station Object for the station
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    time_differences = np.diff(station.times) / np.timedelta64(1, "m")

    potential_cluster_ends, = np.where(time_differences >= MIN_SEPARATION * 60)

    # TODO - need explicit checks for start and end of timeseries
    for ce, cluster_end in enumerate(potential_cluster_ends):

        if ce == 0:
            # check if cluster at start of series (long gap after a first few points)
            cluster_length = station.times.iloc[
                cluster_end] - station.times.iloc[0]
            if cluster_length.asm8 / np.timedelta64(1, "h") < MAX_LENGTH_TIME:
                # could be a cluster
                if len(flags[:cluster_end + 1]) < MAX_LENGTH_OBS:
                    flags[:cluster_end + 1] = "o"

                    if plots:
                        plot_cluster(station, obs_var, 0, cluster_end + 1)

        elif ce == len(potential_cluster_ends) - 1:

            # check if cluster at end of series (long gap before last few points)
            cluster_length = station.times.iloc[-1] - station.times.iloc[
                cluster_end + 1]  # add one to find cluster start!
            if cluster_length.asm8 / np.timedelta64(1, "h") < MAX_LENGTH_TIME:
                # could be a cluster
                if len(flags[cluster_end + 1:]) < MAX_LENGTH_OBS:
                    flags[cluster_end + 1:] = "o"

                    if plots:
                        plot_cluster(station, obs_var, cluster_end + 1, -1)

        if ce > 0:
            # check for cluster within the series (this also runs for the
            #  final gap, so the cluster preceding it is still assessed).
            #  use previous gap > MIN_SEPARATION to define cluster and check length
            cluster_length = station.times.iloc[
                cluster_end] - station.times.iloc[potential_cluster_ends[
                    ce - 1] + 1]  # add one to find cluster start!
            if cluster_length.asm8 / np.timedelta64(1, "h") < MAX_LENGTH_TIME:
                # could be a cluster
                if len(flags[potential_cluster_ends[ce - 1] + 1:cluster_end +
                             1]) < MAX_LENGTH_OBS:
                    flags[potential_cluster_ends[ce - 1] + 1:cluster_end +
                          1] = "o"

                    if plots:
                        plot_cluster(station, obs_var,
                                     potential_cluster_ends[ce - 1] + 1,
                                     cluster_end + 1)

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Odd Cluster {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # flag_clusters
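
A small usage-style sketch of the gap logic above (assumed constant values; a plain datetime64 array standing in for the station's pandas time index): gaps of at least MIN_SEPARATION hours mark potential cluster boundaries, and a short, sparse chunk between two such gaps would be flagged "o".

import numpy as np

MIN_SEPARATION = 6    # hours; assumed value
MAX_LENGTH_TIME = 12  # hours; assumed value
MAX_LENGTH_OBS = 10   # observations; assumed value

# hourly record with a 3-observation chunk isolated by ~22 h gaps
times = np.array(["2000-01-01T00", "2000-01-01T01", "2000-01-01T02",
                  "2000-01-02T00", "2000-01-02T01", "2000-01-02T02",
                  "2000-01-03T00"], dtype="datetime64[h]")

time_differences = np.diff(times) / np.timedelta64(1, "m")
cluster_ends, = np.where(time_differences >= MIN_SEPARATION * 60)
print(cluster_ends)  # [2 5]: a long gap follows each of these indices

# the chunk between the two gaps: short enough in time and obs to flag
start, end = cluster_ends[0] + 1, cluster_ends[1]
length = (times[end] - times[start]) / np.timedelta64(1, "h")
if length < MAX_LENGTH_TIME and (end - start + 1) < MAX_LENGTH_OBS:
    print("flag obs {}..{} as odd cluster 'o'".format(start, end))
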