コード例 #1
0
ファイル: diurnal.py プロジェクト: glamod/glamod_landQC
def diurnal_cycle_check(obs_var, station, config_file, plots=False, diagnostics=False, best_fit_diurnal=None, best_fit_uncertainty=None):
    """
    Use offset to find days where the diurnal cycle doesn't match the expected phase.

    The expected phase ("peak") is read from the "DIURNAL-<variable name>" section
    of the config file.  Each day's best-fit offset is compared against it, allowing
    for that day's uncertainty, and sufficiently long runs of mismatching days are
    flagged with "U" on every observation falling on those days.

    :param MetVar obs_var: Meteorological Variable object
    :param Station station: Station Object for the station
    :param str config_file: string for configuration file
    :param bool plots: turn on plots (not used by this routine)
    :param bool diagnostics: turn on diagnostic output
    :param array best_fit_diurnal: best-fit offset for each day; obtained from
        prepare_data() when both this and best_fit_uncertainty are None
    :param array best_fit_uncertainty: uncertainty of the best fit for each day
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    diurnal_offset = int(utils.read_qc_config(config_file, "DIURNAL-{}".format(obs_var.name), "peak"))

    if diurnal_offset == MISSING:
        # no overall fit available for this station/variable, so nothing to test
        if diagnostics:
            print("Diurnal fit not found")
        return # diurnal_cycle_check

    # rotate the hours so that the expected peak hour sits at position 11
    hours = np.arange(24)
    hours = np.roll(hours, 11-int(diurnal_offset))

    if (best_fit_diurnal is None) and (best_fit_uncertainty is None):
        best_fit_diurnal, best_fit_uncertainty = prepare_data(station, obs_var)

    n_days = best_fit_diurnal.shape[0]

    # find locations where the overall best fit does not match the daily fit
    #   1 = mismatch, 0 = match, MISSING = no fit for that day
    potentially_spurious = np.ones(n_days)*MISSING

    for d, (fit, uncertainty) in enumerate(zip(best_fit_diurnal, best_fit_uncertainty)):
        if fit != MISSING:
            min_range = 11 - uncertainty
            max_range = 11 + uncertainty

            offset_loc, = np.where(hours == fit)

            # find where the best fit falls outside the range for this particular day
            if offset_loc < min_range or offset_loc > max_range:
                potentially_spurious[d] = 1
            else:
                potentially_spurious[d] = 0

    # now check there are sufficient issues in running 30 day periods
    #   Any periods >30 days where the diurnal cycle deviates from the expected
    #   phase by more than this uncertainty, without three consecutive good or
    #   missing days or six consecutive days consisting of a mix of only good or
    #   missing values, are deemed dubious and the entire period of data
    #   (including all non-temperature elements) is flagged
    n_good = 0
    n_miss = 0
    n_not_bad = 0
    total_points = 0
    total_not_miss = 0
    bad_locs = np.zeros(n_days)  # 1 marks a day belonging to a dubious period

    for d in range(n_days):

        if potentially_spurious[d] == 1:
            # if bad, just add one (and break any run of good/missing days)
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points += 1
            total_not_miss += 1

        else:
            # found a non-bad value - so check previous run
            #  if have reached limits on good/missing
            if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):
                # sufficient good missing or not bad data
                if total_points >= 30:
                    # if have collected enough others, then set flag
                    if float(total_not_miss)/total_points >= 0.5:
                        bad_locs[d - total_points : d] = 1
                # reset counters
                n_good = 0
                n_miss = 0
                n_not_bad = 0
                total_points = 0
                total_not_miss = 0

            # and deal with this point
            total_points += 1
            if potentially_spurious[d] == 0:
                # if good
                n_good += 1
                n_not_bad += 1
                if n_miss != 0:
                    n_miss = 0
                total_not_miss += 1

            elif potentially_spurious[d] == MISSING:
                # if missing data (array was initialised with MISSING)
                n_miss += 1
                n_not_bad += 1
                if n_good != 0:
                    n_good = 0

    # run through all days
    # find zero point of day counter in data preparation part
    # NOTE(review): assumes prepare_data() counts days from this same date - confirm
    day_counter_start = dt.datetime(np.unique(station.years)[0], np.unique(station.months)[0], np.unique(station.days)[0])

    # find the bad days in the times array
    #   iterate over the *indices* of the bad days; looping over bad_locs itself
    #   yields its 0/1 values, which would only ever point at day 0 or day 1
    for day in np.where(bad_locs == 1)[0]:

        this_day = day_counter_start + dt.timedelta(days=int(day))

        locs, = np.where(np.logical_and.reduce((station.years == this_day.year, station.months == this_day.month, station.days == this_day.day)))

        flags[locs] = "U"

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Diurnal Check {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # diurnal_cycle_check
コード例 #2
0
ファイル: streaks.py プロジェクト: glamod/glamod_landQC
def repeating_value(obs_var,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Straight-string check: flag runs of a repeated value that reach the threshold.

    The threshold is taken from the "STREAK-<variable name>" section of the config
    file ("Straight" entry); if it is absent it is calculated first and then read.
    Flagged observations receive "K".

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # work on a copy so calm wind periods can be masked out without touching
    #   the original; calms are excluded both from the threshold calculation
    #   and from the streak search
    working = copy.deepcopy(obs_var)
    if obs_var.name == "wind_speed":
        (calm_locs,) = np.ma.where(working.data == 0)
        working.data[calm_locs] = utils.MDI
        working.data.mask[calm_locs] = True

    # one blank flag per observation, and one per unmasked observation
    n_obs = working.data.shape[0]
    n_valid = working.data.compressed().shape[0]
    flags = np.array([""] * n_obs)
    compressed_flags = np.array([""] * n_valid)

    # fetch the threshold, calculating it first if the config has none
    section = "STREAK-{}".format(working.name)
    try:
        straight_threshold = float(
            utils.read_qc_config(config_file, section, "Straight"))
    except KeyError:
        print("Threshold missing in config file")
        get_repeating_string_threshold(working,
                                       config_file,
                                       plots=plots,
                                       diagnostics=diagnostics)
        straight_threshold = float(
            utils.read_qc_config(config_file, section, "Straight"))

    # need at least two unmasked values to have anything to compare
    if len(working.data.compressed()) > 1:
        string_lengths, grouped_diffs, strings = prepare_data_repeating_string(
            working, plots=plots, diagnostics=diagnostics)

        # strings at or above the threshold
        (too_long,) = np.where(string_lengths >= straight_threshold)

        for entry in too_long:
            group = strings[entry]
            # position within the compressed data = summed lengths of earlier groups
            start = int(np.sum(grouped_diffs[:group, 1]))
            end = start + int(grouped_diffs[group, 1]) + 1

            compressed_flags[start:end] = "K"

            if plots:
                plot_streak(times, working, start, end)

        # expand back onto the full time axis and store on the original object
        #   (the one which still has its calm periods)
        unmasked = working.data.mask == False
        flags[unmasked] = compressed_flags
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        n_flagged = len(np.where(flags != "")[0])

        print("Repeated Strings {}".format(working.name))
        print("   Cumulative number of flags set: {}".format(n_flagged))

    return  # repeating_value
コード例 #3
0
ファイル: spike.py プロジェクト: glamod/glamod_landQC
def identify_spikes(obs_var,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Use config_file to read in critical values, and then assess to find spikes

    First differences of the observations are compared against a critical value
    that depends on the time difference between the two observations.  A spike is
    a large jump followed, within MAX_SPIKE_LENGTH differences, by a large jump of
    opposite sign, with only small differences inside the spike and immediately
    either side of it.  Spike observations receive the flag "S".

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # TODO check works with missing data (compressed?)
    # TODO monthly?

    def read_critical_values(time_differences):
        """Return {time difference: critical value} for the entries found in the config file"""
        found = {}
        for this_diff in time_differences:
            try:
                c_value = utils.read_qc_config(config_file,
                                               "SPIKE-{}".format(obs_var.name),
                                               "{}".format(this_diff))
                found[this_diff] = float(c_value)
            except KeyError:
                # no critical value for this time difference
                pass
        return found

    masked_times = np.ma.masked_array(times, mask=obs_var.data.mask)

    time_diffs = np.ma.diff(masked_times) / np.timedelta64(
        1, "m")  # presuming minutes
    value_diffs = np.ma.diff(obs_var.data)

    if len(value_diffs.mask.shape) == 0:
        # single mask value, replace with array of True/False's
        if value_diffs.mask:
            value_diffs.mask = np.ones(value_diffs.shape)
        else:
            value_diffs.mask = np.zeros(value_diffs.shape)

    # get thresholds for each unique time differences
    unique_diffs = np.unique(time_diffs.compressed())

    # retrieve the critical values
    critical_values = read_critical_values(unique_diffs)

    # if none have been read, give an option to calculate in case that was the reason for none
    if len(critical_values) == 0:
        get_critical_values(obs_var,
                            times,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)

        # and try again
        critical_values = read_critical_values(unique_diffs)

    # pre select for each time difference that can be tested
    for t_diff in unique_diffs:
        if t_diff == 0:
            # not a spike or jump, but 2 values at the same time.
            #  should be zero value difference, so fitting histogram not going to work
            #  handled in separate test
            continue

        # new blank flag array
        flags = np.array(["" for i in range(obs_var.data.shape[0])])

        t_locs, = np.where(time_diffs == t_diff)

        try:
            c_locs, = np.where(
                np.abs(value_diffs[t_locs]) > critical_values[t_diff])
        except KeyError:
            # no critical value for this time difference
            continue  # to next loop

        # proxy used wherever a time difference has no critical value of its own
        #   (critical_values holds at least the entry for t_diff at this point)
        max_critical_value = max(critical_values.values())

        # TODO - sort spikes at very beginning or very end of sequence,
        #    when don't have a departure from/return to a normal level

        # potential spikes
        for possible_in_spike in t_locs[c_locs]:
            is_spike = False

            spike_len = 1
            while spike_len <= MAX_SPIKE_LENGTH:
                # test for each possible length to see if identified
                out_loc = possible_in_spike + spike_len
                try:
                    out_spike_t_diff = time_diffs[out_loc]
                    possible_out_spike = value_diffs[out_loc]
                except IndexError:
                    # got to end of data run, can't test final value at the moment
                    break

                # need to test mask/unmasked using array rather than values extracted above
                #    as if values unmasked, then no mask attribute to test!
                if time_diffs.mask[out_loc] == False and \
                        value_diffs.mask[out_loc] == False:
                    try:
                        # find critical value for time-difference of way out of spike
                        out_critical_value = critical_values[out_spike_t_diff]
                    except KeyError:
                        # don't have a value for this time difference, so use the maximum of all as a proxy
                        out_critical_value = max_critical_value
                else:
                    # time or value difference masked
                    out_critical_value = max_critical_value

                if np.abs(possible_out_spike) > out_critical_value:
                    # check that the signs are opposite
                    if np.sign(value_diffs[possible_in_spike]) != np.sign(
                            value_diffs[out_loc]):
                        is_spike = True
                        break

                spike_len += 1

            if is_spike:
                # test within spike differences (choosing correct time difference)
                # NOTE(review): these comparisons are signed (no abs), as are the
                #   before/after ones below - confirm that is intended
                for within in range(1, spike_len):
                    within_loc = possible_in_spike + within

                    within_critical_value = max_critical_value
                    if time_diffs.mask[within_loc] == False:
                        try:
                            within_critical_value = critical_values[
                                time_diffs[within_loc]]
                        except KeyError:
                            # don't have a value for this time difference, so keep the proxy
                            pass

                    # if masked then no data, so can't say if it's not a spike
                    if value_diffs.mask[within_loc] == False:
                        if value_diffs[within_loc] > within_critical_value / 2.:
                            is_spike = False

            if is_spike:
                # test either side (either before or after is too big)
                before_loc = possible_in_spike - 1
                after_loc = possible_in_spike + spike_len + 1

                # index -1 does not raise, it wraps round to the final difference,
                #   so being off the front of the array has to be tested explicitly
                has_before = before_loc >= 0

                before_critical_value = max_critical_value
                if has_before and time_diffs.mask[before_loc] == False:
                    try:
                        before_critical_value = critical_values[
                            time_diffs[before_loc]]
                    except KeyError:
                        # don't have a value for this time difference, so keep the proxy
                        pass

                try:
                    after_t_diff = time_diffs[after_loc]
                    if time_diffs.mask[after_loc] == False:
                        after_critical_value = critical_values[after_t_diff]
                    else:
                        # time difference masked
                        after_critical_value = max_critical_value
                except (KeyError, IndexError):
                    # no value for this time difference, or off the back of
                    #   the data array, so use the maximum of all as a proxy
                    after_critical_value = max_critical_value

                if has_before and value_diffs.mask[before_loc] == False:
                    if value_diffs[before_loc] > before_critical_value / 2.:
                        # before spike fails test
                        is_spike = False

                try:
                    if value_diffs.mask[after_loc] == False:
                        if value_diffs[after_loc] > after_critical_value / 2.:
                            # after spike fails test
                            is_spike = False
                except IndexError:
                    # off the back of the data array
                    pass

            # if the spike is still set, set the flags
            if is_spike:
                # "+1" because of difference arrays
                flags[possible_in_spike + 1:possible_in_spike + 1 +
                      spike_len] = "S"

                # diagnostic plots
                if plots:
                    plot_spike(times, obs_var, possible_in_spike + 1,
                               spike_len)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:

            print("Spike {}".format(obs_var.name))
            print("   Time Difference: {} minutes".format(t_diff))
            print("      Cumulative number of flags set: {}".format(
                len(np.where(flags != "")[0])))

    return  # identify_spikes
コード例 #4
0
ファイル: frequent.py プロジェクト: glamod/glamod_landQC
def frequent_values(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Flag values sitting in histogram bins the config file lists as suspect.

    For each calendar month the suspect bins are read from the
    "FREQUENT-<variable name>" section (calculated first if absent).  Each
    year's data for that month are then histogrammed, and observations in a
    suspect bin which also dominates its local neighbourhood receive "F".

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array([""] * obs_var.data.shape[0])

    all_years = np.unique(station.years)
    section = "FREQUENT-{}".format(obs_var.name)

    # months on the outside, years on the inside
    for month in range(1, 13):
        month_key = "{}".format(month)

        # bin-width and suspect bins for this month, calculating if need be
        try:
            width = float(
                utils.read_qc_config(config_file, section, "width"))
            suspect_bins = utils.read_qc_config(config_file,
                                                section,
                                                month_key,
                                                islist=True)
        except KeyError:
            print("Information missing in config file")
            identify_values(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            width = float(
                utils.read_qc_config(config_file, section, "width"))
            suspect_bins = utils.read_qc_config(config_file,
                                                section,
                                                month_key,
                                                islist=True)

        # nothing listed for this month
        if len(suspect_bins) == 0:
            continue

        for year in all_years:
            in_month = station.months == month
            in_year = station.years == year
            (locs,) = np.where(np.logical_and(in_month, in_year))

            month_data = obs_var.data[locs]

            # no unmasked observations this month
            if np.ma.count(month_data) == 0:
                continue

            month_flags = np.array([""] * month_data.shape[0])

            # bin width follows the reporting accuracy of the data
            resolution = utils.reporting_accuracy(month_data)
            bin_width = 0.5 if resolution <= 0.5 else 1.0
            bins = utils.create_bins(month_data, bin_width, obs_var.name)
            hist, bin_edges = np.histogram(month_data, bins)

            # walk along the histogram looking for bins which are the
            #   maximum of their local ("ROLLING") neighbourhood
            half_window = ROLLING // 2
            for b, bar in enumerate(hist):
                if not (half_window < b <= (len(hist) - half_window)):
                    continue

                target_bins = hist[b - half_window:b + half_window + 1]

                # enough obs, the local maximum, a large enough share of the
                #   neighbourhood, and listed as suspect in the config
                meets_criteria = (bar >= utils.DATA_COUNT_THRESHOLD
                                  and bar == target_bins.max()
                                  and (bar / target_bins.sum()) > RATIO
                                  and bins[b] in suspect_bins)
                if not meets_criteria:
                    continue

                # observations (this month & year) lying within the bin
                lower, upper = bins[b], bins[b + 1]
                flag_locs = np.where(
                    np.logical_and(month_data >= lower, month_data < upper))
                month_flags[flag_locs] = "F"

            # transfer this year's flags into the main array
            flags[locs] = month_flags

        # diagnostic plots (uses the histogram left over from the year loop)
        if plots:
            import matplotlib.pyplot as plt

            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            # blank out everything except the suspect bins for the overlay
            bad_hist = np.copy(hist)
            for b in range(len(bad_hist)):
                if bins[b] not in suspect_bins:
                    bad_hist[b] = 0

            plt.step(bins[1:], bad_hist, color='r', where="pre")
            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        n_flagged = len(np.where(flags != "")[0])

        print("Frequent Values {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(n_flagged))

    return  # frequent_values
コード例 #5
0
ファイル: variance.py プロジェクト: glamod/glamod_landQC
def variance_check(obs_var,
                   station,
                   config_file,
                   plots=False,
                   diagnostics=False,
                   winsorize=True):
    """
    Flag months whose variance is unusual compared to the same calendar month in other years.

    The average and spread of the variances for each calendar month are read from
    the "VARIANCE-<variable name>" section of the config file (calculated first if
    absent).  A year is suspect when its scaled variance exceeds SPREAD_THRESHOLD.
    For pressure and wind speed a suspect month is left unflagged when it could be
    the signature of a storm.  Flagged observations receive "V".

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    is_pressure = obs_var.name in [
        "station_level_pressure", "sea_level_pressure"
    ]
    # variables whose high variance may be caused by a genuine storm
    needs_storm_check = is_pressure or obs_var.name == "wind_speed"

    all_years = np.unique(station.years)

    # get hourly climatology for each month
    for month in range(1, 13):
        month_locs, = np.where(station.months == month)

        variances = prepare_data(obs_var,
                                 station,
                                 month,
                                 diagnostics=diagnostics,
                                 winsorize=winsorize)

        try:
            average_variance = float(
                utils.read_qc_config(config_file,
                                     "VARIANCE-{}".format(obs_var.name),
                                     "{}-average".format(month)))
            variance_spread = float(
                utils.read_qc_config(config_file,
                                     "VARIANCE-{}".format(obs_var.name),
                                     "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            average_variance = float(
                utils.read_qc_config(config_file,
                                     "VARIANCE-{}".format(obs_var.name),
                                     "{}-average".format(month)))
            variance_spread = float(
                utils.read_qc_config(config_file,
                                     "VARIANCE-{}".format(obs_var.name),
                                     "{}-spread".format(month)))

        if average_variance == utils.MDI and variance_spread == utils.MDI:
            # couldn't be calculated, move on
            continue

        # indices (into the sorted unique years) of years with anomalous variance
        bad_years, = np.where(
            np.abs(variances - average_variance) /
            variance_spread > SPREAD_THRESHOLD)

        # prepare wind and pressure data in case needed to check for storms
        if needs_storm_check:
            wind_monthly_data = station.wind_speed.data[month_locs]
            if is_pressure:
                pressure_monthly_data = obs_var.data[month_locs]
            else:
                pressure_monthly_data = station.sea_level_pressure.data[
                    month_locs]

            if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                    len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                # need sufficient data to work with for storm check to work, else can't tell
                #    move on
                continue

            wind_average = utils.average(wind_monthly_data)
            wind_spread = utils.spread(wind_monthly_data)

            pressure_average = utils.average(pressure_monthly_data)
            pressure_spread = utils.spread(pressure_monthly_data)

        # go through each bad year for this month
        for year in bad_years:

            # corresponding locations
            ym_locs, = np.where(
                np.logical_and(station.months == month,
                               station.years == all_years[year]))

            # if pressure or wind speed, need to do some further checking before applying flags
            if needs_storm_check:

                # pull out the data
                wind_data = station.wind_speed.data[ym_locs]
                if is_pressure:
                    pressure_data = obs_var.data[ym_locs]
                else:
                    pressure_data = station.sea_level_pressure.data[ym_locs]

                # need sufficient data to work with for storm check to work, else can't tell
                if len(pressure_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                        len(wind_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                    # move on
                    continue

                # find locations of high wind speeds and low pressures, cross match
                high_winds, = np.ma.where(
                    (wind_data - wind_average) / wind_spread > STORM_THRESHOLD)
                low_pressures, = np.ma.where(
                    (pressure_average - pressure_data) /
                    pressure_spread > STORM_THRESHOLD)

                # True for each high wind which coincides with a low pressure.
                #   Test the contents, not the length: the length is simply the
                #   number of high winds, whether or not any of them matched.
                match = np.isin(high_winds, low_pressures)

                # a match could be a storm, either at tropical station (relatively
                # constant pressure) or out of season in mid-latitudes.
                couldbe_storm = bool(match.any())

                if is_pressure:
                    diffs = np.ma.diff(pressure_data)
                else:
                    diffs = np.ma.diff(wind_data)

                # count up the largest number of sequential negative and positive differences
                # NOTE(review): masked differences fail "diff > 0" and so are
                #   counted with the negative (non-positive) run - confirm intended
                negs, poss = 0, 0
                biggest_neg, biggest_pos = 0, 0

                for diff in diffs:

                    if diff > 0:
                        biggest_neg = max(biggest_neg, negs)
                        negs = 0
                        poss += 1
                    else:
                        biggest_pos = max(biggest_pos, poss)
                        poss = 0
                        negs += 1

                # the run still in progress when the data end counts too
                biggest_neg = max(biggest_neg, negs)
                biggest_pos = max(biggest_pos, poss)

                if (biggest_neg >= 10) or (biggest_pos >= 10) or couldbe_storm:
                    # could be a storm, so better to leave this month unflagged
                    #   (run length of 10 from HadISD values)
                    continue

            # insufficient to identify as a storm, or not a storm-affected
            #   variable, so set the flags, if there are any locations
            if len(ym_locs) != 0:
                flags[ym_locs] = "V"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            scaled_variances = ((variances - average_variance) /
                                variance_spread)
            bins = utils.create_bins(scaled_variances, 0.25, obs_var.name)
            hist, bin_edges = np.histogram(scaled_variances, bins)

            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Months")
            plt.xlabel("Scaled {} Variances".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(SPREAD_THRESHOLD, c="r")
            plt.axvline(-SPREAD_THRESHOLD, c="r")

            bad_hist, dummy = np.histogram(scaled_variances[bad_years], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Variance {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # variance_check
コード例 #6
0
ファイル: pressure.py プロジェクト: glamod/glamod_landQC
def pressure_offset(sealp,
                    stnlp,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Flag locations where difference between sea-level and station-level
    pressure falls outside of bounds

    The bounds are ``average +/- THRESHOLD * spread`` of the difference
    (sea-level minus station-level).  The average and spread are read from the
    configuration file; if they are missing they are calculated from the data
    (spread clamped to [MIN_SPREAD, MAX_SPREAD]) and written back.

    Flags ("p") are only stored on the station level pressure object.

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(sealp.data.shape[0])])

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        try:
            average = float(
                utils.read_qc_config(config_file, "PRESSURE", "average"))
            spread = float(
                utils.read_qc_config(config_file, "PRESSURE", "spread"))
        except KeyError:
            print("Information missing in config file")
            average = utils.average(difference)
            spread = utils.spread(difference)
            if spread < MIN_SPREAD:  # less than XhPa
                spread = MIN_SPREAD
            elif spread > MAX_SPREAD:  # more than XhPa
                spread = MAX_SPREAD

            # store so that subsequent runs use the same scaling
            utils.write_qc_config(config_file,
                                  "PRESSURE",
                                  "average",
                                  "{}".format(average),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file,
                                  "PRESSURE",
                                  "spread",
                                  "{}".format(spread),
                                  diagnostics=diagnostics)

        if np.abs(np.ma.mean(difference) -
                  np.ma.median(difference)) > THRESHOLD * spread:
            # mean and median disagree strongly, so the bounds are not
            # meaningful - no flags are set or stored in this case
            if diagnostics:
                print("Large difference between mean and median")
                print("Likely to have two populations of roughly equal size")
                print("Test won't work")
        else:
            upper_limit = average + (THRESHOLD * spread)
            lower_limit = average - (THRESHOLD * spread)

            high, = np.ma.where(difference > upper_limit)
            low, = np.ma.where(difference < lower_limit)

            # diagnostic plots
            if plots:
                bins = np.arange(
                    np.round(difference.min()) - 1,
                    np.round(difference.max()) + 1, 0.1)
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(difference.compressed(), bins=bins)
                plt.axvline(x=upper_limit, ls="--", c="r")
                plt.axvline(x=lower_limit, ls="--", c="r")
                plt.xlim([bins[0] - 1, bins[-1] + 1])
                plt.ylabel("Observations")
                plt.xlabel("Difference (hPa)")
                plt.show()

            if len(high) != 0:
                flags[high] = "p"
                if diagnostics:
                    # placeholder was missing, so the variable name was
                    # silently dropped from the message
                    print("Pressure {}".format(stnlp.name))
                    print("   Number of high differences {}".format(len(high)))
                if plots:
                    for bad in high:
                        plot_pressure(sealp, stnlp, times, bad)

            if len(low) != 0:
                flags[low] = "p"
                if diagnostics:
                    print("   Number of low differences {}".format(len(low)))
                if plots:
                    for bad in low:
                        plot_pressure(sealp, stnlp, times, bad)

            # only flag the station level pressure
            stnlp.flags = utils.insert_flags(stnlp.flags, flags)

    if diagnostics:

        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # pressure_offset
コード例 #7
0
def monthly_clim(obs_var,
                 station,
                 config_file,
                 logfile="",
                 plots=False,
                 diagnostics=False,
                 winsorize=True):
    """
    Climatological outlier check, run calendar month by calendar month.

    For each month the normalised anomalies are binned, and observations
    lying beyond a gap in the distribution (as located by utils.find_gap,
    starting from the stored upper/lower thresholds) are flagged "C".
    Thresholds are read from the configuration file, and generated via
    find_month_thresholds if they are not yet present.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: string for configuration file
    :param str logfile: string for log file (not used in this routine)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: passed through to prepare_data
    """

    def _stored_thresholds(which_month):
        # read (upper, lower) for this month; KeyError propagates if absent
        section = "CLIMATOLOGICAL-{}".format(obs_var.name)
        upper = float(
            utils.read_qc_config(config_file, section,
                                 "{}-uthresh".format(which_month)))
        lower = float(
            utils.read_qc_config(config_file, section,
                                 "{}-lthresh".format(which_month)))
        return upper, lower

    flags = np.array(["" for _ in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        month_locs, = np.where(station.months == month)

        # whole-record array, with only this calendar month unmasked
        normalised_anomalies = prepare_data(obs_var,
                                            station,
                                            month,
                                            diagnostics=diagnostics,
                                            winsorize=winsorize)

        n_valid = len(normalised_anomalies.compressed())
        if n_valid < utils.DATA_COUNT_THRESHOLD:
            # too few observations to assess this month
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH,
                                 obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies.compressed(),
                                       bins)

        try:
            upper_threshold, lower_threshold = _stored_thresholds(month)
        except KeyError:
            print("Information missing in config file")
            find_month_thresholds(obs_var,
                                  station,
                                  config_file,
                                  plots=plots,
                                  diagnostics=diagnostics)
            upper_threshold, lower_threshold = _stored_thresholds(month)

        # how many observations sit beyond each threshold
        n_above = len(np.where(normalised_anomalies > upper_threshold)[0])
        n_below = len(np.where(normalised_anomalies < lower_threshold)[0])

        if n_above > 0:
            upper_gap = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if upper_gap != 0:
                # indices are into the whole record (all years, one month)
                beyond_gap, = np.ma.where(normalised_anomalies > upper_gap)
                flags[beyond_gap] = "C"

        if n_below > 0:
            lower_gap = utils.find_gap(hist,
                                       bins,
                                       lower_threshold,
                                       GAP_SIZE,
                                       upwards=False)

            if lower_gap != 0:
                beyond_gap, = np.ma.where(normalised_anomalies < lower_gap)
                flags[beyond_gap] = "C"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")

            # overlay the distribution of the flagged values for this month
            flagged, = np.where(flags[month_locs] == "C")
            flagged_hist, _ = np.histogram(
                normalised_anomalies[month_locs][flagged], bins)
            plt.step(bins[1:], flagged_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        n_flagged = len(np.where(flags != "")[0])
        print("Climatological {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(n_flagged))

    return  # monthly_clim
コード例 #8
0
def all_obs_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for month and find secondary populations in distribution.

    For each calendar month the normalised anomalies are binned and
    observations lying beyond a gap in the distribution (located by
    utils.find_gap, starting from the stored thresholds) are flagged "d".

    For pressure variables, low-tail flags are removed again where the
    observations coincide with anomalously high wind speeds and low pressure
    (both exceeding STORM_THRESHOLD in units of the monthly spread), as these
    could be a storm signal.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        # anomalies for this calendar month only (all years)
        normalised_anomalies = prepare_all_data(obs_var,
                                                station,
                                                month,
                                                config_file,
                                                full=False,
                                                diagnostics=diagnostics)

        if (len(normalised_anomalies.compressed()) == 1
                and normalised_anomalies[0] == utils.MDI):
            # no data to work with for this month, move on.
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies, bins)

        try:
            upper_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-uthresh".format(month)))
            lower_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-lthresh".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            upper_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-uthresh".format(month)))
            lower_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-lthresh".format(month)))

        if upper_threshold == utils.MDI and lower_threshold == utils.MDI:
            # these weren't able to be calculated, move on
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            continue

        # now to find the gaps
        uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
        lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

        month_locs, = np.where(
            station.months == month)  # append should keep year order
        if uppercount > 0:
            gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies >
                                        gap_start)  # all years for one month

                # bad_locs index into the month subset, so go via a copy
                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"
                flags[month_locs] = month_flags

        if lowercount > 0:
            gap_start = utils.find_gap(hist,
                                       bins,
                                       lower_threshold,
                                       GAP_SIZE,
                                       upwards=False)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies <
                                        gap_start)  # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"

                # TODO - can this bit be refactored?
                # for pressure data, see if the flagged obs correspond with high winds
                # could be a storm signal
                if obs_var.name in [
                        "station_level_pressure", "sea_level_pressure"
                ]:
                    wind_monthly_data = prepare_monthly_data(
                        station.wind_speed, station, month)
                    pressure_monthly_data = prepare_monthly_data(
                        obs_var, station, month)

                    if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                            len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                        # need sufficient data to work with for storm check to work, else can't tell
                        pass
                    else:

                        wind_monthly_average = utils.average(wind_monthly_data)
                        wind_monthly_spread = utils.spread(wind_monthly_data)

                        pressure_monthly_average = utils.average(
                            pressure_monthly_data)
                        pressure_monthly_spread = utils.spread(
                            pressure_monthly_data)

                        # already a single calendar month, so go through each year
                        all_years = np.unique(station.years)
                        for year in all_years:

                            # what's best - extract only when necessary but repeatedly if so, or always, but once
                            # unpack to a plain index array (positions within
                            # the month subset) so it can be composed with the
                            # storm locations when unsetting flags below
                            this_year_locs, = np.where(
                                station.years[month_locs] == year)

                            if "d" not in month_flags[this_year_locs]:
                                # skip if you get the chance
                                continue

                            wind_data = station.wind_speed.data[month_locs][
                                this_year_locs]
                            pressure_data = obs_var.data[month_locs][
                                this_year_locs]

                            # high wind anomaly together with low pressure anomaly
                            storms, = np.ma.where(
                                np.logical_and(
                                    (((wind_data - wind_monthly_average) /
                                      wind_monthly_spread) > STORM_THRESHOLD),
                                    (((pressure_monthly_average - pressure_data
                                       ) / pressure_monthly_spread) >
                                     STORM_THRESHOLD)))

                            # more than one entry - check if separate events
                            if len(storms) >= 2:
                                # find where separation more than the usual obs separation
                                # NOTE(review): this compares index gaps in
                                # `storms` with the median first difference of
                                # the wind speed values - confirm that this is
                                # the intended measure of "usual separation"
                                storm_1diffs = np.ma.diff(storms)
                                separations, = np.where(
                                    storm_1diffs > np.ma.median(
                                        np.ma.diff(wind_data)))

                                if len(separations) != 0:
                                    # multiple storm signals
                                    storm_start = 0
                                    storm_finish = separations[0] + 1
                                    first_storm = expand_around_storms(
                                        storms[storm_start:storm_finish],
                                        len(wind_data))
                                    final_storm_locs = copy.deepcopy(
                                        first_storm)

                                    for j in range(len(separations)):
                                        # then do the rest in a loop

                                        if j + 1 == len(separations):
                                            # final one
                                            this_storm = expand_around_storms(
                                                storms[separations[j] + 1:],
                                                len(wind_data))
                                        else:
                                            this_storm = expand_around_storms(
                                                storms[separations[j] +
                                                       1:separations[j + 1] +
                                                       1], len(wind_data))

                                        final_storm_locs = np.append(
                                            final_storm_locs, this_storm)

                                else:
                                    # locations separated at same interval as data
                                    final_storm_locs = expand_around_storms(
                                        storms, len(wind_data))

                            # single entry
                            elif len(storms) != 0:
                                # expand around the storm signal (rather than
                                #  just unflagging what could be the peak and
                                #  leaving the entry/exit flagged)
                                final_storm_locs = expand_around_storms(
                                    storms, len(wind_data))

                            # unset the flags
                            if len(storms) > 0:
                                # index month_flags once with the composed
                                # index: chained fancy indexing
                                # (a[idx1][idx2] = x) assigns into a temporary
                                # copy and would leave month_flags unchanged
                                month_flags[
                                    this_year_locs[final_storm_locs]] = ""

                # having checked for storms now store final flags
                flags[month_locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])

            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")

            bad_locs, = np.where(flags[month_locs] == "d")
            bad_hist, dummy = np.histogram(normalised_anomalies[bad_locs],
                                           bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Distribution (all) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # all_obs_gap
コード例 #9
0
def prepare_all_data(obs_var,
                     station,
                     month,
                     config_file,
                     full=False,
                     diagnostics=False):
    """
    Return the normalised anomalies for all observations in a calendar month.

    The climatology (average) and spread used for the scaling are either
    calculated from the data and written to the configuration file, or read
    back from it.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: month to process
    :param str config_file: configuration file to store critical values
    :param bool full: if True always calculate and store the scaling; if False
        read it from the configuration file, calculating and storing only
        when the entries are missing
    :param bool diagnostics: turn on diagnostic output

    :returns: array of (data - climatology) / spread for the month; the
        unscaled differences if the spread is zero; or a single-element
        masked array holding utils.MDI if no scaling could be calculated
    """

    this_month, = np.where(station.months == month)
    all_month_data = obs_var.data[this_month]

    # decide whether the scaling has to be (re)calculated
    recalculate = full
    if not recalculate:
        try:
            climatology = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-clim".format(month)))
            spread = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-spread".format(month)))
        except KeyError:
            # nothing stored yet - fall back to calculating from the data
            recalculate = True

    if recalculate:
        enough_data = (len(all_month_data.compressed()) >=
                       utils.DATA_COUNT_THRESHOLD)
        if enough_data:
            # have data, now to standardise
            climatology = utils.average(all_month_data)  # mean
            spread = utils.spread(all_month_data)  # IQR currently
        else:
            climatology = spread = utils.MDI

        # write out the scaling...
        for key, value in (("{}-clim".format(month), climatology),
                           ("{}-spread".format(month), spread)):
            utils.write_qc_config(config_file,
                                  "ADISTRIBUTION-{}".format(obs_var.name),
                                  key,
                                  "{}".format(value),
                                  diagnostics=diagnostics)

    if climatology == utils.MDI and spread == utils.MDI:
        # these weren't calculable, move on
        return np.ma.array([utils.MDI])

    anomalies = all_month_data - climatology
    if spread == 0:
        # all the same value, so no scaling possible
        return anomalies  # prepare_all_data

    return anomalies / spread  # prepare_all_data
コード例 #10
0
def monthly_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use distribution of monthly averages to identify asymmetries.

    For each calendar month, the monthly averages are standardised with the
    climatology and spread held in the configuration file (generated via
    find_monthly_scaling if missing).  Months whose standardised value is at
    least LARGE_LIMIT in magnitude are flagged, as are all months in the tail
    of an asymmetric distribution, found by walking outwards from the centre
    of the sorted values.  All observations in a flagged month get flag "D".

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])
    all_years = np.unique(station.years)

    def _flag_months(bad_month_ids, this_month):
        # follow flag locations back up through the process: each id is the
        # position of a year within all_years for this calendar month
        for bad_month_id in bad_month_ids:
            locs, = np.where(
                np.logical_and(station.months == this_month,
                               station.years == all_years[bad_month_id]))
            flags[locs] = "D"

    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var,
                                              station,
                                              month,
                                              diagnostics=diagnostics)

        # read in the scaling
        try:
            climatology = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-clim".format(month)))
            spread = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_monthly_scaling(obs_var,
                                 station,
                                 config_file,
                                 diagnostics=diagnostics)
            climatology = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-clim".format(month)))
            spread = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-spread".format(month)))

        if climatology == utils.MDI and spread == utils.MDI:
            # these weren't calculable, move on
            continue

        standardised_months = (month_averages - climatology) / spread

        bins = utils.create_bins(standardised_months, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(standardised_months, bins)

        # flag months with very large offsets
        large_offsets, = np.where(np.abs(standardised_months) >= LARGE_LIMIT)
        _flag_months(large_offsets, month)

        # walk distribution from centre to find asymmetry
        sort_order = standardised_months.argsort()
        sorted_months = standardised_months[sort_order]  # sort once only
        n_months = sorted_months.shape[0]
        mid_point = n_months // 2
        step = 1
        bad = []
        # bounds are tested before indexing, so a series with fewer than
        # three entries is skipped rather than raising an IndexError
        while (mid_point - step) >= 0 and (mid_point + step) < n_months:

            lower_value = sorted_months[mid_point - step]
            upper_value = sorted_months[mid_point + step]

            if lower_value != upper_value:

                suspect_months = [np.abs(lower_value), np.abs(upper_value)]
                nearest = min(suspect_months)
                furthest = max(suspect_months)

                if nearest != 0:
                    # not all clustered at origin

                    if furthest / nearest >= 2. and nearest >= 1.5:
                        # at least 1.5x spread from centre and difference of two in location (longer tail)
                        # flag everything further from this bin for that tail
                        if suspect_months[0] == furthest:
                            # LHS has issue (remember that have removed the sign)
                            # need -1 given array indexing standards
                            bad = sort_order[:mid_point - (step - 1)]
                        elif suspect_months[1] == furthest:
                            # RHS has issue
                            bad = sort_order[mid_point + step:]
                        # asymmetry found, no need to walk further
                        break

            step += 1

        # now follow flag locations back up through the process
        _flag_months(bad, month)

        if plots:
            import matplotlib.pyplot as plt

            plt.step(bins[1:], hist, color='k', where="pre")
            if len(bad) > 0:
                bad_hist, dummy = np.histogram(standardised_months[bad], bins)
                plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.ylabel("Number of Months")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Distribution (monthly) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # monthly_gap