def dgc_set_up_plot(plot_gaussian, standardised_months, variable, threshold = (1.5,-1.5), sub_par = "", GH = False):
    '''
    Set up the histogram plot and overlay an already-fitted curve.

    The current figure is cleared, the histogram of the standardised values
    is drawn on a log y-axis, the supplied fit is over-plotted and vertical
    lines are added at the two threshold values.  The figure is neither
    shown nor saved here; that is left to the caller.

    :param array plot_gaussian: fitted curve, evaluated by the caller at the
        bin centres obtained from utils.create_bins with width BIN_SIZE/10
    :param array standardised_months: input array of months standardised by IQR
    :param str variable: label for title and axes
    :param tuple threshold: pair of x values at which to draw vertical lines
    :param str sub_par: sub-parameter for labels
    :param bool GH: not used in this routine; kept so existing calls still work
    :returns:
    '''
    import matplotlib.pyplot as plt

    # coarse bins for the histogram, finer bin centres for the fitted curve
    bins, bincenters = utils.create_bins(standardised_months, BIN_SIZE)
    _, plot_bincenters = utils.create_bins(standardised_months, BIN_SIZE/10.)

    # make the histogram; empty bins are lifted to 0.01 so that they can be
    # drawn on the log y-scale
    hist, _ = np.histogram(standardised_months, bins = bins)
    plot_hist = np.array([0.01 if h == 0 else h for h in hist])

    plt.clf()
    plt.axes([0.1,0.15,0.8,0.7])
    plt.step(bincenters, plot_hist, 'k-', label = 'standardised months', where='mid')

    # the fit itself is done by the caller and passed in
    plt.plot(plot_bincenters, plot_gaussian, 'b-', label = 'Gaussian fit')

    # sort the labels etc
    plt.xlabel("%s offset (IQR)" % variable)
    plt.ylabel("Frequency (%s)" % sub_par)
    plt.gca().set_yscale('log')
    plt.axvline(threshold[0],c='r')
    plt.axvline(threshold[1],c='r')
    # 'bottom' rather than the old 'ymin' alias, which newer matplotlib rejects
    plt.ylim(bottom=0.1)
    plt.title("Distributional Gap Check - %s - %s" % (sub_par, variable) )

    return  # dgc_set_up_plot
Exemple #2
0
def plot_target_neigh_diffs_dist(differences, iqr):
    '''
    Plot the distribution of target-neighbour differences

    Draws the histogram of the unmasked differences on a log y-axis, a
    Gaussian fitted to that histogram, and vertical lines at +/- 5 IQR,
    then shows the figure.

    :param array differences: masked difference array
    :param float iqr: inter quartile range of differences

    :returns:
    '''
    import matplotlib.pyplot as plt

    plt.clf()

    # the unmasked values are needed several times, so extract them once
    good_differences = differences.compressed()

    bins, bincenters = utils.create_bins(good_differences, 1.0)

    hist, _ = np.histogram(good_differences, bins=bins)
    peak = max(hist)

    # empty bins are lifted to 0.1 so that they can be drawn on the log y-scale
    plot_hist = np.array([float(x) if x != 0 else 1e-1 for x in hist])
    plt.step(bincenters, plot_hist, 'k-', label = 'observations', where='mid')

    fit = utils.fit_gaussian(bincenters, hist, peak,
                             mu = np.mean(good_differences),
                             sig = np.std(good_differences))
    plot_gaussian = utils.gaussian(bincenters, fit)
    plt.plot(bincenters, plot_gaussian, 'b-', label = 'Gaussian fit')

    plt.axvline(5.*iqr, c = 'r')
    plt.axvline(-5.*iqr, c = 'r')

    # call form of print: a single string prints identically on Python 2 and 3
    print("only shows lowest of monthly IQRs")

    plt.ylabel("Frequency")
    plt.gca().set_yscale('log')
    plt.ylim([0.1, 2*peak])

    plt.show()

    return # plot_target_neigh_diffs_dist
Exemple #3
0
def evc_plot_hist(plot_variances, iqr_threshold, title):
    '''
    Plot the histogram, with removed observations highlighted

    The full histogram is drawn in black on a log y-axis; the bins whose
    centres lie beyond +/- iqr_threshold are drawn again in red, and
    vertical red lines mark the two thresholds.  The figure is then shown.

    :param array plot_variances: values to be shown on histogram
    :param iqr_threshold: threshold for removal; used as a single x position
        for the marker lines, so presumably a scalar
    :param str title: title of plot

    :returns:
    '''

    import matplotlib.pyplot as plt

    # set up the bins and make the histogram
    bins, bincenters = utils.create_bins(plot_variances, 1.0)
    hist, _ = np.histogram(plot_variances, bins=bins)

    # empty bins are lifted to 0.01 so that they can be drawn on the log y-scale
    plot_hist = np.array([0.01 if h == 0 else h for h in hist])

    plt.clf()
    plt.step(bincenters,
             plot_hist,
             'k-',
             label='standardised months',
             where='mid')

    # sort the labels etc
    plt.xlabel("variance offset (IQR)")
    plt.ylabel("Frequency")
    plt.gca().set_yscale('log')

    plt.axvline(-iqr_threshold, c='r')
    plt.axvline(iqr_threshold, c='r')

    # over-plot the tails beyond each threshold in red
    below = bincenters < -iqr_threshold
    above = bincenters > iqr_threshold
    plt.step(bincenters[below], plot_hist[below], 'r-', where='mid')
    plt.step(bincenters[above], plot_hist[above], 'r-', where='mid')

    # 'bottom' rather than the old 'ymin' alias, which newer matplotlib rejects
    plt.ylim(bottom=0.1)
    plt.title(title)

    plt.show()

    return  # evc_plot_hist
Exemple #4
0
def evc_plot_hist(plot_variances, iqr_threshold, title):
    '''
    Plot the histogram, with removed observations highlighted

    The full histogram is drawn in black on a log y-axis; the bins whose
    centres lie beyond +/- iqr_threshold are drawn again in red, and
    vertical red lines mark the two thresholds.  The figure is then shown.

    :param array plot_variances: values to be shown on histogram
    :param iqr_threshold: threshold for removal; used as a single x position
        for the marker lines, so presumably a scalar
    :param str title: title of plot

    :returns:
    '''

    import matplotlib.pyplot as plt

    # set up the bins and make the histogram
    bins, bincenters = utils.create_bins(plot_variances, 1.0)
    hist, _ = np.histogram(plot_variances, bins = bins)

    # empty bins are lifted to 0.01 so that they can be drawn on the log y-scale
    plot_hist = np.array([0.01 if h == 0 else h for h in hist])

    plt.clf()
    plt.step(bincenters, plot_hist, 'k-', label = 'standardised months', where='mid')

    # sort the labels etc
    plt.xlabel("variance offset (IQR)")
    plt.ylabel("Frequency")
    plt.gca().set_yscale('log')

    plt.axvline(-iqr_threshold,c='r')
    plt.axvline(iqr_threshold,c='r')

    # over-plot the tails beyond each threshold in red
    below = bincenters < -iqr_threshold
    above = bincenters > iqr_threshold
    plt.step(bincenters[below], plot_hist[below], 'r-', where='mid')
    plt.step(bincenters[above], plot_hist[above], 'r-', where='mid')

    # 'bottom' rather than the old 'ymin' alias, which newer matplotlib rejects
    plt.ylim(bottom=0.1)
    plt.title(title)

    plt.show()

    return # evc_plot_hist
Exemple #5
0
def plot_target_neigh_diffs_dist(differences, iqr):
    '''
    Plot the distribution of target-neighbour differences

    Draws the histogram of the unmasked differences on a log y-axis, a
    Gaussian fitted to that histogram, and vertical lines at +/- 5 IQR,
    then shows the figure.

    :param array differences: masked difference array
    :param float iqr: inter quartile range of differences

    :returns:
    '''
    import matplotlib.pyplot as plt

    plt.clf()

    # the unmasked values are needed several times, so extract them once
    good_differences = differences.compressed()

    bins, bincenters = utils.create_bins(good_differences, 1.0)

    hist, _ = np.histogram(good_differences, bins=bins)
    peak = max(hist)

    # empty bins are lifted to 0.1 so that they can be drawn on the log y-scale
    plot_hist = np.array([float(x) if x != 0 else 1e-1 for x in hist])
    plt.step(bincenters, plot_hist, 'k-', label='observations', where='mid')

    fit = utils.fit_gaussian(bincenters,
                             hist,
                             peak,
                             mu=np.mean(good_differences),
                             sig=np.std(good_differences))
    plot_gaussian = utils.gaussian(bincenters, fit)
    plt.plot(bincenters, plot_gaussian, 'b-', label='Gaussian fit')

    plt.axvline(5. * iqr, c='r')
    plt.axvline(-5. * iqr, c='r')

    # call form of print: a single string prints identically on Python 2 and 3
    print("only shows lowest of monthly IQRs")

    plt.ylabel("Frequency")
    plt.gca().set_yscale('log')
    plt.ylim([0.1, 2 * peak])

    plt.show()

    return  # plot_target_neigh_diffs_dist
Exemple #6
0
def identify_values(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Use distribution to identify frequent values.  Then also store in config file.

    For each calendar month the unmasked observations are histogrammed and
    every bin that (a) holds at least utils.DATA_COUNT_THRESHOLD observations,
    (b) is the maximum of the ROLLING-bin window centred on it and (c) holds
    more than RATIO of the observations in that window is recorded as
    suspect.  The left edge of each suspect bin is written to the config
    file under the "FREQUENT-<variable name>" section, keyed by month number.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # TODO - do we want to go down the road of allowing resolution (and hence test)
    #           to vary over the p-o-r?  I.e. 1C in early, to 0.5C to 0.1C in different decades?

    section = "FREQUENT-{}".format(obs_var.name)
    half_window = ROLLING // 2

    utils.write_qc_config(config_file,
                          section,
                          "width",
                          "{}".format(BIN_WIDTH),
                          diagnostics=diagnostics)

    for month in range(1, 13):

        locs, = np.where(station.months == month)

        month_data = obs_var.data[locs]
        good_data = month_data.compressed()

        if len(good_data) < utils.DATA_COUNT_THRESHOLD:
            # insufficient data, so write out empty config and move on
            utils.write_qc_config(config_file,
                                  section,
                                  "{}".format(month),
                                  "[]",
                                  diagnostics=diagnostics)
            continue

        # adjust bin widths according to reporting accuracy
        resolution = utils.reporting_accuracy(month_data)

        if resolution <= 0.5:
            bins = utils.create_bins(month_data, 0.5, obs_var.name)
        else:
            bins = utils.create_bins(month_data, 1.0, obs_var.name)

        # np.histogram drops the mask of a masked array, so pass only the
        # unmasked values to keep masked observations out of the counts
        hist, _ = np.histogram(good_data, bins)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

        # Scan through the histogram
        #   check if a bin is the maximum of a local area ("ROLLING")
        suspect = []
        for b, bar in enumerate(hist):
            # only bins with enough neighbours on the low side are tested
            if not (half_window < b <= (len(hist) - half_window)):
                continue

            # needs sufficient observations in the bin itself
            if bar < utils.DATA_COUNT_THRESHOLD:
                continue

            target_bins = hist[b - half_window:b + half_window + 1]

            # local maximum which holds more than RATIO of the window
            if bar == target_bins.max() and (bar / target_bins.sum()) > RATIO:
                suspect += [bins[b]]

        # diagnostic plots
        if plots:
            # keep only the suspect bins for the red over-plot
            bad_hist = np.copy(hist)
            for b, bar in enumerate(bad_hist):
                if bins[b] not in suspect:
                    bad_hist[b] = 0

            plt.step(bins[1:], bad_hist, color='r', where="pre")
            plt.show()

        # write out the thresholds...
        utils.write_qc_config(config_file,
                              section,
                              "{}".format(month),
                              "[{}]".format(",".join(str(s) for s in suspect)),
                              diagnostics=diagnostics)

    return  # identify_values
Exemple #7
0
def frequent_values(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Use config file to read frequent values.  Check each month to see if appear.

    For every calendar month the suspect bins are read from the config file
    (running identify_values first if the entries are missing).  Then, for
    each year, that month's observations are histogrammed and any bin which
    meets the frequent-value criteria and is one of the suspect bins has its
    observations flagged with "F".  The flags are finally merged into
    obs_var.flags via utils.insert_flags.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    section = "FREQUENT-{}".format(obs_var.name)
    half_window = ROLLING // 2

    def _read_suspect_bins(month):
        """Return the suspect bins stored for this month (KeyError if absent)."""
        # the width is not used below, but reading it means a missing entry
        # raises KeyError and so triggers the identify_values fallback
        float(utils.read_qc_config(config_file, section, "width"))
        return utils.read_qc_config(config_file,
                                    section,
                                    "{}".format(month),
                                    islist=True)

    flags = np.array([""] * obs_var.data.shape[0])

    all_years = np.unique(station.years)

    # work through each month, and then year
    for month in range(1, 13):

        # read in bin-width and suspect bins for this month
        try:
            suspect_bins = _read_suspect_bins(month)
        except KeyError:
            print("Information missing in config file")
            identify_values(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            suspect_bins = _read_suspect_bins(month)

        # skip on if nothing to find
        if len(suspect_bins) == 0:
            continue

        # reset per month so the diagnostic plot below can never pick up
        # a histogram left over from an earlier month (or be undefined)
        bins = None
        hist = None

        # work through each year
        for year in all_years:
            locs, = np.where(
                np.logical_and(station.months == month, station.years == year))

            month_data = obs_var.data[locs]

            # skip if no data
            if np.ma.count(month_data) == 0:
                continue

            month_flags = np.array([""] * month_data.shape[0])

            # adjust bin widths according to reporting accuracy
            resolution = utils.reporting_accuracy(month_data)

            if resolution <= 0.5:
                bins = utils.create_bins(month_data, 0.5, obs_var.name)
            else:
                bins = utils.create_bins(month_data, 1.0, obs_var.name)

            # np.histogram drops the mask of a masked array, so pass only the
            # unmasked values to keep masked observations out of the counts
            hist, _ = np.histogram(np.ma.compressed(month_data), bins)

            # Scan through the histogram
            #   check if a bin is the maximum of a local area ("ROLLING")
            for b, bar in enumerate(hist):
                # only bins with enough neighbours on the low side are tested
                if not (half_window < b <= (len(hist) - half_window)):
                    continue

                # needs sufficient observations in the bin itself
                if bar < utils.DATA_COUNT_THRESHOLD:
                    continue

                target_bins = hist[b - half_window:b + half_window + 1]

                # local maximum which holds more than RATIO of the window
                if bar != target_bins.max():
                    continue
                if not (bar / target_bins.sum()) > RATIO:
                    continue

                # this bin meets all the criteria
                if bins[b] in suspect_bins:
                    # find observations (month & year) to flag!
                    flag_locs = np.where(
                        np.logical_and(month_data >= bins[b],
                                       month_data < bins[b + 1]))
                    month_flags[flag_locs] = "F"

            # copy flags for all years into main array
            flags[locs] = month_flags

        # diagnostic plots - shows the histogram of the last year processed
        # for this month; skipped when no year had any data
        if plots and hist is not None:
            import matplotlib.pyplot as plt

            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            # keep only the suspect bins for the red over-plot
            bad_hist = np.copy(hist)
            for b, bar in enumerate(bad_hist):
                if bins[b] not in suspect_bins:
                    bad_hist[b] = 0

            plt.step(bins[1:], bad_hist, color='r', where="pre")
            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Frequent Values {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # frequent_values
def dgc_all_obs(station,
                variable,
                flags,
                start,
                end,
                plots=False,
                diagnostics=False,
                idl=False,
                windspeeds=False,
                GH=False):
    '''
    Distributional gap check working on all observations (RJHD addition)

    For each calendar month, all years of that month are pooled, standardised
    by the median and IQR of the filtered data and histogrammed.  A Gaussian
    (or Gauss-Hermite) curve is fitted to find where the expected frequency
    drops below FREQUENCY_THRESHOLD; beyond those points dgc_find_gap looks
    for a gap in the histogram, and filtered observations further out than
    the start of a gap are flagged with 1.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array, updated in place and also returned
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics
    :param bool idl: run IDL equivalent routines for median
    :param bool windspeeds: also read station.windspeeds for the storm
        cross-check on low values (that part is unfinished and sets no flags)
    :param bool GH: fit Gauss-Hermite polynomials rather than a plain Gaussian
    :returns:
       flags - updated flag array
    '''

    if plots:
        import calendar
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    all_filtered = utils.apply_filter_flags(st_var)

    # running total over all calendar months for the closing diagnostic;
    # initialised here so it exists even if no month has enough data
    total_flagged = 0

    for month in range(12):

        # start/end index pairs of this calendar month, one row per year
        year_ranges = month_ranges[:, month, :]

        if windspeeds:
            st_var_wind = getattr(station, "windspeeds")

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(year_ranges):

                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        this_month_data, _, _ = utils.concatenate_months(
            year_ranges, st_var.data, hours=False)
        this_month_filtered, _, _ = utils.concatenate_months(
            year_ranges, all_filtered, hours=False)

        # need enough filtered observations to build a distribution
        if len(this_month_filtered.compressed()) <= OBS_LIMIT:
            continue

        if idl:
            monthly_median = utils.idl_median(
                this_month_filtered.compressed().reshape(-1))
        else:
            monthly_median = np.ma.median(this_month_filtered)

        iqr = utils.IQR(this_month_filtered.compressed())

        if iqr == 0.0:
            # to get some spread if IQR too small
            iqr = utils.IQR(this_month_filtered.compressed(), percentile=0.05)

            print("Spurious_stations file not yet sorted")

        # cannot standardise without any spread
        if iqr == 0.0:
            continue

        monthly_values = np.ma.array(
            (this_month_data.compressed() - monthly_median) / iqr)

        # coarse bins for the histogram, finer bin centres for the fitted curve
        bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
        _, plot_bincenters = utils.create_bins(monthly_values, BIN_SIZE / 10.)

        hist, binEdges = np.histogram(monthly_values, bins=bins)

        if GH:
            # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

            initial_values = [
                np.max(hist),
                np.mean(monthly_values),
                np.std(monthly_values),
                stats.skew(monthly_values),
                stats.kurtosis(monthly_values)
            ]  # norm, mean, std, skew, kurtosis

            fit = leastsq(utils.residualsGH, initial_values,
                          [bincenters, hist, np.ones(len(hist))])
            res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

            plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

            # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
            # once the curve has dropped below the floor on either side of
            # the peak, everything further out is pinned to the floor
            floor_value = FREQUENCY_THRESHOLD / 10.
            mid_point = np.argmax(plot_gaussian)

            bad, = np.where(plot_gaussian[mid_point:] < floor_value)
            if len(bad) > 0:
                plot_gaussian[mid_point:][bad[0]:] = floor_value

            bad, = np.where(plot_gaussian[:mid_point] < floor_value)
            if len(bad) > 0:
                plot_gaussian[:mid_point][:bad[-1]] = floor_value

            # extract threshold values
            good_values = np.argwhere(plot_gaussian > FREQUENCY_THRESHOLD)

            l_minimum_threshold = round(
                plot_bincenters[good_values[0]]) - 1
            u_minimum_threshold = 1 + round(
                plot_bincenters[good_values[-1]])

        else:
            gaussian = utils.fit_gaussian(bincenters,
                                          hist,
                                          max(hist),
                                          mu=np.mean(monthly_values),
                                          sig=np.std(monthly_values))

            # assume the same threshold value
            u_minimum_threshold = 1 + round(
                utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
            l_minimum_threshold = -u_minimum_threshold

            plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

        if diagnostics:
            # call form of print with explicit formatting so the output is
            # the same space-separated line on Python 2 and Python 3
            print(hist)
            if GH:
                print(res)
                print("{} {} {}".format(iqr, l_minimum_threshold,
                                        u_minimum_threshold))
            else:
                print(gaussian)
                print("{} {} {}".format(
                    iqr, u_minimum_threshold,
                    1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)))

        if plots:
            dgc_set_up_plot(plot_gaussian,
                            monthly_values,
                            variable,
                            threshold=(u_minimum_threshold,
                                       l_minimum_threshold),
                            sub_par="observations",
                            GH=GH)

            if GH:
                plt.figtext(
                    0.15,
                    0.67,
                    'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                    (res['mean'], res['dispersion'], res['skewness'],
                     res['kurtosis']),
                    color='k',
                    size='small')

        uppercount = len(np.where(monthly_values > u_minimum_threshold)[0])
        lowercount = len(np.where(monthly_values < l_minimum_threshold)[0])

        # this needs refactoring - but lots of variables to pass in
        # standardised values flagged in this calendar month
        if plots or diagnostics:
            gap_plot_values = np.array([])

        if uppercount > 0:
            gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold)

            if gap_start != 0:

                for y, year in enumerate(year_ranges):

                    this_year_data = np.ma.array(
                        all_filtered[year[0]:year[1]])
                    this_year_flags = np.array(flags[year[0]:year[1]])
                    gap_cleaned_locations = np.where(
                        ((this_year_data - monthly_median) / iqr) > gap_start)

                    this_year_flags[gap_cleaned_locations] = 1
                    flags[year[0]:year[1]] = this_year_flags

                    if plots or diagnostics:
                        gap_plot_values = np.append(
                            gap_plot_values,
                            (this_year_data[gap_cleaned_locations].
                             compressed() - monthly_median) / iqr)

        if lowercount > 0:
            gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold)

            if gap_start != 0:

                for y, year in enumerate(year_ranges):

                    this_year_data = np.ma.array(
                        all_filtered[year[0]:year[1]])
                    this_year_flags = np.array(flags[year[0]:year[1]])
                    gap_cleaned_locations = np.where(
                        np.logical_and(
                            ((this_year_data - monthly_median) / iqr) <
                            gap_start, this_year_data.mask != True))

                    this_year_flags[gap_cleaned_locations] = 1
                    flags[year[0]:year[1]] = this_year_flags

                    if plots or diagnostics:
                        gap_plot_values = np.append(
                            gap_plot_values,
                            (this_year_data[gap_cleaned_locations].
                             compressed() - monthly_median) / iqr)

                    if windspeeds:
                        # NOTE(review): unfinished storm cross-check.  The
                        # flags were already copied back above and
                        # this_year_flags is a separate copy, so nothing
                        # computed in this branch reaches the output.
                        this_year_flags[
                            gap_cleaned_locations] = 2  # tentative flags

                        slp_average = dgc_get_monthly_averages(
                            this_month_data, OBS_LIMIT, st_var.mdi, MEAN)
                        slp_mad = utils.mean_absolute_deviation(
                            this_month_data, median=True)
                        storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\
                                           (((this_month_data - slp_average) / slp_mad) > 4.5))

                        if len(storms[0]) >= 2:

                            storm_1diffs = np.diff(storms)

                            separations = np.where(storm_1diffs != 1)

                            #for sep in separations:

        if diagnostics:
            total_flagged += len(gap_plot_values)

        if plots:
            # over-plot the flagged values in red on the figure set up above
            hist, binEdges = np.histogram(gap_plot_values, bins=bins)
            plot_hist = np.array([0.01 if h == 0 else h for h in hist])
            plt.step(bincenters,
                     plot_hist,
                     'r-',
                     label='flagged',
                     where='mid')
            plt.text(0.1,
                     0.9,
                     calendar.month_name[month + 1],
                     transform=plt.gca().transAxes)
            plt.legend(loc='lower center',
                       ncol=3,
                       bbox_to_anchor=(0.5, -0.2),
                       frameon=False,
                       prop={'size': 13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    if diagnostics:
        # total over all calendar months, not just the last one processed
        utils.print_flagged_obs_number("",
                                       "Distributional Gap",
                                       variable,
                                       total_flagged,
                                       noWrite=True)

    return flags  # dgc_all_obs
def dgc_monthly(station,
                variable,
                flags,
                start,
                end,
                plots=False,
                diagnostics=False,
                idl=False):
    '''
    Original Distributional Gap Check

    Monthly averages are standardised against the climatology of their
    calendar month (mean/standard deviation if MEAN is set, otherwise
    median/IQR).  Months whose standardised value is at least LARGE_LIMIT
    are flagged, and the sorted distribution is then walked outwards from
    its centre; at the first substantially asymmetric pair, every month
    beyond the larger side is flagged as well.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array, updated in place and also returned
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (not used in this routine)
    :param bool idl: run IDL equivalent routines for median
    :returns:
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    n_months = month_ranges.shape[0]

    # get monthly averages
    month_average = np.empty(n_months)
    month_average.fill(st_var.mdi)
    month_average_filtered = np.empty(n_months)
    month_average_filtered.fill(st_var.mdi)

    all_filtered = utils.apply_filter_flags(st_var)
    for m, month in enumerate(month_ranges):

        data = st_var.data[month[0]:month[1]]
        filtered = all_filtered[month[0]:month[1]]

        month_average[m] = dgc_get_monthly_averages(data, OBS_LIMIT,
                                                    st_var.mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(
            filtered, OBS_LIMIT, st_var.mdi, MEAN)

    # get overall monthly climatologies - use filtered data

    month_average = month_average.reshape(-1, 12)
    month_average_filtered = month_average_filtered.reshape(-1, 12)

    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(st_var.mdi)

    for m in range(12):

        valid_filtered = np.where(month_average_filtered[:, m] != st_var.mdi)

        if len(valid_filtered[0]) < VALID_MONTHS:
            continue

        valid_data = month_average_filtered[valid_filtered, m][0]

        if MEAN:
            clim = np.mean(valid_data)
            # numpy has no 'stdev'; the standard deviation is np.std
            spread = np.std(valid_data)

        else:
            if idl:
                # valid_data is a plain ndarray here, so use the function
                # form, which also accepts unmasked input
                clim = utils.idl_median(
                    np.ma.compressed(valid_data).reshape(-1))
            else:
                clim = np.median(valid_data)
            spread = utils.IQR(valid_data)
            if spread <= SPREAD_LIMIT:
                spread = SPREAD_LIMIT

        standardised_months[valid_filtered,
                            m] = (month_average[valid_filtered, m] -
                                  clim) / spread

    standardised_months = standardised_months.reshape(n_months)

    good_months = np.where(standardised_months != st_var.mdi)

    # must be able to do this with masked arrays
    if plots:
        bins, bincenters = utils.create_bins(standardised_months[good_months],
                                             BIN_SIZE)
        _, plot_bincenters = utils.create_bins(
            standardised_months[good_months], BIN_SIZE / 10.)

        hist, binEdges = np.histogram(standardised_months[good_months],
                                      bins=bins)

        fit = utils.fit_gaussian(bincenters,
                                 hist,
                                 max(hist),
                                 mu=np.mean(standardised_months[good_months]),
                                 sig=np.std(standardised_months[good_months]))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        dgc_set_up_plot(plot_gaussian,
                        standardised_months[good_months],
                        variable,
                        sub_par="Months")

    # remove all months with a large standardised offset

    if len(good_months[0]) >= MONTH_LIMIT:

        standardised_months = np.ma.masked_values(standardised_months,
                                                  st_var.mdi)
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:

            for lo in large_offsets[0]:
                flags[month_ranges[lo, 0]:month_ranges[lo, 1]] = 1

            if plots:

                hist, binEdges = np.histogram(
                    standardised_months[large_offsets], bins=bins)
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters,
                         plot_hist,
                         'g-',
                         label='> %i' % LARGE_LIMIT,
                         where='mid',
                         zorder=5)

                plt.axvline(5, c='g')
                plt.axvline(-5, c='g')

        # walk distribution from centre and see if any asymmetry
        sort_order = standardised_months[good_months].argsort()
        sorted_months = standardised_months[good_months][sort_order]

        # floor division: the midpoint is used as an index, and '/' would
        # give a float on Python 3
        mid_point = len(good_months[0]) // 2

        # compare the values at equal steps either side of the midpoint;
        # stopping before step == mid_point keeps both indices in range
        step = 1
        while step < mid_point:

            lower = sorted_months[mid_point - step]
            upper = sorted_months[mid_point + step]

            if lower != upper:
                # using IDL notation
                tempvals = [np.abs(lower), np.abs(upper)]

                if min(tempvals) != 0:
                    if max(tempvals) / min(tempvals) >= 2. and min(
                            tempvals) >= 1.5:
                        # substantial asymmetry in distribution - at least 1.5 from centre and difference of 2.

                        if tempvals[0] == max(tempvals):
                            # LHS
                            bad = good_months[0][sort_order][:mid_point - step]
                            if plots:
                                badplot = sorted_months[:mid_point - step]
                        elif tempvals[1] == max(tempvals):
                            #RHS
                            bad = good_months[0][sort_order][mid_point + step:]
                            if plots:
                                badplot = sorted_months[mid_point + step:]

                        for b in bad:
                            flags[month_ranges[b, 0]:month_ranges[b, 1]] = 1

                        if plots:

                            hist, binEdges = np.histogram(badplot, bins=bins)
                            plot_hist = np.array(
                                [0.01 if h == 0 else h for h in hist])
                            plt.step(bincenters,
                                     plot_hist,
                                     'r-',
                                     label='Gap',
                                     where='mid',
                                     zorder=4)

                        # only the first asymmetric pair is acted upon
                        break

            step += 1

        if plots:
            plt.legend(loc='lower center',
                       ncol=4,
                       bbox_to_anchor=(0.5, -0.2),
                       frameon=False,
                       prop={'size': 13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap.png')

    return flags  # dgc_monthly
def dgc_set_up_plot(plot_gaussian,
                    standardised_months,
                    variable,
                    threshold=(1.5, -1.5),
                    sub_par="",
                    GH=False):
    '''
    Set up the histogram plot and overlay an already-computed Gaussian fit.

    The current figure is cleared, the histogram of the input values is drawn
    on a log y-axis, the supplied curve is plotted over it and two vertical
    lines are drawn at the threshold positions.  Nothing is shown or saved
    here - the caller adds any further layers and displays the figure.

    :param array plot_gaussian: fitted curve, evaluated at the fine bin
        centres (utils.create_bins with BIN_SIZE / 10.)
    :param array standardised_months: input array of months standardised by IQR
    :param str variable: label for title and axes
    :param tuple threshold: pair of x values at which to draw vertical lines
    :param str sub_par: sub-parameter for labels
    :param bool GH: not used in this routine; kept so that existing calls
        passing it continue to work
    :returns: None
    '''

    import matplotlib.pyplot as plt

    # coarse bins for the histogram, ten-times-finer centres for the curve
    bins, bincenters = utils.create_bins(standardised_months, BIN_SIZE)
    _, plot_bincenters = utils.create_bins(standardised_months,
                                           BIN_SIZE / 10.)

    # make the histogram; empty bins are lifted to 0.01 to allow a log y-scale
    hist, _ = np.histogram(standardised_months, bins=bins)
    plot_hist = np.where(hist == 0, 0.01, hist)

    plt.clf()
    plt.axes([0.1, 0.15, 0.8, 0.7])
    plt.step(bincenters,
             plot_hist,
             'k-',
             label='standardised months',
             where='mid')

    # The Gaussian / Gauss-Hermite fit used to be performed at this point;
    # the curve now arrives ready-made as plot_gaussian.
    plt.plot(plot_bincenters, plot_gaussian, 'b-', label='Gaussian fit')

    # sort the labels etc
    plt.xlabel("%s offset (IQR)" % variable)
    plt.ylabel("Frequency (%s)" % sub_par)
    plt.gca().set_yscale('log')
    plt.axvline(threshold[0], c='r')
    plt.axvline(threshold[1], c='r')
    # 'bottom' is accepted by old and new Matplotlib alike; the 'ymin' alias
    # was deprecated in Matplotlib 3.0 and later removed
    plt.ylim(bottom=0.1)
    plt.title("Distributional Gap Check - %s - %s" % (sub_par, variable))

    return  # dgc_set_up_plot
Exemple #11
0
def _fvc_season_slices(season, year):
    '''
    Return the index ranges that make up one season within one year.

    :param int season: 0 = whole year, 1 = MAM, 2 = JJA, 3 = SON, 4 = JF + D
    :param array year: (12, 2) array of start/end index pairs, one per month
    :returns: list of slice objects - two for season 4 (Jan-Feb and Dec of
        the same calendar year), otherwise one
    '''
    if season == 0:
        return [slice(year[0][0], year[-1][-1])]
    elif season == 1:
        return [slice(year[2][0], year[4][-1])]
    elif season == 2:
        return [slice(year[5][0], year[7][-1])]
    elif season == 3:
        return [slice(year[8][0], year[10][-1])]

    return [slice(year[0][0], year[1][-1]), slice(year[-1][0], year[-1][-1])]


def fvc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False):
    '''
    Check for certain values occurring more frequently than would be expected

    For each variable and each of five "seasons" (whole year, MAM, JJA, SON,
    JF+D) a histogram of the filtered data over the complete record is used
    to pick out candidate bins: local maxima of a seven-bin window that hold
    at least half of the window's observations and exceed a count threshold.
    Each year is then examined on its own, and observations falling in a
    candidate bin which also dominates that year's histogram are flagged in
    station.qc_flags.  Finally the flagged locations are copied into the
    variable's own flags attribute.

    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots
    '''

    if plots:
        # local import, as in the other checks, so plotting stays optional
        import matplotlib.pyplot as plt

    MIN_DATA_REQUIRED = 500 # to create histogram for complete record
    MIN_DATA_REQUIRED_YEAR = 100 # to create histogram

    month_ranges = utils.month_starts_in_pairs(start, end)

    month_ranges_years = month_ranges.reshape(-1,12,2)

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        flag_column = flag_col[v]

        # apply flags - for detection only (computed once, used twice)
        filtered_data = utils.apply_filter_flags(st_var)

        reporting_accuracy = utils.reporting_accuracy(filtered_data)

        for season in range(5): # Year,MAM,JJA,SON,JF+D

            if season == 0:
                # all year
                season_data = np.ma.masked_values(filtered_data.compressed(), st_var.fdi)
                thresholds = [30,20,10]

            else:
                thresholds = [20,15,10]

                # churn through the years extracting this season's months,
                # accounting for fdi; the leading empty array keeps the
                # result well-defined if there are no years at all
                pieces = [np.ma.array([])]
                for year in month_ranges_years:
                    for part in _fvc_season_slices(season, year):
                        pieces.append(np.ma.masked_values(filtered_data[part], st_var.fdi))

                season_data = np.ma.concatenate(pieces)

            season_data = season_data.compressed()

            if len(season_data) <= MIN_DATA_REQUIRED:
                continue

            if 0 < reporting_accuracy <= 0.5: # -1 used as missing value
                bins, bincenters = utils.create_bins(season_data, 0.5)
            else:
                bins, bincenters = utils.create_bins(season_data, 1.0)

            hist, binEdges = np.histogram(season_data, bins = bins)

            if plots:
                plot_hist, bincenters = fvc_plot_setup(season_data, hist, binEdges, st_var.name, title = "%s" % (SEASONS[season]))

            bad_bin = np.zeros(len(hist))

            # scan through bin values and identify bad ones
            for e in range(len(hist)):

                is_bad = False

                # don't bother with first three or last three bins
                # NOTE(review): this pass uses "e > 3" whereas the per-year
                # pass below uses "e >= 3"; left as found because changing it
                # would alter which observations are flagged - confirm intent
                if e > 3 and e <= (len(hist) - 3):
                    seven_bins = hist[e-3:e+3+1]
                    if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                        # is local maximum and != zero
                        if (seven_bins[3]/float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= thresholds[0]):
                            # contains >50% of data and is greater than threshold
                            is_bad = True

                if is_bad:
                    bad_bin[e] = 1
                elif plots:
                    # for plotting remove good bins
                    plot_hist[e] = 1e-1

            if plots:
                plt.step(bincenters, plot_hist, 'r-', where='mid')
                plt.show()

            # having identified possible bad bins, check each year in turn
            for y, year in enumerate(month_ranges_years):

                parts = _fvc_season_slices(season, year)

                year_data = np.ma.concatenate([np.ma.masked_values(st_var.data[part], st_var.fdi) for part in parts])
                year_values = year_data.compressed()

                if len(year_values) <= MIN_DATA_REQUIRED_YEAR:
                    # nothing can be flagged, so nothing to copy back either
                    continue

                # work on a copy of the flags; written back once done
                year_flags = np.concatenate([station.qc_flags[part, flag_column] for part in parts])

                year_hist, binEdges = np.histogram(year_values, bins = bins)

                if plots:
                    # NOTE(review): the data argument was missing from this
                    # call although the whole-record call above supplies it;
                    # aligned with that call - confirm against fvc_plot_setup
                    plot_hist, bincenters = fvc_plot_setup(year_values, year_hist, binEdges, st_var.name, title = "%s - %s" % (y+start.year, SEASONS[season]))

                for e in range(len(year_hist)):

                    flagged = False

                    # only look at pre-identified bins, and
                    # don't bother with first three or last three bins
                    if bad_bin[e] == 1 and e >= 3 and e <= (len(year_hist) - 3):
                        seven_bins = year_hist[e-3:e+3+1].astype('float')
                        if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                            # is local maximum and != zero
                            fraction = seven_bins[3]/seven_bins.sum()
                            if (fraction >= 0.5 and seven_bins[3] >= thresholds[1]) \
                                    or (fraction >= 0.9 and seven_bins[3] >= thresholds[2]):
                                # contains >50% or >90% of data and is greater than appropriate threshold

                                # Flag these data
                                bad_points = np.where((year_data >= binEdges[e]) & (year_data < binEdges[e+1]))
                                year_flags[bad_points] = 1
                                flagged = True

                    if plots and not flagged:
                        # for plotting remove good bins
                        plot_hist[e] = 1e-1

                if diagnostics or plots:
                    nflags = len(np.where(year_flags != 0)[0])
                    # single-argument call form prints identically on Python 2 and 3
                    print("{} {}".format(y + start.year, nflags))

                if plots:
                    if nflags > 0:
                        plt.step(bincenters, plot_hist, 'r-', where='mid')
                        plt.show()
                    else:
                        plt.clf()

                # copy flags back, one index range at a time
                position = 0
                for part in parts:
                    length = len(station.qc_flags[part, flag_column])
                    station.qc_flags[part, flag_column] = year_flags[position:position + length]
                    position += length

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]), noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Frequent Values Check")

    return # fvc
Exemple #12
0
def dgc_monthly(station, variable, flags, start, end, plots=False, diagnostics=False, idl = False):
    '''
    Original Distributional Gap Check

    Monthly averages are standardised per calendar month (climatology and
    spread taken from the filtered data), then
      1) every month whose standardised value is >= LARGE_LIMIT is flagged;
      2) the sorted distribution is walked outwards from its centre, and at
         the first pair of values showing substantial asymmetry all months
         further out on the larger side are flagged.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (updated in place)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (not used in this routine)
    :param bool idl: run IDL equivalent routines for median
    :returns:
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)

    # get monthly averages - both start out filled with the missing value
    month_average = np.empty(month_ranges.shape[0])
    month_average.fill(st_var.mdi)
    month_average_filtered = np.empty(month_ranges.shape[0])
    month_average_filtered.fill(st_var.mdi)

    all_filtered = utils.apply_filter_flags(st_var)
    for m, month in enumerate(month_ranges):

        data = st_var.data[month[0]:month[1]]
        filtered = all_filtered[month[0]:month[1]]

        month_average[m] = dgc_get_monthly_averages(data, OBS_LIMIT, st_var.mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(filtered, OBS_LIMIT, st_var.mdi, MEAN)

    # get overall monthly climatologies - use filtered data

    month_average = month_average.reshape(-1,12)
    month_average_filtered = month_average_filtered.reshape(-1,12)

    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(st_var.mdi)

    for m in range(12):

        valid_filtered = np.where(month_average_filtered[:,m] != st_var.mdi)

        if len(valid_filtered[0]) >= VALID_MONTHS:

            valid_data = month_average_filtered[valid_filtered,m][0]

            if MEAN:
                clim = np.mean(valid_data)
                # numpy has no "stdev" - the standard deviation is np.std
                spread = np.std(valid_data)

            else:
                if idl:
                    # valid_data is a plain ndarray, which has no
                    # .compressed() method; np.ma.compressed accepts both
                    # plain and masked arrays
                    clim = utils.idl_median(np.ma.compressed(valid_data).reshape(-1))
                else:
                    clim = np.median(valid_data)
                spread = utils.IQR(valid_data)
                if spread <= SPREAD_LIMIT:
                    spread = SPREAD_LIMIT

            standardised_months[valid_filtered,m] = (month_average[valid_filtered,m] - clim) / spread

    standardised_months = standardised_months.reshape(month_ranges.shape[0])

    good_months = np.where(standardised_months != st_var.mdi)

    # must be able to do this with masked arrays
    if plots:
        good_values = standardised_months[good_months]

        bins, bincenters = utils.create_bins(good_values, BIN_SIZE)
        _, plot_bincenters = utils.create_bins(good_values, BIN_SIZE/10.)

        hist, _ = np.histogram(good_values, bins = bins)

        fit = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(good_values), sig = np.std(good_values))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        dgc_set_up_plot(plot_gaussian, good_values, variable, sub_par = "Months")

    # remove all months with a large standardised offset

    if len(good_months[0]) >= MONTH_LIMIT:

        standardised_months = np.ma.masked_values(standardised_months, st_var.mdi)
        # NOTE(review): only the positive tail is tested here although the
        # plot marks both +/- limits - confirm whether abs() was intended
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:

            for lo in large_offsets[0]:
                flags[month_ranges[lo,0]:month_ranges[lo,1]] = 1

            if plots:

                hist, _ = np.histogram(standardised_months[large_offsets], bins = bins)
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters, plot_hist, 'g-', label = '> %i' % LARGE_LIMIT, where = 'mid', zorder = 5)

                # draw the guide lines at the limit quoted in the legend
                plt.axvline(LARGE_LIMIT,c='g')
                plt.axvline(-LARGE_LIMIT,c='g')

        # walk distribution from centre and see if any assymetry
        sort_order = standardised_months[good_months].argsort()

        # sorted values and their month indices, built once rather than on
        # every pass of the loop
        sorted_months = standardised_months[good_months][sort_order]
        sorted_locs = good_months[0][sort_order]

        # floor division: the result is used as an array index
        mid_point = len(good_months[0]) // 2

        step = 1
        while True:

            lower = sorted_months[mid_point - step]
            upper = sorted_months[mid_point + step]

            if lower != upper:
                # using IDL notation
                tempvals = [np.abs(lower), np.abs(upper)]

                if min(tempvals) != 0:
                    if max(tempvals)/min(tempvals) >= 2. and min(tempvals) >= 1.5:
                        # substantial asymmetry in distribution - at least 1.5 from centre and difference of 2.

                        if tempvals[0] == max(tempvals):
                            # LHS
                            bad = sorted_locs[:mid_point - step]
                            if plots: badplot = sorted_months[:mid_point - step]
                        elif tempvals[1] == max(tempvals):
                            #RHS
                            bad = sorted_locs[mid_point + step:]
                            if plots: badplot = sorted_months[mid_point + step:]

                        for b in bad:
                            flags[month_ranges[b,0]:month_ranges[b,1]] = 1

                        if plots:

                            hist, _ = np.histogram(badplot, bins = bins)
                            plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                            plt.step(bincenters, plot_hist, 'r-', label = 'Gap', where = 'mid', zorder = 4)

                        # a gap has been found and flagged - stop walking
                        break

            step += 1
            if step == mid_point: break

        if plots:
            plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap.png')

    return flags # dgc_monthly
Exemple #13
0
def dgc_all_obs(station,
                variable,
                flags,
                start,
                end,
                logfile,
                plots=False,
                diagnostics=False,
                idl=False,
                windspeeds=False,
                GH=False,
                doMonth=False):
    '''RJHD addition working on all observations'''

    if plots:
        import matplotlib.pyplot as plt

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    # extract variable
    st_var = getattr(station, variable)
    # apply flags (and mask incomplete year if appropriate)
    all_filtered = utils.apply_filter_flags(st_var,
                                            doMonth=doMonth,
                                            start=start,
                                            end=end)

    st_var_complete_year = copy.deepcopy(st_var)
    if doMonth:
        # restrict the incomplete year if appropriate - keep other flagged obs.
        full_year_end = utils.get_first_hour_this_year(start, end)
        st_var_complete_year.data.mask[full_year_end:] = True

    for month in range(12):

        # if requiring wind data, extract data and find monthly averages
        if windspeeds == True:
            st_var_wind = getattr(station, "windspeeds")

            if doMonth:
                # restrict the incomplete year if appropriate
                st_var_wind.data.mask[full_year_end:] = True

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):

                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        # pull data from each calendar month together
        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)
        this_month_complete, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var_complete_year.data, hours=False)

        # if enough clean and complete data for this calendar month find the median and IQR
        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(
                    this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(),
                                percentile=0.05)
                print "Spurious_stations file not yet sorted"

            # if have an IQR, anomalise using median and standardise using IQR
            if iqr != 0.0:

                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)
                complete_values = np.ma.array(
                    (this_month_complete.compressed() - monthly_median) / iqr)

                # use complete years only for the histogram - aiming to find outliers.
                bins, bincenters = utils.create_bins(complete_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(
                    complete_values, BIN_SIZE / 10.)
                hist, binEdges = np.histogram(complete_values, bins=bins)
                """
                Change to monthly updates Oct 2017
                Thought about changing distribution to use filtered values
                But this changes the test beyond just dealing with additional months
                Commented out lines below would be alternative.
                """
                # bins, bincenters = utils.create_bins(filtered_values, BIN_SIZE)
                # dummy, plot_bincenters = utils.create_bins(filtered_values, BIN_SIZE/10.)
                # hist, binEdges = np.histogram(filtered_values, bins = bins)

                # used filtered (incl. incomplete year mask) to determine the distribution.
                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    # Feb 2019 - if large amounts off centre, can affect initial values
                    # switched to median and MAD
                    initial_values = [
                        np.max(hist),
                        np.median(complete_values),
                        utils.mean_absolute_deviation(complete_values,
                                                      median=True),
                        stats.skew(complete_values),
                        stats.kurtosis(complete_values)
                    ]  # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist,
                                   np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    good_values = np.argwhere(
                        plot_gaussian > FREQUENCY_THRESHOLD)

                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(
                        plot_bincenters[good_values[-1]])

                    if diagnostics:
                        print hist
                        print res
                        print iqr, l_minimum_threshold, u_minimum_threshold

                # or just a standard Gaussian
                else:
                    gaussian = utils.fit_gaussian(
                        bincenters,
                        hist,
                        max(hist),
                        mu=np.median(complete_values),
                        sig=utils.mean_absolute_value(complete_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                    if diagnostics:
                        print hist
                        print gaussian
                        print iqr, u_minimum_threshold, 1. + utils.invert_gaussian(
                            FREQUENCY_THRESHOLD, gaussian)

                if plots:
                    dgc_set_up_plot(plot_gaussian,
                                    complete_values,
                                    variable,
                                    threshold=(u_minimum_threshold,
                                               l_minimum_threshold),
                                    sub_par="observations",
                                    GH=GH)

                    if GH:
                        plt.figtext(
                            0.15,
                            0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],
                             res['kurtosis']),
                            color='k',
                            size='small')

                # now trying to find gaps in the distribution
                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics: gap_plot_values = np.array([])

                # do one side of distribution and then other
                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             u_minimum_threshold)

                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            # not using filtered - checking all available data
                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.ma.where(
                                ((this_year_data - monthly_median) /
                                 iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Upper {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0]))
                                    print gap_cleaned_locations, this_year_data[
                                        gap_cleaned_locations]

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             l_minimum_threshold)

                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.ma.where(
                                np.logical_and(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start, this_year_data.mask != True))
                            # add flag requirement for low pressure bit if appropriate

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Lower {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0]))
                                    print gap_cleaned_locations, this_year_data[
                                        gap_cleaned_locations]

                            # if doing SLP then do extra checks for storms
                            if windspeeds:
                                windspeeds_year = np.ma.array(
                                    st_var_wind.data[year[0]:year[1]])

                                this_year_flags[
                                    gap_cleaned_locations] = 2  # tentative flags

                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                    MEAN)
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)

                                # need to ensure that this_year_data is less than slp_average, hence order of test
                                storms, = np.ma.where((((windspeeds_year - windspeeds_month_average) / windspeeds_month_mad) > MAD_THRESHOLD) &\
                                                   (((slp_average - this_year_data) / slp_mad) > MAD_THRESHOLD))

                                # using IDL terminology
                                if len(storms) >= 2:
                                    # use the first difference series to find when there are gaps in
                                    # contiguous sequences of storm observations - want to split up into
                                    # separate storm events
                                    storm_1diffs = np.diff(storms)
                                    separations, = np.where(storm_1diffs != 1)

                                    # expand around storm signal so that all low SLP values covered, and unflagged
                                    if len(separations) >= 1:
                                        print "  multiple storms in {} {}".format(
                                            y + start.year, month)

                                        # if more than one storm signal that month, then use intervals
                                        #    in the first difference series to expand around the first interval alone
                                        storm_start = 0
                                        storm_finish = separations[0] + 1
                                        first_storm = dgc_expand_storms(
                                            storms[storm_start:storm_finish],
                                            len(this_year_data))
                                        final_storms = copy.deepcopy(
                                            first_storm)

                                        for j in range(len(separations)):
                                            # then do the rest in a loop

                                            if j + 1 == len(separations):
                                                # final one
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                           1:],
                                                    len(this_year_data))
                                            else:
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                           1:separations[j +
                                                                         1] +
                                                           1],
                                                    len(this_year_data))

                                            final_storms = np.append(
                                                final_storms, this_storm)

                                    else:
                                        # else just expand around the signal by 6 hours either way
                                        final_storms = dgc_expand_storms(
                                            storms, len(this_year_data))

                                else:
                                    final_storms = storms

                                if len(storms) >= 1:
                                    print "Tropical Storm signal in {} {}".format(
                                        y + start.year, month)
                                    this_year_flags[final_storms] = 0

                            # and write flags back into array
                            flags[year[0]:year[1]] = this_year_flags

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             'r-',
                             label='flagged',
                             where='mid')
                    import calendar
                    plt.text(0.1,
                             0.9,
                             calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    plt.legend(loc='lower center',
                               ncol=3,
                               bbox_to_anchor=(0.5, -0.2),
                               frameon=False,
                               prop={'size': 13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    nflags, = np.where(flags != 0)
    utils.print_flagged_obs_number(logfile,
                                   "Distributional Gap All",
                                   variable,
                                   len(nflags),
                                   noWrite=diagnostics)

    return flags  # dgc_all_obs
Exemple #14
0
def _fvc_season_slices(year, season):
    '''
    Build the index slices that pick one season out of a single year.

    :param array year: (start, end) index pairs for the twelve months of one year
    :param int season: 0 = whole year, 1 = MAM, 2 = JJA, 3 = SON, 4 = JF+D
    :returns: list of slices - two for JF+D (Jan-Feb, then Dec), otherwise one
    '''
    if season == 0:
        return [slice(year[0][0], year[-1][-1])]

    if season == 4:
        # Jan-Feb and Dec of the same calendar year are not contiguous
        return [slice(year[0][0], year[1][-1]),
                slice(year[-1][0], year[-1][-1])]

    first_month = 3 * season - 1  # 2 (Mar), 5 (Jun) or 8 (Sep)
    return [slice(year[first_month][0], year[first_month + 2][-1])]


def _fvc_peak_fraction(hist, e, first_bin):
    '''
    Test whether bin e is a non-zero local maximum of the seven bins centred on it.

    :param array hist: histogram counts
    :param int e: index of the bin to test
    :param int first_bin: lowest bin index that is examined
    :returns: (count in bin e, fraction of the window total held by bin e),
              or None if the bin is outside the examined range or is not a
              non-zero local maximum
    '''
    if e < first_bin or e > (len(hist) - 3):
        return None

    seven_bins = hist[e - 3:e + 3 + 1]
    centre = seven_bins[3]

    if centre != seven_bins.max() or centre == 0:
        return None

    return centre, centre / float(seven_bins.sum())


def fvc(station,
        variable_list,
        flag_col,
        start,
        end,
        logfile,
        diagnostics=False,
        plots=False,
        doMonth=False):
    '''
    Check for certain values occurring more frequently than would be expected

    For each variable, a histogram of the filtered data is built for the
    whole record and for each season.  Bins which dominate their seven-bin
    neighbourhood are marked as suspect, then each year of unfiltered data is
    tested in those bins only and the observations falling in a dominating
    bin are flagged in station.qc_flags.  Finally the flags are copied to
    the variable's own flags attribute.

    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots
    :param bool doMonth: ignore months after last complete year/season for distribution
    :returns: None - flags are written into station.qc_flags and the variable flags
    '''

    MIN_DATA_REQUIRED = 500  # to create histogram for complete record
    MIN_DATA_REQUIRED_YEAR = 100  # to create histogram

    month_ranges = utils.month_starts_in_pairs(start, end)

    month_ranges_years = month_ranges.reshape(-1, 12, 2)

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        reporting_accuracy = utils.reporting_accuracy(
            utils.apply_filter_flags(st_var))

        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var,
                                                 doMonth=doMonth,
                                                 start=start,
                                                 end=end)

        for season in range(5):  # Year,MAM,JJA,SON,JF+D

            if season == 0:
                # all year
                season_data = np.ma.masked_values(filtered_data.compressed(),
                                                  st_var.fdi)
                thresholds = [30, 20, 10]

            else:
                thresholds = [20, 15, 10]
                season_data = np.ma.array([])

                # churn through years extracting data, accounting for fdi and concatenating together
                for year in month_ranges_years:
                    for season_slice in _fvc_season_slices(year, season):
                        season_data = np.ma.concatenate([
                            season_data,
                            np.ma.masked_values(filtered_data[season_slice],
                                                st_var.fdi)
                        ])

            season_data = season_data.compressed()

            if len(season_data) <= MIN_DATA_REQUIRED:
                # not enough data to build a histogram for this season
                continue

            # -1 used as missing value for the reporting accuracy
            bin_width = 0.5 if 0 < reporting_accuracy <= 0.5 else 1.0
            bins, bincenters = utils.create_bins(season_data, bin_width)

            hist, binEdges = np.histogram(season_data, bins=bins)

            if plots:
                plot_hist, bincenters = fvc_plot_setup(season_data,
                                                       hist,
                                                       binEdges,
                                                       st_var.name,
                                                       title="%s" %
                                                       (SEASONS[season]))

            bad_bin = np.zeros(len(hist))

            # scan through bin values and identify bad ones
            # NOTE(review): this pass starts at bin 4 whereas the per-year
            #   pass below starts at bin 3 - kept as found, confirm intended
            for e in range(len(hist)):
                peak = _fvc_peak_fraction(hist, e, 4)

                if peak is not None and peak[1] >= 0.5 and \
                        peak[0] >= thresholds[0]:
                    # local maximum holding >=50% of window and above threshold
                    bad_bin[e] = 1
                elif plots:
                    # for plotting remove good bins
                    plot_hist[e] = 1e-1

            if plots:
                import matplotlib.pyplot as plt
                plt.step(bincenters, plot_hist, 'r-', where='mid')
                plt.show()

            # having identified possible bad bins, check each year in turn, on unfiltered data
            for y, year in enumerate(month_ranges_years):

                slices = _fvc_season_slices(year, season)
                data_parts = [
                    np.ma.masked_values(st_var.data[season_slice], st_var.fdi)
                    for season_slice in slices
                ]
                flag_parts = [
                    station.qc_flags[season_slice, flag_col[v]]
                    for season_slice in slices
                ]

                if len(slices) == 1:
                    year_data = data_parts[0]
                    year_flags = flag_parts[0]
                else:
                    # JF+D - join the two stretches into one series
                    year_data = np.ma.concatenate(data_parts)
                    year_flags = np.append(flag_parts[0], flag_parts[1])

                year_values = year_data.compressed()

                if len(year_values) > MIN_DATA_REQUIRED_YEAR:

                    hist, binEdges = np.histogram(year_values, bins=bins)

                    if plots:
                        plot_hist, bincenters = fvc_plot_setup(
                            year_values,
                            hist,
                            binEdges,
                            st_var.name,
                            title="%s - %s" %
                            (y + start.year, SEASONS[season]))

                    for e in range(len(hist)):

                        # only look at pre-identified bins
                        peak = None
                        if bad_bin[e] == 1:
                            peak = _fvc_peak_fraction(hist, e, 3)

                        # contains >=50% or >=90% of window and is at least the
                        #   appropriate threshold
                        if peak is not None and (
                                (peak[1] >= 0.5 and peak[0] >= thresholds[1])
                                or
                                (peak[1] >= 0.9 and peak[0] >= thresholds[2])):

                            # Flag these data
                            bad_points = np.where(
                                (year_data >= binEdges[e]) &
                                (year_data < binEdges[e + 1]))
                            year_flags[bad_points] = 1

                        elif plots:
                            # for plotting remove good bins
                            plot_hist[e] = 1e-1

                    if diagnostics or plots:
                        nflags = len(np.where(year_flags != 0)[0])
                        # single-argument call prints identically on Python 2 and 3
                        print("{} {}".format(y + start.year, nflags))

                    if plots:
                        if nflags > 0:
                            plt.step(bincenters,
                                     plot_hist,
                                     'r-',
                                     where='mid')
                            plt.show()
                        else:
                            plt.clf()

                # copy flags back, walking along year_flags one stretch at a time
                offset = 0
                for season_slice, part in zip(slices, flag_parts):
                    station.qc_flags[season_slice, flag_col[v]] = \
                        year_flags[offset:offset + len(part)]
                    offset += len(part)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile,
                                       "Frequent Value",
                                       variable,
                                       len(flag_locs[0]),
                                       noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Frequent Values Check")

    return  # fvc
Exemple #15
0
def _read_gap_thresholds(config_file, var_name, month):
    """
    Read the stored upper and lower thresholds for one calendar month.

    :param str config_file: configuration file holding the critical values
    :param str var_name: name of the variable (selects the config section)
    :param int month: calendar month to read
    :returns: (upper_threshold, lower_threshold) as floats
    """
    section = "ADISTRIBUTION-{}".format(var_name)

    upper_threshold = float(
        utils.read_qc_config(config_file, section,
                             "{}-uthresh".format(month)))
    lower_threshold = float(
        utils.read_qc_config(config_file, section,
                             "{}-lthresh".format(month)))

    return upper_threshold, lower_threshold


def _unset_storm_flags(month_flags, month_locs, obs_var, station, month):
    """
    Remove "d" flags from low pressure observations which coincide with high
    winds, as these could be a storm signal.  Works year by year within the
    calendar month and modifies month_flags in place.

    :param array month_flags: flags for this calendar month (all years)
    :param array month_locs: locations of this calendar month in the full record
    :param MetVar obs_var: pressure variable object
    :param Station station: station object
    :param int month: calendar month being processed
    """
    wind_monthly_data = prepare_monthly_data(station.wind_speed, station,
                                             month)
    pressure_monthly_data = prepare_monthly_data(obs_var, station, month)

    if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
            len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
        # need sufficient data to work with for storm check to work, else can't tell
        return

    wind_monthly_average = utils.average(wind_monthly_data)
    wind_monthly_spread = utils.spread(wind_monthly_data)

    pressure_monthly_average = utils.average(pressure_monthly_data)
    pressure_monthly_spread = utils.spread(pressure_monthly_data)

    # already a single calendar month, so go through each year
    for year in np.unique(station.years):

        this_year_locs, = np.where(station.years[month_locs] == year)

        if "d" not in month_flags[this_year_locs]:
            # skip if you get the chance
            continue

        wind_data = station.wind_speed.data[month_locs][this_year_locs]
        pressure_data = obs_var.data[month_locs][this_year_locs]

        # order of the pressure difference ensures only low pressures qualify
        storms, = np.ma.where(
            np.logical_and(
                ((wind_data - wind_monthly_average) / wind_monthly_spread) >
                STORM_THRESHOLD,
                ((pressure_monthly_average - pressure_data) /
                 pressure_monthly_spread) > STORM_THRESHOLD))

        if len(storms) == 0:
            continue

        if len(storms) >= 2:
            # more than one entry - check if separate events
            # NOTE(review): index gaps are compared with the median
            #   difference of the wind *values* - presumably the usual obs
            #   separation was intended; kept as found, confirm
            storm_1diffs = np.ma.diff(storms)
            separations, = np.where(
                storm_1diffs > np.ma.median(np.ma.diff(wind_data)))
        else:
            separations = np.array([], dtype=int)

        # one segment per storm event (a single segment if no separations);
        #  expand around each signal rather than just unflagging what could
        #  be the peak and leaving the entry/exit flagged
        segments = np.split(storms, separations + 1)
        final_storm_locs = expand_around_storms(segments[0], len(wind_data))
        for segment in segments[1:]:
            final_storm_locs = np.append(
                final_storm_locs,
                expand_around_storms(segment, len(wind_data)))

        # unset the flags.  The indices are composed first because chaining
        #  two advanced-indexing steps would assign into a temporary copy
        #  and leave month_flags untouched.
        month_flags[this_year_locs[final_storm_locs]] = ""


def all_obs_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for month and find secondary populations in distribution.

    For every calendar month the normalised anomalies are histogrammed and
    the first gap beyond the stored upper/lower threshold is located;
    observations further out than the gap are flagged with "d".  For
    pressure variables, flagged low values which coincide with high winds
    are unflagged again as possible storms.  The flags are finally inserted
    into obs_var.flags.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array([""] * obs_var.data.shape[0])

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var,
                                                station,
                                                month,
                                                config_file,
                                                full=False,
                                                diagnostics=diagnostics)

        if (len(normalised_anomalies.compressed()) == 1
                and normalised_anomalies[0] == utils.MDI):
            # no data to work with for this month, move on.
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, _ = np.histogram(normalised_anomalies, bins)

        try:
            upper_threshold, lower_threshold = _read_gap_thresholds(
                config_file, obs_var.name, month)
        except KeyError:
            # thresholds not stored yet - calculate them, then read again
            print("Information missing in config file")
            find_thresholds(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            upper_threshold, lower_threshold = _read_gap_thresholds(
                config_file, obs_var.name, month)

        if upper_threshold == utils.MDI and lower_threshold == utils.MDI:
            # these weren't able to be calculated, move on
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            continue

        # now to find the gaps
        uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
        lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

        month_locs, = np.where(
            station.months == month)  # append should keep year order

        if uppercount > 0:
            gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies >
                                        gap_start)  # all years for one month

                # indexing with month_locs gives a copy, so write it back
                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"
                flags[month_locs] = month_flags

        if lowercount > 0:
            gap_start = utils.find_gap(hist,
                                       bins,
                                       lower_threshold,
                                       GAP_SIZE,
                                       upwards=False)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies <
                                        gap_start)  # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"

                # for pressure data, see if the flagged obs correspond with high winds
                # could be a storm signal
                if obs_var.name in [
                        "station_level_pressure", "sea_level_pressure"
                ]:
                    _unset_storm_flags(month_flags, month_locs, obs_var,
                                       station, month)

                # having checked for storms now store final flags
                flags[month_locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])

            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")

            # overlay the distribution of the flagged observations
            bad_locs, = np.where(flags[month_locs] == "d")
            bad_hist, dummy = np.histogram(normalised_anomalies[bad_locs],
                                           bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Distribution (all) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # all_obs_gap
Exemple #16
0
def find_thresholds(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Find and store per-month distribution thresholds from a skewed Gaussian fit.

    For each calendar month (1-12) the normalised anomalies are histogrammed
    and a skewed Gaussian is fitted.  The thresholds are the bin values either
    side of the fitted peak where the curve drops below FREQUENCY_THRESHOLD;
    they are written to the config file as "<month>-uthresh" and
    "<month>-lthresh".  utils.MDI is written for both when the month cannot
    be scaled or contains only one distinct value.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    section = "ADISTRIBUTION-{}".format(obs_var.name)

    def store_thresholds(month, upper, lower):
        # upper threshold is written first, then lower
        for suffix, value in (("uthresh", upper), ("lthresh", lower)):
            utils.write_qc_config(config_file,
                                  section,
                                  "{}-{}".format(month, suffix),
                                  "{}".format(value),
                                  diagnostics=diagnostics)

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var,
                                                station,
                                                month,
                                                config_file,
                                                full=True,
                                                diagnostics=diagnostics)

        # a single unmasked value equal to MDI: scaling not possible for this month
        unscalable = (len(normalised_anomalies.compressed()) == 1
                      and normalised_anomalies[0] == utils.MDI)
        # all the same value: won't be able to fit a histogram
        if unscalable or len(np.unique(normalised_anomalies)) == 1:
            store_thresholds(month, utils.MDI, utils.MDI)
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, _ = np.histogram(normalised_anomalies, bins)
        upper_edges = bins[1:]

        gaussian_fit = utils.fit_gaussian(
            upper_edges,
            hist,
            max(hist),
            mu=bins[np.argmax(hist)],
            sig=utils.spread(normalised_anomalies),
            skew=skew(normalised_anomalies.compressed()))

        fitted_curve = utils.skew_gaussian(upper_edges, gaussian_fit)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.step(upper_edges, hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.plot(upper_edges, fitted_curve)
            plt.ylim([0.1, max(hist) * 2])

        # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
        below_threshold = fitted_curve < FREQUENCY_THRESHOLD
        peak = bins[np.argmax(fitted_curve)]

        try:
            # last qualifying bin below the peak
            lower_threshold = upper_edges[np.where(
                np.logical_and(below_threshold, upper_edges < peak))[0]][-1]
        except IndexError:
            # curve never drops below the threshold on this side
            lower_threshold = bins[1]

        if len(np.unique(fitted_curve)) == 1:
            # just a line of zeros perhaps (found on AFA00409906 station_level_pressure 20190913)
            upper_threshold = bins[-1]
        else:
            try:
                # first qualifying bin above the peak
                upper_threshold = upper_edges[np.where(
                    np.logical_and(below_threshold, upper_edges > peak))[0]][0]
            except IndexError:
                # curve never drops below the threshold on this side
                upper_threshold = bins[-1]

        if plots:
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")
            plt.show()

        store_thresholds(month, upper_threshold, lower_threshold)

    return  # find_thresholds
Exemple #17
0
def monthly_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use distribution of monthly averages to identify asymmetries.

    For each calendar month the monthly averages are standardised with the
    climatology and spread stored in the config file (these are calculated
    via find_monthly_scaling if missing).  Months whose standardised offset
    is at least LARGE_LIMIT are flagged, and the sorted distribution is then
    walked outwards from its centre; when one tail sits at least twice as
    far out as the other (and the nearer is at least 1.5 from the centre),
    everything from that point outwards in the longer tail is flagged.
    Flags ("D") are set on every observation in the affected year-months.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for _ in range(obs_var.data.shape[0])])
    all_years = np.unique(station.years)
    section = "MDISTRIBUTION-{}".format(obs_var.name)

    def read_scaling(month):
        # climatology is read before spread; a missing entry raises KeyError
        climatology = float(
            utils.read_qc_config(config_file, section,
                                 "{}-clim".format(month)))
        spread = float(
            utils.read_qc_config(config_file, section,
                                 "{}-spread".format(month)))
        return climatology, spread

    def flag_year_months(month, year_indices):
        # follow flag locations back up through the process
        for year_index in year_indices:
            # year ID for this set of calendar months
            locs, = np.where(
                np.logical_and(station.months == month,
                               station.years == all_years[year_index]))
            flags[locs] = "D"

    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var,
                                              station,
                                              month,
                                              diagnostics=diagnostics)

        # read in the scaling
        try:
            climatology, spread = read_scaling(month)
        except KeyError:
            print("Information missing in config file")
            find_monthly_scaling(obs_var,
                                 station,
                                 config_file,
                                 diagnostics=diagnostics)
            climatology, spread = read_scaling(month)

        if climatology == utils.MDI and spread == utils.MDI:
            # these weren't calculable, move on
            continue

        standardised_months = (month_averages - climatology) / spread

        bins = utils.create_bins(standardised_months, BIN_WIDTH, obs_var.name)
        hist, _ = np.histogram(standardised_months, bins)

        # flag months with very large offsets
        large_offsets, = np.where(np.abs(standardised_months) >= LARGE_LIMIT)
        flag_year_months(month, large_offsets)

        # walk distribution from centre to find asymmetry
        sort_order = standardised_months.argsort()
        # sort once rather than re-indexing on every step of the walk
        sorted_months = standardised_months[sort_order]
        n_months = sorted_months.shape[0]
        mid_point = n_months // 2
        bad = []
        step = 1
        # bounds are checked before indexing so that fewer than three months
        #   ends the walk immediately instead of raising IndexError
        while (mid_point - step) >= 0 and (mid_point + step) < n_months:

            lower_value = sorted_months[mid_point - step]
            upper_value = sorted_months[mid_point + step]

            if lower_value != upper_value:

                suspect_months = [np.abs(lower_value), np.abs(upper_value)]

                if min(suspect_months) != 0:
                    # not all clustered at origin

                    if max(suspect_months) / min(suspect_months) >= 2. and min(
                            suspect_months) >= 1.5:
                        # at least 1.5x spread from centre and difference of two in location (longer tail)
                        # flag everything further from this bin for that tail
                        if suspect_months[0] == max(suspect_months):
                            # LHS has issue (remember that have removed the sign)
                            # +1 so the slice includes the offending entry itself
                            bad = sort_order[:mid_point - step + 1]
                        elif suspect_months[1] == max(suspect_months):
                            # RHS has issue
                            bad = sort_order[mid_point + step:]
                        # asymmetry found, stop walking
                        break

            step += 1

        flag_year_months(month, bad)

        if plots:
            import matplotlib.pyplot as plt

            plt.step(bins[1:], hist, color='k', where="pre")
            if len(bad) > 0:
                bad_hist, dummy = np.histogram(standardised_months[bad], bins)
                plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.ylabel("Number of Months")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Distribution (monthly) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # monthly_gap
Exemple #18
0
def _longest_monotonic_runs(diffs):
    """
    Count the longest runs of sequential negative and positive differences.

    A difference extends the positive run only when it is > 0; anything else
    (zero included) extends the negative run.

    :param array diffs: first differences of the series
    :returns: (biggest_neg, biggest_pos) longest negative and positive run lengths
    """
    negs, poss = 0, 0
    biggest_neg, biggest_pos = 0, 0

    for diff in diffs:

        if diff > 0:
            biggest_neg = max(biggest_neg, negs)
            negs = 0
            poss += 1
        else:
            biggest_pos = max(biggest_pos, poss)
            poss = 0
            negs += 1

    # the run still open when the data end has to be counted as well
    return max(biggest_neg, negs), max(biggest_pos, poss)


def variance_check(obs_var,
                   station,
                   config_file,
                   plots=False,
                   diagnostics=False,
                   winsorize=True):
    """
    Flag year-months whose variance is far from the stored monthly average.

    For each calendar month the per-year variances are compared with the
    average and spread held in the config file (find_thresholds is called if
    these are missing).  Years exceeding SPREAD_THRESHOLD are flagged "V".
    For pressure and wind speed variables a flag is withheld when the month
    could contain a storm: either high winds coincide with low pressures, or
    the series has a run of 10 or more sequential rises or falls.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    pressure_names = ("station_level_pressure", "sea_level_pressure")
    storm_names = pressure_names + ("wind_speed",)
    needs_storm_check = obs_var.name in storm_names
    is_pressure = obs_var.name in pressure_names

    section = "VARIANCE-{}".format(obs_var.name)

    def read_scaling(month):
        # average is read before spread; a missing entry raises KeyError
        average = float(
            utils.read_qc_config(config_file, section,
                                 "{}-average".format(month)))
        spread = float(
            utils.read_qc_config(config_file, section,
                                 "{}-spread".format(month)))
        return average, spread

    flags = np.array(["" for _ in range(obs_var.data.shape[0])])
    all_years = np.unique(station.years)

    # get hourly climatology for each month
    for month in range(1, 13):
        month_locs, = np.where(station.months == month)

        variances = prepare_data(obs_var,
                                 station,
                                 month,
                                 diagnostics=diagnostics,
                                 winsorize=winsorize)

        try:
            average_variance, variance_spread = read_scaling(month)
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            average_variance, variance_spread = read_scaling(month)

        if average_variance == utils.MDI and variance_spread == utils.MDI:
            # couldn't be calculated, move on
            continue

        bad_years, = np.where(
            np.abs(variances - average_variance) /
            variance_spread > SPREAD_THRESHOLD)

        # prepare wind and pressure data in case needed to check for storms
        if needs_storm_check:
            wind_monthly_data = station.wind_speed.data[month_locs]
            if is_pressure:
                pressure_monthly_data = obs_var.data[month_locs]
            else:
                pressure_monthly_data = station.sea_level_pressure.data[
                    month_locs]

            if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                    len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                # need sufficient data to work with for storm check to work, else can't tell
                #    move on
                continue

            wind_average = utils.average(wind_monthly_data)
            wind_spread = utils.spread(wind_monthly_data)

            pressure_average = utils.average(pressure_monthly_data)
            pressure_spread = utils.spread(pressure_monthly_data)

        # go through each bad year for this month
        for year in bad_years:

            # corresponding locations
            ym_locs, = np.where(
                np.logical_and(station.months == month,
                               station.years == all_years[year]))

            # if pressure or wind speed, need to do some further checking before applying flags
            if needs_storm_check:

                # pull out the data
                wind_data = station.wind_speed.data[ym_locs]
                if is_pressure:
                    pressure_data = obs_var.data[ym_locs]
                else:
                    pressure_data = station.sea_level_pressure.data[ym_locs]

                # need sufficient data to work with for storm check to work, else can't tell
                if len(pressure_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                        len(wind_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                    # move on
                    continue

                # find locations of high wind speeds and low pressures, cross match
                high_winds, = np.ma.where(
                    (wind_data - wind_average) / wind_spread > STORM_THRESHOLD)
                low_pressures, = np.ma.where(
                    (pressure_average - pressure_data) /
                    pressure_spread > STORM_THRESHOLD)

                # in1d gives one boolean per high-wind location, so test the
                #   values: its length only says that high winds exist
                match = np.in1d(high_winds, low_pressures)

                # a coincident high wind and low pressure could be a storm,
                # either at tropical station (relatively constant pressure)
                # or out of season in mid-latitudes.
                couldbe_storm = bool(np.any(match))

                diffs = np.ma.diff(pressure_data if is_pressure else wind_data)

                # count up the largest number of sequential negative and positive differences
                biggest_neg, biggest_pos = _longest_monotonic_runs(diffs)

                if couldbe_storm or biggest_neg >= 10 or biggest_pos >= 10:
                    # could be a storm (run length of 10 is the HadISD value),
                    # so better to leave this month unflagged
                    continue

                # otherwise insufficient to identify as a storm, flags get set

            # copy over the flags, if any
            if len(ym_locs) != 0:
                # and set the flags
                flags[ym_locs] = "V"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            scaled_variances = ((variances - average_variance) /
                                variance_spread)
            bins = utils.create_bins(scaled_variances, 0.25, obs_var.name)
            hist, bin_edges = np.histogram(scaled_variances, bins)

            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Months")
            plt.xlabel("Scaled {} Variances".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(SPREAD_THRESHOLD, c="r")
            plt.axvline(-SPREAD_THRESHOLD, c="r")

            bad_hist, dummy = np.histogram(scaled_variances[bad_years], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Variance {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # variance_check
Exemple #19
0
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    '''
    Climatological outlier check on hourly anomalies for each variable.

    For every variable and calendar month, hourly climatologies are built
    from winsorized data, the observations are converted to anomalies and
    normalised by half the IQR (minimum 1.5), low-pass filtered, and a
    Gaussian is fitted to their histogram to derive a threshold.  Gaps in the
    distribution beyond that threshold are located and flags written into
    station.qc_flags[:, flag_col[v]]; the summary output reports value 1 as
    "Firm Clim" and value 2 as "Tentative Clim".  Any non-zero flag sets the
    variable's own flags attribute to 1.

    :param Station station: station object holding the variables and qc_flags
    :param list variable_list: names of the station attributes to process
    :param list flag_col: column of station.qc_flags to use for each variable
    :param start: start of the data period (passed to utils.month_starts_in_pairs)
    :param end: end of the data period (passed to utils.month_starts_in_pairs)
    :param file logfile: log file to write the flag counts to
    :param bool diagnostics: turn on diagnostic output
    :param bool plots: turn on plots
    :param bool idl: reproduce the IDL calculations (winsorizing and median)
    :returns:
    '''

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)

        # is this needed 13th Nov 2014 RJHD
        #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1,12,2)

        for month in range(12):

            hourly_climatologies = np.zeros(24)
            hourly_climatologies.fill(st_var.mdi)

            # append all e.g. Januaries together

            this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], all_filtered, hours = True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs.
            this_month = np.ma.array(this_month)
            this_month = this_month.reshape(-1,24)

            this_month_filtered = np.ma.array(this_month_filtered)
            this_month_filtered = this_month_filtered.reshape(-1,24)

            # get hourly climatology for each month
            for hour in range(24):

                this_hour = this_month[:,hour]

                # need to have data if this is going to work!
                if len(this_hour.compressed()) > 0:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.sum(this_hour)/(len(this_hour) - 1)

                    else:
                        this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.mean(this_hour)

            if len(this_month.compressed()) > 0:
                # can get stations with few obs in a particular variable.

                # anomalise each hour over month appropriately

                anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0],1))
                anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0],1))

                if len(anomalies.compressed()) >= 10:
                    iqr = utils.IQR(anomalies.compressed().reshape(-1))/2.  # to match IDL
                    if iqr < 1.5: iqr = 1.5
                else:
                    iqr = st_var.mdi

                normed_anomalies = anomalies / iqr
                normed_anomalies_filtered = anomalies_filtered / iqr

                # get average anomaly for year
                year_ids = np.array(year_ids)
                monthly_vqvs = np.ma.zeros(month_ranges.shape[0])
                monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])]
                for year in range(month_ranges.shape[0]):
                    year_locs = np.where(year_ids == year)
                    this_year = normed_anomalies_filtered[year_locs,:]

                    if len(this_year.compressed()) > 0:
                        # need to have data for this to work!
                        if idl:
                            monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1))
                        else:
                            monthly_vqvs[year] = np.ma.median(this_year)
                    else:
                        monthly_vqvs.mask[year] = True

                # low pass filter
                normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0])

                # copy from distributional_gap.py - refactor!
                # get the threshold value
                bins, bincenters = utils.create_bins(normed_anomalies, 1.)

                hist, binEdges = np.histogram(normed_anomalies, bins = bins)

                gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(normed_anomalies), sig = np.std(normed_anomalies))
                # evaluate the inverted fit once; reused for the diagnostic output
                raw_threshold = 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)
                minimum_threshold = round(raw_threshold)

                if diagnostics:
                    # print() with a single argument behaves the same on Python 2 and 3
                    print("{} {} {}".format(iqr, minimum_threshold, raw_threshold))
                    print(gaussian)
                    print(hist)

                if plots:
                    coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = minimum_threshold, sub_par = "observations")

                these_flags = station.qc_flags[:, flag_col[v]]
                gap_plot_values, tentative_plot_values = [], []

                # find the gaps and apply the flags

                gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size = 1) # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size = 1) # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                station.qc_flags[:, flag_col[v]] = these_flags

                # TODO: stations with more than 1000 observations beyond the
                #   threshold ("spurious stations") are not treated specially yet

                if plots:
                    import matplotlib.pyplot as plt
                    hist, binEdges = np.histogram(tentative_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist]) # allow for log y-scale
                    plt.step(bincenters, plot_hist, c='orange', ls='-', label = 'tentative', where='mid')

                    hist, binEdges = np.histogram(gap_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist]) # allow for log y-scale
                    plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid')
                    import calendar
                    # month is zero-based here, calendar.month_name is one-based
                    plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes)
                    leg=plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13},labelspacing=0.15,columnspacing=0.5)
                    plt.setp(leg.get_title(), fontsize=14)
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite = True)
            print("where\n")
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags, noWrite = True)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags, noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]))
            logfile.write("where\n")
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags)

        # firm flags match 030220
    station = utils.append_history(station, "Climatological Check")

    return
Exemple #20
0
def find_month_thresholds(obs_var,
                          station,
                          config_file,
                          plots=False,
                          diagnostics=False,
                          winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    For each calendar month (1-12) a Gaussian is fitted to the histogram of
    normalised anomalies.  The thresholds are the bin values either side of
    zero where the fitted curve drops below FREQUENCY_THRESHOLD, written to
    the config file as "<month>-uthresh" and "<month>-lthresh".  Months with
    fewer than utils.DATA_COUNT_THRESHOLD unmasked values are skipped and
    nothing is written for them.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    section = "CLIMATOLOGICAL-{}".format(obs_var.name)

    # get hourly climatology for each month
    for month in range(1, 13):

        normalised_anomalies = prepare_data(obs_var,
                                            station,
                                            month,
                                            diagnostics=diagnostics,
                                            winsorize=winsorize)

        if len(normalised_anomalies.compressed()
               ) < utils.DATA_COUNT_THRESHOLD:
            # too little data to fit a distribution, nothing stored
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH,
                                 obs_var.name)
        hist, _ = np.histogram(normalised_anomalies.compressed(), bins)
        upper_edges = bins[1:]

        gaussian_fit = utils.fit_gaussian(
            upper_edges,
            hist,
            max(hist),
            mu=bins[np.argmax(hist)],
            sig=utils.spread(normalised_anomalies))

        fitted_curve = utils.gaussian(upper_edges, gaussian_fit)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.step(upper_edges, hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.plot(upper_edges, fitted_curve)
            plt.ylim([0.1, max(hist) * 2])

        # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
        below_threshold = fitted_curve < FREQUENCY_THRESHOLD

        try:
            # last qualifying bin on the negative side
            lower_threshold = upper_edges[np.where(
                np.logical_and(below_threshold, upper_edges < 0))[0]][-1]
        except IndexError:
            # curve never drops below the threshold on this side
            lower_threshold = bins[1]
        try:
            # first qualifying bin on the positive side
            upper_threshold = upper_edges[np.where(
                np.logical_and(below_threshold, upper_edges > 0))[0]][0]
        except IndexError:
            # curve never drops below the threshold on this side
            upper_threshold = bins[-1]

        if plots:
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")
            plt.show()

        # upper threshold is written first, then lower
        for suffix, value in (("uthresh", upper_threshold),
                              ("lthresh", lower_threshold)):
            utils.write_qc_config(config_file,
                                  section,
                                  "{}-{}".format(month, suffix),
                                  "{}".format(value),
                                  diagnostics=diagnostics)

    return  # find_month_thresholds
Exemple #21
0
def dgc_all_obs(station, variable, flags, start, end, plots = False, diagnostics = False, idl = False, windspeeds = False, GH = False):
    '''
    Distributional gap check working on all observations (RJHD addition).

    For each calendar month the observations from all years are standardised
    by the monthly median and IQR of the filtered data, a Gaussian (or
    Gauss-Hermite) curve is fitted to their histogram to obtain threshold
    values, and observations beyond a gap in the histogram outside those
    thresholds are flagged.

    :param object station: station object; the variable is read from it with getattr
    :param str variable: name of the station attribute to process
    :param array flags: flag array, updated in place (set to 1 at flagged locations)
    :param start: start of the data period, passed to utils.month_starts_in_pairs
    :param end: end of the data period, passed to utils.month_starts_in_pairs
    :param bool plots: show a diagnostic plot for each processed month
    :param bool diagnostics: print diagnostic output
    :param bool idl: use utils.idl_median rather than np.ma.median for the monthly median
    :param bool windspeeds: also read the "windspeeds" attribute (storm handling is unfinished)
    :param bool GH: fit Gauss-Hermite polynomials instead of a plain Gaussian
    :returns: the updated flags array
    '''

    if plots:
        import calendar
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1,12,2)

    all_filtered = utils.apply_filter_flags(st_var)

    # running total over all months - always defined, so the final diagnostic
    # cannot fail when no month had enough data to be processed
    total_gap_values = 0
    collect_gap_values = plots or diagnostics

    for month in range(12):

        # plain truth test, to agree with the "if windspeeds:" test further down
        if windspeeds:
            st_var_wind = getattr(station, "windspeeds")

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:,month,:]):

                if y == 0:
                    windspeeds_month = np.ma.array(st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate([windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(windspeeds_month, median=True)

        this_month_data, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], all_filtered, hours = False)

        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(), percentile = 0.05)

                print("Spurious_stations file not yet sorted")

            if iqr != 0.0:
                monthly_values = np.ma.array((this_month_data.compressed() - monthly_median) / iqr)

                bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(monthly_values, BIN_SIZE/10.)

                hist, binEdges = np.histogram(monthly_values, bins = bins)

                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    initial_values = [np.max(hist), np.mean(monthly_values), np.std(monthly_values), stats.skew(monthly_values), stats.kurtosis(monthly_values)] # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics = diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD/10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][bad[0]:] = FREQUENCY_THRESHOLD/10.

                    bad, = np.where(plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD/10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[-1]] = FREQUENCY_THRESHOLD/10.

                    # extract threshold values
                    good_values = np.argwhere(plot_gaussian > FREQUENCY_THRESHOLD)

                    l_minimum_threshold = round(plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(plot_bincenters[good_values[-1]])

                else:
                    gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(monthly_values), sig = np.std(monthly_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                if diagnostics:
                    # single-argument print with %s gives the same text as the
                    # print statement and parses under both Python 2 and 3
                    print(hist)
                    if GH:
                        print(res)
                        print("%s %s %s" % (iqr, l_minimum_threshold, u_minimum_threshold))
                    else:
                        print(gaussian)
                        print("%s %s %s" % (iqr, u_minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)))

                if plots:
                    dgc_set_up_plot(plot_gaussian, monthly_values, variable, threshold = (u_minimum_threshold, l_minimum_threshold), sub_par = "observations", GH = GH)

                    if GH:
                        plt.figtext(0.15, 0.67, 'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %(res['mean'], res['dispersion'], res['skewness'], res['kurtosis']), color='k', size='small')

                uppercount = len(np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if collect_gap_values:
                    gap_plot_values = np.array([])

                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold)

                    if gap_start != 0:

                        for y, year in enumerate(month_ranges[:,month,:]):

                            this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(((this_year_data - monthly_median) / iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if collect_gap_values:
                                gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold)

                    if gap_start != 0:

                        for y, year in enumerate(month_ranges[:,month,:]):

                            this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            # masked values are excluded explicitly on the lower side
                            gap_cleaned_locations = np.where(np.logical_and(((this_year_data - monthly_median) / iqr) < gap_start, this_year_data.mask != True))

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if collect_gap_values:
                                gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)

                            if windspeeds:
                                # NOTE(review): unfinished storm handling.  this_year_flags is a copy
                                # which has already been written back to flags above, so this
                                # assignment does not reach the returned array, and the values
                                # computed below are not used yet.
                                this_year_flags[gap_cleaned_locations] = 2 # tentative flags

                                slp_average = dgc_get_monthly_averages(this_month_data, OBS_LIMIT, st_var.mdi, MEAN)
                                slp_mad = utils.mean_absolute_deviation(this_month_data, median=True)
                                storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\
                                                   (((this_month_data - slp_average) / slp_mad) > 4.5))

                                if len(storms[0]) >= 2:

                                    storm_1diffs = np.diff(storms)

                                    separations = np.where(storm_1diffs != 1)

                                    #for sep in separations:

                if collect_gap_values:
                    # gap_plot_values is reset every month, so keep a running total for the final report
                    total_gap_values += len(gap_plot_values)

                if plots:
                    flagged_hist, dummy = np.histogram(gap_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in flagged_hist]) # allow for log y-scale
                    plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid')
                    plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes)
                    plt.legend(loc='lower center',ncol=3, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    if diagnostics:
        utils.print_flagged_obs_number("", "Distributional Gap", variable, total_gap_values, noWrite=True)

    return flags # dgc_all_obs
Exemple #22
0
def coc(station,
        variable_list,
        flag_col,
        start,
        end,
        logfile,
        diagnostics=False,
        plots=False,
        idl=False):
    '''
    Climatological check, run on each variable in turn.

    For every calendar month, hourly climatologies are built from the
    winsorized observations, the data are turned into anomalies normalised by
    half the IQR, low-pass filtered, and a Gaussian is fitted to the histogram
    of the result to obtain a threshold.  Gaps beyond that threshold are then
    located and the flags applied to the matching column of station.qc_flags.

    :param object station: station object (variables read with getattr; qc_flags updated)
    :param list variable_list: names of the variables to process
    :param list flag_col: column of station.qc_flags to use for each variable
    :param start: start of the data period, passed to utils.month_starts_in_pairs
    :param end: end of the data period, passed to utils.month_starts_in_pairs
    :param file logfile: log file to write the summary to
    :param bool diagnostics: print diagnostic output
    :param bool plots: show a diagnostic plot for each month
    :param bool idl: follow the IDL calculations for the climatology and medians
    :returns: None
    '''

    def _for_log_axis(counts):
        # empty bins get a small positive value instead of zero
        return np.array([0.01 if c == 0 else c for c in counts])

    for var_idx, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)

        # is this needed 13th Nov 2014 RJHD
        #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        month_ranges = utils.month_starts_in_pairs(start, end).reshape(-1, 12, 2)
        n_years = month_ranges.shape[0]

        for month in range(12):

            # climatology for each hour of the day, missing until calculated
            hourly_climatologies = np.full(24, st_var.mdi, dtype=np.float64)

            # append all e.g. Januaries together
            this_month, year_ids, dummy = utils.concatenate_months(
                month_ranges[:, month, :], st_var.data, hours=True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(
                month_ranges[:, month, :], all_filtered, hours=True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs.
            this_month = np.ma.array(this_month).reshape(-1, 24)
            this_month_filtered = np.ma.array(this_month_filtered).reshape(-1, 24)

            # get hourly climatology for each month
            for hour in range(24):

                hour_values = this_month[:, hour].compressed()

                # need to have data if this is going to work!
                if len(hour_values) == 0:
                    continue

                # winsorize & climatologies - done to match IDL
                if idl:
                    winsorized = utils.winsorize(np.append(hour_values, -999999),
                                                 0.05,
                                                 idl=idl)
                    hourly_climatologies[hour] = np.ma.sum(winsorized) / (
                        len(winsorized) - 1)
                else:
                    winsorized = utils.winsorize(hour_values, 0.05, idl=idl)
                    hourly_climatologies[hour] = np.ma.mean(winsorized)

            # can get stations with few obs in a particular variable.
            if len(this_month.compressed()) == 0:
                continue

            # anomalise each hour over month appropriately
            anomalies = this_month - np.tile(hourly_climatologies,
                                             (this_month.shape[0], 1))
            anomalies_filtered = this_month_filtered - np.tile(
                hourly_climatologies, (this_month_filtered.shape[0], 1))

            valid_anomalies = anomalies.compressed()
            if len(valid_anomalies) >= 10:
                iqr = utils.IQR(valid_anomalies.reshape(-1)) / 2.  # to match IDL
                if iqr < 1.5:
                    iqr = 1.5
            else:
                iqr = st_var.mdi

            normed_anomalies = anomalies / iqr
            normed_anomalies_filtered = anomalies_filtered / iqr

            # get average anomaly for year
            year_ids = np.array(year_ids)
            monthly_vqvs = np.ma.zeros(n_years)
            monthly_vqvs.mask = [False] * n_years
            for year in range(n_years):
                year_locs = np.where(year_ids == year)
                this_year = normed_anomalies_filtered[year_locs, :]

                # need to have data for this to work!
                if len(this_year.compressed()) == 0:
                    monthly_vqvs.mask[year] = True
                elif idl:
                    monthly_vqvs[year] = utils.idl_median(
                        this_year.compressed().reshape(-1))
                else:
                    monthly_vqvs[year] = np.ma.median(this_year)

            # low pass filter
            normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids,
                                                   monthly_vqvs, n_years)

            # copy from distributional_gap.py - refactor!
            # get the threshold value
            bins, bincenters = utils.create_bins(normed_anomalies, 1.)
            hist, binEdges = np.histogram(normed_anomalies, bins=bins)

            gaussian = utils.fit_gaussian(bincenters,
                                          hist,
                                          max(hist),
                                          mu=np.mean(normed_anomalies),
                                          sig=np.std(normed_anomalies))
            minimum_threshold = round(
                1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

            if diagnostics:
                print("%s %s %s" % (iqr, minimum_threshold,
                                    1. + utils.invert_gaussian(
                                        FREQUENCY_THRESHOLD, gaussian)))
                print(gaussian)
                print(hist)

            if plots:
                coc_set_up_plot(bincenters,
                                hist,
                                gaussian,
                                variable,
                                threshold=minimum_threshold,
                                sub_par="observations")

            uppercount = len(np.where(normed_anomalies > minimum_threshold)[0])
            lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0])

            these_flags = station.qc_flags[:, flag_col[var_idx]]
            gap_plot_values, tentative_plot_values = [], []

            # find the gaps and apply the flags - upper tail first, then lower
            for is_upper in (True, False):
                gap_threshold = minimum_threshold if is_upper else -minimum_threshold
                gap_start = dgc.dgc_find_gap(hist,
                                             binEdges,
                                             gap_threshold,
                                             gap_size=1)  # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :],
                                             normed_anomalies,
                                             these_flags,
                                             year_ids,
                                             minimum_threshold,
                                             gap_start,
                                             upper=is_upper,
                                             plots=plots,
                                             gpv=gap_plot_values,
                                             tpv=tentative_plot_values)

            station.qc_flags[:, flag_col[var_idx]] = these_flags

            if uppercount + lowercount > 1000:
                #print "not sorted spurious stations yet"
                pass

            if plots:
                import calendar
                import matplotlib.pyplot as plt

                tentative_hist, dummy = np.histogram(tentative_plot_values,
                                                     bins=bins)
                plt.step(bincenters,
                         _for_log_axis(tentative_hist),
                         c='orange',
                         ls='-',
                         label='tentative',
                         where='mid')

                flagged_hist, dummy = np.histogram(gap_plot_values, bins=bins)
                plt.step(bincenters,
                         _for_log_axis(flagged_hist),
                         'r-',
                         label='flagged',
                         where='mid')

                plt.text(0.1,
                         0.9,
                         calendar.month_name[month + 1],
                         transform=plt.gca().transAxes)
                leg = plt.legend(loc='lower center',
                                 ncol=4,
                                 bbox_to_anchor=(0.5, -0.2),
                                 frameon=False,
                                 prop={'size': 13},
                                 labelspacing=0.15,
                                 columnspacing=0.5)
                plt.setp(leg.get_title(), fontsize=14)
                plt.show()
                #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')

        flag_locs = np.where(station.qc_flags[:, flag_col[var_idx]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # summary goes to screen when plotting/diagnosing, else to the log file
        to_screen = plots or diagnostics

        if to_screen:
            utils.print_flagged_obs_number(logfile,
                                           "Climatological",
                                           variable,
                                           len(flag_locs[0]),
                                           noWrite=True)
            print("where\n")
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable,
                                           len(flag_locs[0]))
            logfile.write("where\n")

        for label, flag_value in (("  Firm Clim", 1), ("  Tentative Clim", 2)):
            nflags = len(
                np.where(station.qc_flags[:, flag_col[var_idx]] == flag_value)[0])
            if to_screen:
                utils.print_flagged_obs_number(logfile,
                                               label,
                                               variable,
                                               nflags,
                                               noWrite=True)
            else:
                utils.print_flagged_obs_number(logfile, label, variable,
                                               nflags)

        # firm flags match 030220
    station = utils.append_history(station, "Climatological Check")

    return
Exemple #23
0
def monthly_clim(obs_var,
                 station,
                 config_file,
                 logfile="",
                 plots=False,
                 diagnostics=False,
                 winsorize=True):
    """
    Flag climatological outliers in a single variable, month by month.

    For each calendar month the normalised anomalies are histogrammed, the
    upper and lower thresholds are read from the configuration file (and
    calculated first if they are not there yet), and observations lying
    beyond a gap in the histogram outside those thresholds are flagged "C".

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: string for configuration file
    :param str logfile: string for log file (not used in this routine)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: passed on to prepare_data
    """

    def _stored_thresholds(for_month):
        # read the (upper, lower) thresholds for this month from the config file
        upper = float(
            utils.read_qc_config(config_file,
                                 "CLIMATOLOGICAL-{}".format(obs_var.name),
                                 "{}-uthresh".format(for_month)))
        lower = float(
            utils.read_qc_config(config_file,
                                 "CLIMATOLOGICAL-{}".format(obs_var.name),
                                 "{}-lthresh".format(for_month)))
        return upper, lower

    flags = np.array([""] * obs_var.data.shape[0])

    for month in range(1, 13):

        month_locs, = np.where(station.months == month)

        # note these are for the whole record, just this month is unmasked
        normalised_anomalies = prepare_data(obs_var,
                                            station,
                                            month,
                                            diagnostics=diagnostics,
                                            winsorize=winsorize)

        # not enough data in this month to do anything with
        if len(normalised_anomalies.compressed()) < utils.DATA_COUNT_THRESHOLD:
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies.compressed(), bins)

        try:
            upper_threshold, lower_threshold = _stored_thresholds(month)
        except KeyError:
            print("Information missing in config file")
            # NOTE(review): winsorize is not forwarded here - confirm whether
            # find_month_thresholds should be given it too
            find_month_thresholds(obs_var,
                                  station,
                                  config_file,
                                  plots=plots,
                                  diagnostics=diagnostics)
            upper_threshold, lower_threshold = _stored_thresholds(month)

        # now to find the gaps
        n_above = len(np.where(normalised_anomalies > upper_threshold)[0])
        n_below = len(np.where(normalised_anomalies < lower_threshold)[0])

        if n_above > 0:
            gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if gap_start != 0:
                # all years for one month; the rest of the record is masked
                high_locs, = np.ma.where(normalised_anomalies > gap_start)
                flags[high_locs] = "C"

        if n_below > 0:
            gap_start = utils.find_gap(hist,
                                       bins,
                                       lower_threshold,
                                       GAP_SIZE,
                                       upwards=False)

            if gap_start != 0:
                # all years for one month; the rest of the record is masked
                low_locs, = np.ma.where(normalised_anomalies < gap_start)
                flags[low_locs] = "C"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            for threshold in (upper_threshold, lower_threshold):
                plt.axvline(threshold, c="r")

            # overlay the histogram of this month's flagged values
            flagged_locs, = np.where(flags[month_locs] == "C")
            flagged_hist, dummy = np.histogram(
                normalised_anomalies[month_locs][flagged_locs], bins)
            plt.step(bins[1:], flagged_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        n_flagged = len(np.where(flags != "")[0])

        print("Climatological {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(n_flagged))

    return  # monthly_clim