コード例 #1
0
ファイル: climatological.py プロジェクト: wk1984/HadISD_v2
def coc(station,
        variable_list,
        flag_col,
        start,
        end,
        logfile,
        diagnostics=False,
        plots=False,
        idl=False):
    """Run the climatological outlier check on each variable of a station.

    For every variable and every calendar month, all years of that month
    are pooled into rows of 24 values.  A winsorized climatology is built
    for each of the 24 columns, anomalies from it are scaled by half the
    IQR (with a floor of 1.5), passed through ``coc_low_pass_filter`` and
    histogrammed.  A Gaussian fit to the histogram gives a threshold, and
    ``coc_find_and_apply_flags`` sets flags beyond the gap found by
    ``dgc.dgc_find_gap`` on the upper and lower side.

    Parameters:
        station: object holding the variables as attributes, plus a
            ``qc_flags`` 2-d array that is updated in place.
        variable_list: names of the station attributes to process.
        flag_col: column of ``station.qc_flags`` for each variable, in the
            same order as ``variable_list``.
        start, end: passed to ``utils.month_starts_in_pairs`` to obtain the
            month boundaries.
        logfile: handed to ``utils.print_flagged_obs_number`` and written
            to directly when neither ``plots`` nor ``diagnostics`` is set.
        diagnostics: print intermediate values and report to screen.
        plots: draw the histograms with matplotlib and report to screen.
        idl: use the code paths written to match the IDL implementation.

    Returns:
        None.  ``station.qc_flags`` and each variable's ``flags`` are
        modified in place.
    """
    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        filtered_data = utils.apply_filter_flags(st_var)

        month_ranges = utils.month_starts_in_pairs(start, end).reshape(
            -1, 12, 2)
        n_years = month_ranges.shape[0]
        column = flag_col[v]

        for month in range(12):

            month_pairs = month_ranges[:, month, :]

            # pool every year of this calendar month (e.g. all Januaries)
            raw, year_ids, _unused = utils.concatenate_months(
                month_pairs, st_var.data, hours=True)
            filt, _unused, _unused = utils.concatenate_months(
                month_pairs, filtered_data, hours=True)

            # one row per 24 values
            raw = np.ma.array(raw).reshape(-1, 24)
            filt = np.ma.array(filt).reshape(-1, 24)

            # climatology per column; stays at the missing-data value
            # wherever a column has no observations at all
            hourly_clim = np.empty(24)
            hourly_clim[:] = st_var.mdi

            for hour in range(24):

                hour_values = raw[:, hour].compressed()
                if len(hour_values) == 0:
                    continue

                # winsorize & climatologies - done to match IDL
                if idl:
                    winsorized = utils.winsorize(
                        np.append(hour_values, -999999), 0.05, idl=idl)
                    hourly_clim[hour] = np.ma.sum(winsorized) / (
                        len(winsorized) - 1)
                else:
                    winsorized = utils.winsorize(hour_values, 0.05, idl=idl)
                    hourly_clim[hour] = np.ma.mean(winsorized)

            # stations can have very few obs in a particular variable
            if len(raw.compressed()) == 0:
                continue

            # anomalise each row against the climatology
            clim_grid = np.tile(hourly_clim, (raw.shape[0], 1))
            anomalies = raw - clim_grid
            clim_grid_filtered = np.tile(hourly_clim, (filt.shape[0], 1))
            anomalies_filtered = filt - clim_grid_filtered

            valid_anomalies = anomalies.compressed()
            if len(valid_anomalies) < 10:
                iqr = st_var.mdi
            else:
                # halved to match IDL
                iqr = utils.IQR(valid_anomalies.reshape(-1)) / 2.
                if iqr < 1.5:
                    iqr = 1.5

            normed_anomalies = anomalies / iqr
            normed_anomalies_filtered = anomalies_filtered / iqr

            # median normalised anomaly of each year, masked where the
            # year has no filtered data
            year_ids = np.array(year_ids)
            monthly_vqvs = np.ma.zeros(n_years)
            monthly_vqvs.mask = [False] * n_years

            for year in range(n_years):
                year_locs = np.where(year_ids == year)
                this_year = normed_anomalies_filtered[year_locs, :]

                if len(this_year.compressed()) == 0:
                    monthly_vqvs.mask[year] = True
                elif idl:
                    monthly_vqvs[year] = utils.idl_median(
                        this_year.compressed().reshape(-1))
                else:
                    monthly_vqvs[year] = np.ma.median(this_year)

            # low pass filter
            normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids,
                                                   monthly_vqvs, n_years)

            # threshold from a Gaussian fitted to the histogram
            # (same approach as distributional_gap.py)
            bins, bincenters = utils.create_bins(normed_anomalies, 1.)
            hist, binEdges = np.histogram(normed_anomalies, bins=bins)

            gaussian = utils.fit_gaussian(bincenters,
                                          hist,
                                          max(hist),
                                          mu=np.mean(normed_anomalies),
                                          sig=np.std(normed_anomalies))
            minimum_threshold = round(
                1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

            if diagnostics:
                unrounded = 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD,
                                                       gaussian)
                print(" ".join(
                    str(item)
                    for item in (iqr, minimum_threshold, unrounded)))
                print(gaussian)
                print(hist)

            if plots:
                coc_set_up_plot(bincenters,
                                hist,
                                gaussian,
                                variable,
                                threshold=minimum_threshold,
                                sub_par="observations")

            # number of values beyond the threshold on either side
            uppercount = len(np.where(normed_anomalies > minimum_threshold)[0])
            lowercount = len(
                np.where(normed_anomalies < -minimum_threshold)[0])

            these_flags = station.qc_flags[:, column]
            gap_plot_values, tentative_plot_values = [], []

            # find the gaps and apply the flags: upper side first, then lower
            # (gap_size is 1 here; in DGC it is 2)
            for is_upper, gap_threshold in ((True, minimum_threshold),
                                            (False, -minimum_threshold)):
                gap_start = dgc.dgc_find_gap(hist,
                                             binEdges,
                                             gap_threshold,
                                             gap_size=1)
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_pairs,
                                             normed_anomalies,
                                             these_flags,
                                             year_ids,
                                             minimum_threshold,
                                             gap_start,
                                             upper=is_upper,
                                             plots=plots,
                                             gpv=gap_plot_values,
                                             tpv=tentative_plot_values)

            station.qc_flags[:, column] = these_flags

            if uppercount + lowercount > 1000:
                # spurious-station handling has not been implemented
                pass

            if plots:
                import matplotlib.pyplot as plt
                hist, binEdges = np.histogram(tentative_plot_values, bins=bins)
                # empty bins are drawn at 0.01 rather than 0
                plot_hist = np.array(
                    [0.01 if count == 0 else count for count in hist])
                plt.step(bincenters,
                         plot_hist,
                         c='orange',
                         ls='-',
                         label='tentative',
                         where='mid')

                hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                plot_hist = np.array(
                    [0.01 if count == 0 else count for count in hist])
                plt.step(bincenters,
                         plot_hist,
                         'r-',
                         label='flagged',
                         where='mid')
                import calendar
                plt.text(0.1,
                         0.9,
                         calendar.month_name[month + 1],
                         transform=plt.gca().transAxes)
                leg = plt.legend(loc='lower center',
                                 ncol=4,
                                 bbox_to_anchor=(0.5, -0.2),
                                 frameon=False,
                                 prop={'size': 13},
                                 labelspacing=0.15,
                                 columnspacing=0.5)
                plt.setp(leg.get_title(), fontsize=14)
                plt.show()

        # copy every non-zero flag of this column into the variable itself
        flag_locs = np.where(station.qc_flags[:, column] != 0)
        st_var.flags[flag_locs] = 1

        # report to screen when plotting/diagnosing, otherwise to the logfile
        to_screen = plots or diagnostics
        report_kwargs = {"noWrite": True} if to_screen else {}

        utils.print_flagged_obs_number(logfile, "Climatological", variable,
                                       len(flag_locs[0]), **report_kwargs)
        if to_screen:
            print("where\n")
        else:
            logfile.write("where\n")

        for label, flag_value in (("  Firm Clim", 1), ("  Tentative Clim", 2)):
            nflags = len(np.where(station.qc_flags[:, column] == flag_value)[0])
            utils.print_flagged_obs_number(logfile, label, variable, nflags,
                                           **report_kwargs)

    utils.append_history(station, "Climatological Check")

    return
コード例 #2
0
def dgc_all_obs(station,
                variable,
                flags,
                start,
                end,
                plots=False,
                diagnostics=False,
                idl=False,
                windspeeds=False,
                GH=False):
    """Distributional gap check working on all observations (RJHD addition).

    For each calendar month, all years of that month are pooled, centred on
    the median of the filtered data and scaled by its IQR.  A Gaussian (or,
    with ``GH``, a Gauss-Hermite series) is fitted to the histogram of the
    scaled values to obtain upper and lower thresholds.  ``dgc_find_gap``
    then locates the first gap beyond each threshold, and every filtered
    observation beyond that gap is flagged with 1.

    Parameters:
        station: object holding the variables as attributes.
        variable: name of the station attribute to check.
        flags: array of flags addressed with the same index ranges as the
            data; updated in place and also returned.
        start, end: passed to ``utils.month_starts_in_pairs`` to obtain the
            month boundaries.
        plots: draw the distributions with matplotlib.
        diagnostics: print intermediate values and the final flag count.
        idl: use ``utils.idl_median`` instead of ``np.ma.median``.
        windspeeds: also pool ``station.windspeeds`` for each month; the
            storm handling that uses it is unfinished (see note below).
        GH: fit Gauss-Hermite polynomials instead of a plain Gaussian.

    Returns:
        ``flags`` after updating.
    """
    if plots:
        import calendar
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    all_filtered = utils.apply_filter_flags(st_var)

    # Running total of values beyond the gaps over all months.  Defined up
    # front so the final diagnostic report works even when no month has
    # enough data to be processed.
    n_gap_values = 0

    for month in range(12):

        if windspeeds:
            st_var_wind = getattr(station, "windspeeds")

            # pool the wind speeds of this calendar month over all years
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):

                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)

        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(
                    this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(),
                                percentile=0.05)

                print("Spurious_stations file not yet sorted")

            if iqr != 0.0:
                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)

                bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
                # finer bin centres, used to evaluate the fitted curve
                dummy, plot_bincenters = utils.create_bins(
                    monthly_values, BIN_SIZE / 10.)

                hist, binEdges = np.histogram(monthly_values, bins=bins)

                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis
                    # to Gaussian fit - January 2015 ^RJHD

                    initial_values = [
                        np.max(hist),
                        np.mean(monthly_values),
                        np.std(monthly_values),
                        stats.skew(monthly_values),
                        stats.kurtosis(monthly_values)
                    ]  # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist,
                                   np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits -
                    # artefacts of GH fitting?  Once the curve drops below
                    # a tenth of the threshold, pin it there out to the edge.
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    good_values = np.argwhere(
                        plot_gaussian > FREQUENCY_THRESHOLD)

                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(
                        plot_bincenters[good_values[-1]])

                else:
                    gaussian = utils.fit_gaussian(bincenters,
                                                  hist,
                                                  max(hist),
                                                  mu=np.mean(monthly_values),
                                                  sig=np.std(monthly_values))

                    # assume the same threshold value on both sides
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                if diagnostics:
                    # single-argument print calls and an explicit join give
                    # the same output as the old print statements and also
                    # parse under Python 3
                    if GH:
                        print(hist)
                        print(res)
                        print(" ".join(
                            str(item)
                            for item in (iqr, l_minimum_threshold,
                                         u_minimum_threshold)))

                    else:
                        print(hist)
                        print(gaussian)
                        print(" ".join(
                            str(item)
                            for item in (iqr, u_minimum_threshold,
                                         1. + utils.invert_gaussian(
                                             FREQUENCY_THRESHOLD, gaussian))))

                if plots:
                    dgc_set_up_plot(plot_gaussian,
                                    monthly_values,
                                    variable,
                                    threshold=(u_minimum_threshold,
                                               l_minimum_threshold),
                                    sub_par="observations",
                                    GH=GH)

                    if GH:
                        plt.figtext(
                            0.15,
                            0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],
                             res['kurtosis']),
                            color='k',
                            size='small')

                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics:
                    gap_plot_values = np.array([])

                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             u_minimum_threshold)

                    if gap_start != 0:

                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(
                                ((this_year_data - monthly_median) /
                                 iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             l_minimum_threshold)

                    if gap_start != 0:

                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            # the mask is tested explicitly on this side
                            gap_cleaned_locations = np.where(
                                np.logical_and(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start, this_year_data.mask != True))

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                            if windspeeds:
                                # NOTE(review): unfinished storm handling.
                                # this_year_flags is a copy that has already
                                # been written back above, so this tentative
                                # flag never reaches ``flags``, and the storm
                                # separations below are computed but unused.
                                this_year_flags[
                                    gap_cleaned_locations] = 2  # tentative flags

                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                    MEAN)
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)
                                storms = np.where(
                                    (((windspeeds_month -
                                       windspeeds_month_average) /
                                      windspeeds_month_mad) > 4.5) &
                                    (((this_month_data - slp_average) /
                                      slp_mad) > 4.5))

                                if len(storms[0]) >= 2:

                                    storm_1diffs = np.diff(storms)

                                    separations = np.where(storm_1diffs != 1)

                                    #for sep in separations:

                if plots or diagnostics:
                    # add this month's count to the total before the next
                    # month resets gap_plot_values
                    n_gap_values += len(gap_plot_values)

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    # empty bins are drawn at 0.01 rather than 0
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             'r-',
                             label='flagged',
                             where='mid')
                    plt.text(0.1,
                             0.9,
                             calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    plt.legend(loc='lower center',
                               ncol=3,
                               bbox_to_anchor=(0.5, -0.2),
                               frameon=False,
                               prop={'size': 13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    if diagnostics:
        # report the total over all twelve months
        utils.print_flagged_obs_number("",
                                       "Distributional Gap",
                                       variable,
                                       n_gap_values,
                                       noWrite=True)

    return flags  # dgc_all_obs
コード例 #3
0
ファイル: variance.py プロジェクト: wk1984/HadISD_v2
def evc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    
    if plots or diagnostics:
        import matplotlib.pyplot as plt
        import calendar

    
    # very similar to climatological check - ensure that not duplicating
    
    for v, variable in enumerate(variable_list):
    
        st_var = getattr(station, variable)
    
        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        reporting_freq = utils.reporting_frequency(utils.apply_filter_flags(st_var))
   
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1,12,2)

        month_data_count = np.zeros(month_ranges.shape[0:2])

        # for each month
        for month in range(12):

            # set up hourly climatologies
            hourly_clims = np.zeros(24)
            hourly_clims.fill(st_var.data.fill_value)

            this_month, year_ids, month_data_count[:,month] = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True)

            
            # # extract each year and append together
            # year_ids = [] # counter to determine which year each day corresponds to
            # for year in range(month_ranges.shape[0]):
                
            #     this_year = st_var.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
            #     if year == 0:
            #         # store so can access each hour of day separately
            #         this_month = this_year.reshape(-1,24)
                    
            #         year_ids = [year for x in range(this_month.shape[0])]
                    
            #         month_data_count[year,month] = len(this_year.compressed())
                    
            #     else:
            #         this_year = this_year.reshape(-1,24)
                       
            #         this_month = np.ma.concatenate((this_month, this_year), axis = 0)
                    
            #         year_ids.extend([year for x in range(this_year.shape[0])])
                    
            #         month_data_count[year,month] = len(this_year.compressed())

                
                  
            # winsorize and get hourly climatology 
            for h in range(24):
                
                this_hour = this_month[:,h]
                
                if len(this_hour.compressed()) > 100:

                    
                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour_winsorized = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl)
                        hourly_clims[h] = np.ma.sum(this_hour_winsorized)/(len(this_hour_winsorized) - 1)
                        
                    else:
                        this_hour_winsorized = utils.winsorize(this_hour.compressed(), 0.05, idl = idl)
                        hourly_clims[h] = np.ma.mean(this_hour_winsorized)
                    
            
            hourly_clims = np.ma.masked_where(hourly_clims == st_var.data.fill_value, hourly_clims)           
            anomalies = this_month - np.tile(hourly_clims, (this_month.shape[0], 1))
            
            # extract IQR of anomalies (using 1/2 value to match IDL)
            if len(anomalies.compressed()) >= 10:
                
                iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2. # to match IDL
                if iqr < 1.5: iqr = 1.5

            else:
                iqr = st_var.mdi
            
            normed_anomalies = anomalies / iqr
            

            variances = np.ma.zeros(month_ranges.shape[0])
            variances.mask = [False for i in range(month_ranges.shape[0])]
            rep_accuracies = np.zeros(month_ranges.shape[0])
            rep_freqs = np.zeros(month_ranges.shape[0])
            
            variances.fill(st_var.mdi)
            rep_accuracies.fill(st_var.mdi)
            rep_freqs.fill(st_var.mdi)
                
            year_ids = np.array(year_ids)
            
            # extract variance of normalised anomalies for each year
            for y, year in enumerate(range(month_ranges.shape[0])):
            
                year_locs = np.where(year_ids == y)
            
                this_year = normed_anomalies[year_locs,:]
                this_year = this_year.reshape(-1)
                
            # end of similarity with Climatological check
            
                if len(this_year.compressed()) >= 30:
            
                    variances[y] = utils.mean_absolute_deviation(this_year, median = True)
                    
                    rep_accuracies[y] = utils.reporting_accuracy(this_year)
                    rep_freqs[y] = utils.reporting_frequency(this_year)

                else:
                    variances.mask[y] = True

            good = np.where(month_data_count[:,month] >= 100)
            
            # get median and IQR of variance for all years for this month
            if len(good[0]) >= 10:
                
                median_variance = np.median(variances[good])
                
                iqr_variance = utils.IQR(variances[good]) / 2. # to match IDL
                
                if iqr_variance < 0.01: iqr_variance = 0.01
            else:
                
                median_variance = st_var.mdi
                iqr_variance = st_var.mdi

                
            # if SLP, then get median and MAD of SLP and windspeed for month
            if variable in ["slp", "windspeeds"]:
                
                winds = getattr(station, "windspeeds")
                slp = getattr(station, "slp")
        
                # refactor this as similar in style to how target data extracted  
                for y, year in enumerate(range(month_ranges.shape[0])):
                    
                    if y == 0:
                        winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        winds_month = winds_year.reshape(-1,24)
                                            
                        slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        slp_month = slp_year.reshape(-1,24)
                                            
                    else:
                        winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]                  
                        winds_year = winds_year.reshape(-1,24)
                        winds_month = np.ma.concatenate((winds_month, winds_year), axis = 0)
                        
                        slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]                  
                        slp_year =  slp_year.reshape(-1,24)
                        slp_month = np.ma.concatenate((slp_month, slp_year), axis = 0)
                        
                median_wind = np.ma.median(winds_month)
                median_slp  = np.ma.median(slp_month)
                
                wind_MAD = utils.mean_absolute_deviation(winds_month.compressed())
                slp_MAD = utils.mean_absolute_deviation(slp_month.compressed())
                
                if diagnostics:
                    print "median windspeed {} m/s, MAD = {}".format(median_wind, wind_MAD)
                    print "median slp {} hPa, MAD = {}".format(median_slp, slp_MAD)

            # now test to see if variance exceeds expected range
            for y, year in enumerate(range(month_ranges.shape[0])):


                if (variances[y] != st_var.mdi) and (iqr_variance != st_var.mdi) and \
                    (median_variance != st_var.mdi) and (month_data_count[y,month] >= DATA_COUNT_THRESHOLD):
                    
                    # if SLP, then need to test if deep low pressure ("hurricane/storm") present
                    #   as this will increase the variance for this month + year
                    if variable in ["slp", "windspeeds"]:
                        
                        iqr_threshold = 6.
                        
                        # increase threshold if reporting frequency and resolution of this
                        #   year doesn't match average
                        if (rep_accuracies[y] != reporting_resolution) and \
                            (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 8.
                       
                        if diagnostics:
                            print np.abs(variances[y] - median_variance) / iqr_variance, variances[y] , median_variance , iqr_variance , iqr_threshold, month+1, year+start.year
                        
                        if np.abs((variances[y] - median_variance) / iqr_variance) > iqr_threshold:
                        
                            # check for storms     
                            winds_month = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]                  
                            slp_month = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]                  
                   
                            storm = False
                            if (len(winds_month.compressed()) >= 1) and (len(slp_month.compressed()) >= 1):
                                # find max wind & min SLP
                                # max_wind_loc = np.where(winds_month == np.max(winds_month))[0][0]
                                # min_slp_loc = np.where(slp_month == np.min(slp_month))[0][0]

                                # if these are above thresholds and within one day of each other,
                                #    then it likely was a storm
                                # print "fix this in case of multiple max/min locations"
                                # if (np.abs(max_wind_loc - min_slp_loc) <= 24) and \ 
                                #     (((np.max(winds_month) - median_wind) / wind_MAD) > MAD_THRESHOLD) and \
                                #     (((median_slp - np.min(slp_month)) / slp_MAD) > MAD_THRESHOLD): 

                                # locations where winds greater than threshold
                                high_winds, = np.where((winds_month - median_wind)/wind_MAD > MAD_THRESHOLD)
                                # and where SLP less than threshold
                                low_slps, = np.where((median_slp - slp_month)/slp_MAD > MAD_THRESHOLD)

                                # if any locations match, then it's a storm
                                match_loc = high_winds[np.in1d(high_winds, low_slps)]
                                    
                                if len(match_loc) > 0:
                                    storm = True
                            else:
                                print "write spurious"
                                
                            # check the SLP first difference series
                            #   to ensure a drop down and climb out of minimum SLP/or climb up and down from maximum wind speed
                            if variable == "slp":
                                diffs = np.diff(slp_month.compressed())
                            elif variable == "windspeeds":
                                diffs = np.diff(winds_month.compressed())
                            
                            negs, poss = 0,0
                            biggest_neg, biggest_pos = 0,0
                            
                            for diff in diffs:
                                
                                if diff > 0:
                                    if negs > biggest_neg: biggest_neg = negs
                                    negs = 0
                                    poss += 1
                                else:
                                    if poss > biggest_pos: biggest_pos = poss
                                    poss = 0
                                    negs += 1
                                
                            if (biggest_neg < 10) and (biggest_pos < 10) and not storm:
                                
                                # not a hurricane, so mask
                                station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1
                                if plots or diagnostics:
                                    print "No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year)
                                else:
                                    logfile.write("No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year))
                                
                            else:
                                # hurricane
                                if plots or diagnostics:
                                    print "Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year)
                                else:
                                    logfile.write("Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year))
                        
                            if plots:
                                # plot showing the pressure, pressure first differences and the wind speeds
                                plot_times = utils.times_hours_to_datetime(station.time.data[month_ranges[year,month][0]:month_ranges[year,month][1]], start)

                                evc_plot_slp_wind(plot_times, slp_month, diffs, median_slp, slp_MAD, winds_month, median_wind, wind_MAD)

                    else:
                        
                        iqr_threshold = 8.
                        
                        if (rep_accuracies[y] != reporting_resolution) and \
                            (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 10.
                            

                        if np.abs(variances[y] - median_variance) / iqr_variance > iqr_threshold:
                                
                            if diagnostics:
                                print "flagging {} {}".format(year+start.year,calendar.month_name[month+1])
                            # remove the data 
                            station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1


            if plots:
                plot_variances = (variances - median_variance) / iqr_variance

                plot_variances = np.ma.masked_where(month_data_count[:,month] < DATA_COUNT_THRESHOLD,plot_variances)
                
                evc_plot_hist(plot_variances, iqr_threshold, "Variance Check - %s - %s" % (variable, calendar.month_name[month+1]))
 
        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]), noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]))
            
        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    # matches 030660 for T, D and SLP 21/8/2014

    station = utils.append_history(station, "Excess Variance Check")

    return # evc
コード例 #4
0
ファイル: climatological.py プロジェクト: rjhd2/HadISD_v2
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    '''
    Climatological outlier check on the hourly anomalies of each calendar month

    For each variable, every calendar month is pooled over all years and
    anomalised against a winsorized hourly climatology.  The anomalies are
    scaled by half their IQR (at least 1.5), low-pass filtered and binned;
    a Gaussian fitted to the histogram gives the threshold handed to the
    gap search and to coc_find_and_apply_flags, which returns the flags.

    :param station: station object - its qc_flags array and the flags
        attribute of each processed variable are modified in place
    :param variable_list: names of the station attributes to process
    :param flag_col: column of station.qc_flags to use for each variable,
        indexed in step with variable_list
    :param start: start of dataset, passed to utils.month_starts_in_pairs
    :param end: end of dataset, passed to utils.month_starts_in_pairs
    :param logfile: log file handed to utils.print_flagged_obs_number and
        written to directly when neither plots nor diagnostics is set
    :param bool diagnostics: print extra output to screen
    :param bool plots: show a histogram plot for each month
    :param bool idl: follow the IDL-matching winsorizing and median route

    :returns: None
    '''

    for var_index, variable in enumerate(variable_list):

        met_var = getattr(station, variable)
        filtered_obs = utils.apply_filter_flags(met_var)

        # NOTE: a per-variable reporting resolution used to be derived here
        #   (queried "is this needed", 13th Nov 2014 RJHD); it is not used.

        # reshaped so that the axes are (year, month, pair)
        month_bounds = utils.month_starts_in_pairs(start, end).reshape(-1, 12, 2)
        n_years = month_bounds.shape[0]

        for month in range(12):

            # hourly climatology, initialised to the missing data indicator
            hourly_climatologies = np.empty(24)
            hourly_climatologies.fill(met_var.mdi)

            # pool all e.g. Januaries together, raw and flag-filtered
            this_month, year_ids, dummy = utils.concatenate_months(month_bounds[:, month, :], met_var.data, hours = True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(month_bounds[:, month, :], filtered_obs, hours = True)

            # if fixed climatology period, sort this here

            # arrange as rows of 24 hours
            this_month = np.ma.array(this_month).reshape(-1, 24)
            this_month_filtered = np.ma.array(this_month_filtered).reshape(-1, 24)

            # winsorized mean for each hour of the day
            for hour in range(24):

                hour_values = this_month[:, hour].compressed()

                # an hour without data keeps the missing data indicator
                if len(hour_values) == 0:
                    continue

                if idl:
                    # done to match IDL: one extra value is appended and the
                    #   sum is divided by one fewer than the resulting length
                    winsorized = utils.winsorize(np.append(hour_values, -999999), 0.05, idl = idl)
                    hourly_climatologies[hour] = np.ma.sum(winsorized)/(len(winsorized) - 1)
                else:
                    winsorized = utils.winsorize(hour_values, 0.05, idl = idl)
                    hourly_climatologies[hour] = np.ma.mean(winsorized)

            # can get stations with few obs in a particular variable
            if len(this_month.compressed()) == 0:
                continue

            # anomalise each hour over the month
            anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0], 1))
            anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0], 1))

            # scale factor: half the IQR with a floor of 1.5, or the
            #   missing data indicator when fewer than 10 anomalies exist
            valid_anomalies = anomalies.compressed()
            if len(valid_anomalies) >= 10:
                iqr = utils.IQR(valid_anomalies.reshape(-1))/2.  # to match IDL
                if iqr < 1.5:
                    iqr = 1.5
            else:
                iqr = met_var.mdi

            normed_anomalies = anomalies / iqr
            normed_anomalies_filtered = anomalies_filtered / iqr

            # median filtered anomaly of this month in each year;
            #   years without data are masked
            year_ids = np.array(year_ids)
            monthly_vqvs = np.ma.zeros(n_years)
            monthly_vqvs.mask = [False] * n_years
            for year in range(n_years):
                this_year = normed_anomalies_filtered[np.where(year_ids == year), :]
                year_values = this_year.compressed()

                if len(year_values) == 0:
                    monthly_vqvs.mask[year] = True
                elif idl:
                    monthly_vqvs[year] = utils.idl_median(year_values.reshape(-1))
                else:
                    monthly_vqvs[year] = np.ma.median(this_year)

            # low pass filter
            normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, n_years)

            # copy from distributional_gap.py - refactor!
            # get the threshold value from a Gaussian fitted to the histogram
            bins, bincenters = utils.create_bins(normed_anomalies, 1.)
            hist, binEdges = np.histogram(normed_anomalies, bins = bins)

            gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(normed_anomalies), sig = np.std(normed_anomalies))
            raw_threshold = 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)
            minimum_threshold = round(raw_threshold)

            if diagnostics:
                print("%s %s %s" % (iqr, minimum_threshold, raw_threshold))
                print(gaussian)
                print(hist)

            if plots:
                coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = minimum_threshold, sub_par = "observations")

            uppercount = len(np.where(normed_anomalies > minimum_threshold)[0])
            lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0])

            these_flags = station.qc_flags[:, flag_col[var_index]]
            gap_plot_values, tentative_plot_values = [], []

            # find the gaps and apply the flags - upper tail, then lower tail
            for upper, signed_threshold in ((True, minimum_threshold), (False, -minimum_threshold)):

                gap_start = dgc.dgc_find_gap(hist, binEdges, signed_threshold, gap_size = 1) # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_bounds[:, month, :], normed_anomalies, these_flags, year_ids,
                                             minimum_threshold, gap_start, upper = upper, plots = plots,
                                             gpv = gap_plot_values, tpv = tentative_plot_values)

            station.qc_flags[:, flag_col[var_index]] = these_flags

            if uppercount + lowercount > 1000:
                # placeholder - spurious stations are not yet handled
                pass

            if plots:
                import calendar
                import matplotlib.pyplot as plt

                # empty bins are drawn at 0.01 instead of 0
                tentative_hist, dummy = np.histogram(tentative_plot_values, bins = bins)
                tentative_hist = np.array([0.01 if count == 0 else count for count in tentative_hist])
                plt.step(bincenters, tentative_hist, c='orange', ls='-', label = 'tentative', where='mid')

                flagged_hist, dummy = np.histogram(gap_plot_values, bins = bins)
                flagged_hist = np.array([0.01 if count == 0 else count for count in flagged_hist])
                plt.step(bincenters, flagged_hist, 'r-', label = 'flagged', where='mid')

                plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes)
                leg=plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13},labelspacing=0.15,columnspacing=0.5)
                plt.setp(leg.get_title(), fontsize=14)
                plt.show()
                #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')

        flag_locs = np.where(station.qc_flags[:, flag_col[var_index]] != 0)

        # copy flags into attribute
        met_var.flags[flag_locs] = 1

        # split the total into firm (1) and tentative (2) flags
        n_firm = len(np.where(station.qc_flags[:, flag_col[var_index]] == 1)[0])
        n_tentative = len(np.where(station.qc_flags[:, flag_col[var_index]] == 2)[0])

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite = True)
            print("where\n")
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, n_firm, noWrite = True)
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, n_tentative, noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]))
            logfile.write("where\n")
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, n_firm)
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, n_tentative)

        # firm flags match 030220
    station = utils.append_history(station, "Climatological Check")

    return
コード例 #5
0
def detect(station,
           neighbour,
           variable,
           flags,
           neighbour_count,
           start,
           end,
           distance=0,
           diagnostics=False,
           plots=False):
    '''
    Detect which observations are outliers

    The target-minus-neighbour differences at matching observation times
    are compared with five times the IQR of the differences for the same
    calendar month (the IQR is never taken as less than 2).  Each
    exceedance increments ``flags`` at that time.  For SLP, when
    ``distance`` exceeds 100 and fewer than a third of the exceedances are
    positive, only the positive exceedances are counted.  Nothing is done
    when fewer than 100 observation times match.

    :param MetVar station: station object (target)
    :param MetVar neighbour: station object (neighbour)
    :param string variable: which variable to process
    :param array flags: array to store how many neighbours think each obs is bad
    :param array neighbour_count: how many neighbours present at each obs
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param int distance: separation of target and neighbour
    :param bool diagnostics: extra output (not referenced in this function)
    :param bool plots: make figures

    :returns: None - flags and neighbour_count are updated in place
    '''

    # QC flag columns summed to build the filter for each variable
    FILTERING_FLAG_COL = {
        "temperatures": [0, 1, 4, 5, 8, 12, 16, 20, 27, 41, 44, 58],
        "dewpoints":
        [0, 2, 4, 6, 8, 9, 13, 17, 21, 28, 30, 31, 32, 42, 45, 59],
        "slp": [0, 3, 4, 7, 11, 15, 19, 23, 29, 43, 46, 60],
        "windspeeds": [0, 4, 10, 14, 18, 22, 56, 62, 63, 64]
    }

    st_var = getattr(station, variable)
    neigh_var = getattr(neighbour, variable)

    # filter by flags - not all (no Climatological [24,25], or Odd cluster [54,55,56,57]), T record check not in D,
    # NOTE(review): both series are masked using the target station's flags,
    #   and everything below reads the underlying .data, so this mask appears
    #   to have no influence on the result - confirm this is intended
    total_flags = np.sum(station.qc_flags[:, FILTERING_FLAG_COL[variable]],
                         axis=1)
    st_filtered = np.ma.masked_where(total_flags == 1, st_var.data)
    neigh_filtered = np.ma.masked_where(total_flags == 1, neigh_var.data)

    # match the observation times - both must hold a non-missing value
    match = np.where(
        np.logical_and((st_filtered.data != st_var.mdi),
                       (neigh_filtered.data != neigh_var.mdi)))

    month_ranges = utils.month_starts_in_pairs(start, end).reshape(
        -1, 12, 2)  # in year-long sets of pairs.

    # too few common observations to characterise the differences
    if len(match[0]) < 100:
        return  # detect

    neighbour_count[match] += 1  # number of neighbours with data present

    # differences are masked everywhere except at the matched times
    differences = np.ma.zeros(len(st_filtered))
    differences.fill(st_var.mdi)
    differences.mask = True

    differences[match] = st_filtered.data[match] - neigh_filtered.data[match]
    differences.mask[match] = False

    # get monthly IQR values, spread back out to one value per observation
    all_iqrs = np.zeros(len(differences))
    for month in range(12):

        this_month, dummy1, dummy2 = utils.concatenate_months(
            month_ranges[:, month, :], differences, hours=False)

        month_values = this_month.compressed()

        # use 2 when there are too few values or the IQR is small
        if len(month_values) > 4:
            month_iqr = utils.IQR(month_values)
            if month_iqr <= 2.:
                month_iqr = 2.
        else:
            month_iqr = 2.

        # and copy back into the array
        for year in month_ranges[:, month, :]:
            all_iqrs[year[0]:year[1]] = month_iqr

    if plots:
        plot_target_neigh_diffs_dist(differences, min(all_iqrs))

    # per-observation threshold, built from the IQR of the matching month
    threshold = 5. * all_iqrs
    dubious = np.ma.where(np.ma.abs(differences) > threshold)

    if len(dubious[0]) == 0:
        return  # detect

    if variable == "slp":
        # check if they are storms
        # the same per-observation threshold as for 'dubious' is used, so
        #   that positive and negative together make up the dubious set
        #   (previously the IQR left over from the last month was used)
        positive = np.ma.where(differences > threshold)
        negative = np.ma.where(differences < -threshold)

        # if majority negative (2/3) and separation > 100
        if (distance > 100.) and (
                float(len(positive[0])) / len(dubious[0]) < 0.333):

            if len(positive[0]) > 0:
                flags[positive] += 1
            if len(negative[0]) > 0:
                # NOTE(review): this lowers the count at every matched
                #   time, not only at the negative excursions - confirm
                neighbour_count[match] -= 1

        else:
            flags[dubious] += 1
    else:
        flags[dubious] += 1

    return  # detect
コード例 #6
0
ファイル: distributional_gap.py プロジェクト: rjhd2/HadISD_v2
def dgc_all_obs(station, variable, flags, start, end, plots = False, diagnostics = False, idl = False, windspeeds = False, GH = False):
    '''
    Distributional gap check working on all observations (RJHD addition)

    For each calendar month the observations from all years are pooled and
    normalised with the median and IQR of the flag-filtered data.  A
    Gaussian (or a Gauss-Hermite series if GH is set) is fitted to the
    histogram to give upper and lower thresholds.  Where observations lie
    beyond a threshold, dgc_find_gap supplies a gap start value and the
    filtered observations beyond it are flagged with 1.

    Months with no more than OBS_LIMIT filtered observations, or whose IQR
    is still zero after the wider-percentile retry, are skipped.

    :param station: station object holding the variable (and "windspeeds"
        when the windspeeds option is set)
    :param variable: name of the station attribute to process
    :param flags: flag array for this variable, sliced with the month
        ranges - updated in place and returned
    :param start: start of dataset, passed to utils.month_starts_in_pairs
    :param end: end of dataset, passed to utils.month_starts_in_pairs
    :param bool plots: show a histogram plot for each processed month
    :param bool diagnostics: print extra output, including a final count
        of the values found beyond a gap summed over all months
    :param bool idl: use utils.idl_median instead of np.ma.median
    :param bool windspeeds: also run the wind speed / storm calculations
        (unfinished - see the note in the lower-tail section)
    :param bool GH: fit Gauss-Hermite polynomials instead of a Gaussian

    :returns: flags - the updated flag array
    '''

    if plots:
        import calendar
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1,12,2)

    all_filtered = utils.apply_filter_flags(st_var)

    # the flagged values are only gathered when they will be shown or counted
    collect_gap_values = plots or diagnostics

    # total over all months for the diagnostics summary.  Kept separately
    #   so that the summary is defined even if no month is processed and
    #   covers every month rather than only the last one processed.
    n_gap_values = 0

    for month in range(12):

        if windspeeds:
            st_var_wind = getattr(station, "windspeeds")

            # pool the wind speeds of this calendar month over all years
            wind_slices = [st_var_wind.data[year[0]:year[1]] for year in month_ranges[:,month,:]]
            if wind_slices:
                windspeeds_month = np.ma.concatenate(wind_slices)
            else:
                windspeeds_month = np.ma.array([])

            # get monthly averages
            windspeeds_month_average = dgc_get_monthly_averages(windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(windspeeds_month, median=True)

        this_month_data, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], all_filtered, hours = False)

        # need enough filtered observations to characterise the distribution
        if len(this_month_filtered.compressed()) <= OBS_LIMIT:
            continue

        if idl:
            monthly_median = utils.idl_median(this_month_filtered.compressed().reshape(-1))
        else:
            monthly_median = np.ma.median(this_month_filtered)

        iqr = utils.IQR(this_month_filtered.compressed())

        if iqr == 0.0:
            # to get some spread if IQR too small
            iqr = utils.IQR(this_month_filtered.compressed(), percentile = 0.05)

            print("Spurious_stations file not yet sorted")

        # cannot normalise without any spread
        if iqr == 0.0:
            continue

        monthly_values = np.ma.array((this_month_data.compressed() - monthly_median) / iqr)

        bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
        # finer bins (a tenth of the width) for the fitted curve
        dummy, plot_bincenters = utils.create_bins(monthly_values, BIN_SIZE/10.)

        hist, binEdges = np.histogram(monthly_values, bins = bins)

        if GH:
            # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

            initial_values = [np.max(hist), np.mean(monthly_values), np.std(monthly_values), stats.skew(monthly_values), stats.kurtosis(monthly_values)] # norm, mean, std, skew, kurtosis

            fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))])
            res = utils.hermite2gauss(fit[0], diagnostics = diagnostics)

            plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

            # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
            # once the curve has dropped below a tenth of the frequency
            #   threshold it is pinned there, moving out from the peak
            mid_point = np.argmax(plot_gaussian)
            bad, = np.where(plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD/10.)
            if len(bad) > 0: plot_gaussian[mid_point:][bad[0]:] = FREQUENCY_THRESHOLD/10.

            bad, = np.where(plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD/10.)
            if len(bad) > 0: plot_gaussian[:mid_point][:bad[-1]] = FREQUENCY_THRESHOLD/10.

            # extract threshold values - one bin beyond the outermost
            #   points where the curve exceeds the frequency threshold
            good_values = np.argwhere(plot_gaussian > FREQUENCY_THRESHOLD)

            l_minimum_threshold = round(plot_bincenters[good_values[0]]) - 1
            u_minimum_threshold = 1 + round(plot_bincenters[good_values[-1]])

        else:
            gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(monthly_values), sig = np.std(monthly_values))

            # assume the same threshold value
            u_minimum_threshold = 1 + round(utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
            l_minimum_threshold = -u_minimum_threshold

            plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

        if diagnostics:
            print(hist)
            if GH:
                print(res)
                print("%s %s %s" % (iqr, l_minimum_threshold, u_minimum_threshold))
            else:
                print(gaussian)
                print("%s %s %s" % (iqr, u_minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)))

        if plots:
            dgc_set_up_plot(plot_gaussian, monthly_values, variable, threshold = (u_minimum_threshold, l_minimum_threshold), sub_par = "observations", GH = GH)

            if GH:
                plt.figtext(0.15, 0.67, 'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %(res['mean'], res['dispersion'], res['skewness'], res['kurtosis']), color='k', size='small')

        uppercount = len(np.where(monthly_values > u_minimum_threshold)[0])
        lowercount = len(np.where(monthly_values < l_minimum_threshold)[0])

        # this needs refactoring - but lots of variables to pass in
        gap_plot_values = np.array([])

        if uppercount > 0:
            gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold)

            if gap_start != 0:

                for year in month_ranges[:,month,:]:

                    this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                    this_year_flags = np.array(flags[year[0]:year[1]])
                    gap_cleaned_locations = np.where(((this_year_data - monthly_median) / iqr) > gap_start)

                    this_year_flags[gap_cleaned_locations] = 1
                    flags[year[0]:year[1]] = this_year_flags

                    if collect_gap_values:
                        gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)

        if lowercount > 0:
            gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold)

            if gap_start != 0:

                for year in month_ranges[:,month,:]:

                    this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                    this_year_flags = np.array(flags[year[0]:year[1]])
                    # masked entries are excluded explicitly on the low side
                    gap_cleaned_locations = np.where(np.logical_and(((this_year_data - monthly_median) / iqr) < gap_start, this_year_data.mask != True))

                    this_year_flags[gap_cleaned_locations] = 1
                    flags[year[0]:year[1]] = this_year_flags

                    if collect_gap_values:
                        gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)

                    if windspeeds:
                        # NOTE(review): unfinished storm handling.
                        #   this_year_flags is a copy that has already been
                        #   written back to flags above, so neither the
                        #   tentative value set here nor the storm search
                        #   below changes the returned flags.
                        this_year_flags[gap_cleaned_locations] = 2 # tentative flags

                        slp_average = dgc_get_monthly_averages(this_month_data, OBS_LIMIT, st_var.mdi, MEAN)
                        slp_mad = utils.mean_absolute_deviation(this_month_data, median=True)
                        storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\
                                           (((this_month_data - slp_average) / slp_mad) > 4.5))

                        if len(storms[0]) >= 2:

                            storm_1diffs = np.diff(storms)

                            separations = np.where(storm_1diffs != 1)

                            #for sep in separations:

        n_gap_values += len(gap_plot_values)

        if plots:
            hist, binEdges = np.histogram(gap_plot_values, bins = bins)
            # empty bins are drawn at 0.01 instead of 0
            plot_hist = np.array([0.01 if h == 0 else h for h in hist])
            plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid')
            plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes)
            plt.legend(loc='lower center',ncol=3, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    if diagnostics:
        utils.print_flagged_obs_number("", "Distributional Gap", variable, n_gap_values, noWrite=True)

    return flags # dgc_all_obs
コード例 #7
0
def dgc_all_obs(station,
                variable,
                flags,
                start,
                end,
                logfile,
                plots=False,
                diagnostics=False,
                idl=False,
                windspeeds=False,
                GH=False,
                doMonth=False):
    '''
    Distributional gap check working on all observations (RJHD addition)

    For each calendar month the observations from every year are pooled,
    anomalised with the median and standardised with the IQR of the
    flag-filtered data.  A histogram of the standardised "complete year"
    values is fitted with a Gaussian (or a Gauss-Hermite series if GH is set)
    to obtain upper/lower thresholds.  If any observation lies beyond a
    threshold, dgc_find_gap is asked for the start of a gap in the histogram
    and every observation further out than that gap is flagged.

    Flag values written by this routine:
      1 - observation lies beyond a gap (upper tail, or lower tail when
          windspeeds is False)
      2 - tentative lower-tail flag, used when windspeeds is True
      0 - reset for observations identified as part of a storm (low value
          of the variable coinciding with high wind speed)

    Side effects: flags is modified in place (and also returned); when both
    windspeeds and doMonth are set the mask of station.windspeeds.data is
    modified in place; several messages are printed unconditionally; when
    plots is set a figure is shown for each processed month.

    :param station: station object; must have an attribute named by
        variable (and "windspeeds" if windspeeds is set), each providing
        .data and .mdi
    :param string variable: name of the station attribute to process
    :param array flags: flag array, indexed like the station data
    :param datetime start: start of dataset (start.year is used in messages)
    :param datetime end: end of dataset
    :param logfile: passed through to utils.print_flagged_obs_number
    :param bool plots: make figures
    :param bool diagnostics: extra output (also passed as noWrite when
        reporting the flag count)
    :param bool idl: use utils.idl_median rather than np.ma.median
    :param bool windspeeds: do the extra storm check on the lower tail
    :param bool GH: fit Gauss-Hermite polynomials instead of a plain Gaussian
    :param bool doMonth: mask the final incomplete year when building the
        distribution

    :returns: flags - the array passed in, updated
    '''

    if plots:
        import matplotlib.pyplot as plt

    # (start, end) index pairs for each month, grouped as [year, month, 2]
    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    # extract variable
    st_var = getattr(station, variable)
    # apply flags (and mask incomplete year if appropriate)
    all_filtered = utils.apply_filter_flags(st_var,
                                            doMonth=doMonth,
                                            start=start,
                                            end=end)

    # work on a copy so that masking the incomplete year does not touch the station's own data
    st_var_complete_year = copy.deepcopy(st_var)
    if doMonth:
        # restrict the incomplete year if appropriate - keep other flagged obs.
        full_year_end = utils.get_first_hour_this_year(start, end)
        st_var_complete_year.data.mask[full_year_end:] = True

    for month in range(12):

        # if requiring wind data, extract data and find monthly averages
        if windspeeds == True:
            st_var_wind = getattr(station, "windspeeds")

            if doMonth:
                # restrict the incomplete year if appropriate
                # NOTE(review): unlike st_var above this is not a copy, so the station's
                #   wind data mask is changed in place - confirm this is intended
                st_var_wind.data.mask[full_year_end:] = True

            # get monthly averages
            # (placeholder only - replaced on the first pass of the loop below)
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):

                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        # pull data from each calendar month together
        #   this_month_data     - everything available
        #   this_month_filtered - after applying the flag filter
        #   this_month_complete - with the incomplete year masked (if doMonth)
        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)
        this_month_complete, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var_complete_year.data, hours=False)

        # if enough clean and complete data for this calendar month find the median and IQR
        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(
                    this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(),
                                percentile=0.05)
                # this message is printed regardless of the diagnostics setting
                print "Spurious_stations file not yet sorted"

            # if have an IQR, anomalise using median and standardise using IQR
            # (if the fallback above still gave zero, this calendar month is skipped)
            if iqr != 0.0:

                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)
                complete_values = np.ma.array(
                    (this_month_complete.compressed() - monthly_median) / iqr)

                # use complete years only for the histogram - aiming to find outliers.
                # plot_bincenters uses a bin width ten times finer than bincenters
                bins, bincenters = utils.create_bins(complete_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(
                    complete_values, BIN_SIZE / 10.)
                hist, binEdges = np.histogram(complete_values, bins=bins)
                """
                Change to monthly updates Oct 2017
                Thought about changing distribution to use filtered values
                But this changes the test beyond just dealing with additional months
                Commented out lines below would be alternative.
                """
                # bins, bincenters = utils.create_bins(filtered_values, BIN_SIZE)
                # dummy, plot_bincenters = utils.create_bins(filtered_values, BIN_SIZE/10.)
                # hist, binEdges = np.histogram(filtered_values, bins = bins)

                # used filtered (incl. incomplete year mask) to determine the distribution.
                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    # Feb 2019 - if large amounts off centre, can affect initial values
                    # switched to median and MAD
                    initial_values = [
                        np.max(hist),
                        np.median(complete_values),
                        utils.mean_absolute_deviation(complete_values,
                                                      median=True),
                        stats.skew(complete_values),
                        stats.kurtosis(complete_values)
                    ]  # norm, mean, std, skew, kurtosis

                    # least-squares fit to the histogram, all bins given a weight of one
                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist,
                                   np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    # evaluate the fitted curve on the finer set of bin centres
                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    # working outwards from the peak, once the curve drops below
                    #   FREQUENCY_THRESHOLD/10 everything further out is pinned to that value
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    # NOTE(review): indexing good_values[0] / [-1] assumes the fitted curve
                    #   exceeds FREQUENCY_THRESHOLD somewhere - otherwise this raises IndexError
                    good_values = np.argwhere(
                        plot_gaussian > FREQUENCY_THRESHOLD)

                    # thresholds sit one unit outside the rounded extent of the fitted curve
                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(
                        plot_bincenters[good_values[-1]])

                    if diagnostics:
                        print hist
                        print res
                        print iqr, l_minimum_threshold, u_minimum_threshold

                # or just a standard Gaussian
                else:
                    # NOTE(review): every other spread estimate in this routine uses
                    #   utils.mean_absolute_deviation - confirm mean_absolute_value exists
                    #   in utils and is the intended function here
                    gaussian = utils.fit_gaussian(
                        bincenters,
                        hist,
                        max(hist),
                        mu=np.median(complete_values),
                        sig=utils.mean_absolute_value(complete_values))

                    # assume the same threshold value
                    # (i.e. lower threshold is simply the negative of the upper one)
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                    if diagnostics:
                        print hist
                        print gaussian
                        print iqr, u_minimum_threshold, 1. + utils.invert_gaussian(
                            FREQUENCY_THRESHOLD, gaussian)

                if plots:
                    dgc_set_up_plot(plot_gaussian,
                                    complete_values,
                                    variable,
                                    threshold=(u_minimum_threshold,
                                               l_minimum_threshold),
                                    sub_par="observations",
                                    GH=GH)

                    # res is only available from the Gauss-Hermite branch
                    if GH:
                        plt.figtext(
                            0.15,
                            0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],
                             res['kurtosis']),
                            color='k',
                            size='small')

                # now trying to find gaps in the distribution
                # count the observations (all available, not just filtered) beyond each threshold
                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                # gap_plot_values collects the standardised values of flagged obs for this month;
                #   it only exists when plots or diagnostics is set
                if plots or diagnostics: gap_plot_values = np.array([])

                # do one side of distribution and then other
                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             u_minimum_threshold)

                    # a return of zero from dgc_find_gap is treated as "no gap found"
                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            # not using filtered - checking all available data
                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.ma.where(
                                ((this_year_data - monthly_median) /
                                 iqr) > gap_start)

                            # flag and write straight back into the caller's array
                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                # month printed here is the 0-based loop index
                                # y + start.year presumably gives the calendar year - assumes the
                                #   first row of month_ranges belongs to start.year (TODO confirm)
                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Upper {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0]))
                                    print gap_cleaned_locations, this_year_data[
                                        gap_cleaned_locations]

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             l_minimum_threshold)

                    # a return of zero from dgc_find_gap is treated as "no gap found"
                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            # explicitly exclude masked observations on this side
                            gap_cleaned_locations = np.ma.where(
                                np.logical_and(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start, this_year_data.mask != True))
                            # add flag requirement for low pressure bit if appropriate

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                # month printed here is the 0-based loop index
                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Lower {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0]))
                                    print gap_cleaned_locations, this_year_data[
                                        gap_cleaned_locations]

                            # if doing SLP then do extra checks for storms
                            if windspeeds:
                                windspeeds_year = np.ma.array(
                                    st_var_wind.data[year[0]:year[1]])

                                # downgrade the flags just set on this side from 1 to 2
                                this_year_flags[
                                    gap_cleaned_locations] = 2  # tentative flags

                                # these two depend only on the calendar month, but are
                                #   recalculated for every year in the loop
                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                    MEAN)
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)

                                # need to ensure that this_year_data is less than slp_average, hence order of test
                                # storm = wind well above, and the variable well below, the monthly
                                #   average (both by more than MAD_THRESHOLD deviations)
                                storms, = np.ma.where((((windspeeds_year - windspeeds_month_average) / windspeeds_month_mad) > MAD_THRESHOLD) &\
                                                   (((slp_average - this_year_data) / slp_mad) > MAD_THRESHOLD))

                                # using IDL terminology
                                if len(storms) >= 2:
                                    # use the first difference series to find when there are gaps in
                                    # contiguous sequences of storm observations - want to split up into
                                    # separate storm events
                                    storm_1diffs = np.diff(storms)
                                    separations, = np.where(storm_1diffs != 1)

                                    # expand around storm signal so that all low SLP values covered, and unflagged
                                    if len(separations) >= 1:
                                        # printed regardless of the diagnostics setting
                                        print "  multiple storms in {} {}".format(
                                            y + start.year, month)

                                        # if more than one storm signal that month, then use intervals
                                        #    in the first difference series to expand around the first interval alone
                                        storm_start = 0
                                        storm_finish = separations[0] + 1
                                        first_storm = dgc_expand_storms(
                                            storms[storm_start:storm_finish],
                                            len(this_year_data))
                                        final_storms = copy.deepcopy(
                                            first_storm)

                                        for j in range(len(separations)):
                                            # then do the rest in a loop

                                            if j + 1 == len(separations):
                                                # final one
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                           1:],
                                                    len(this_year_data))
                                            else:
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                           1:separations[j +
                                                                         1] +
                                                           1],
                                                    len(this_year_data))

                                            final_storms = np.append(
                                                final_storms, this_storm)

                                    else:
                                        # else just expand around the signal by 6 hours either way
                                        final_storms = dgc_expand_storms(
                                            storms, len(this_year_data))

                                else:
                                    # zero or one storm observation - no expansion
                                    final_storms = storms

                                if len(storms) >= 1:
                                    # printed regardless of the diagnostics setting
                                    print "Tropical Storm signal in {} {}".format(
                                        y + start.year, month)
                                    # clears whatever flag value is held at the storm locations,
                                    #   not only the tentative ones set above
                                    this_year_flags[final_storms] = 0

                            # and write flags back into array
                            flags[year[0]:year[1]] = this_year_flags

                if plots:
                    # overlay the distribution of the flagged values for this month
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    # swap empty bins for 0.01 before plotting
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             'r-',
                             label='flagged',
                             where='mid')
                    import calendar
                    plt.text(0.1,
                             0.9,
                             calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    plt.legend(loc='lower center',
                               ncol=3,
                               bbox_to_anchor=(0.5, -0.2),
                               frameon=False,
                               prop={'size': 13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    # count every non-zero entry in flags (both 1 and tentative 2, and anything set
    #   before this routine was called)
    nflags, = np.where(flags != 0)
    utils.print_flagged_obs_number(logfile,
                                   "Distributional Gap All",
                                   variable,
                                   len(nflags),
                                   noWrite=diagnostics)

    return flags  # dgc_all_obs
コード例 #8
0
ファイル: neighbour_utils.py プロジェクト: rjhd2/HadISD_v2
def detect(station, neighbour, variable, flags, neighbour_count, start, end, distance = 0, diagnostics = False, plots = False):
    '''
    Detect which observations are outliers relative to one neighbour

    The target-minus-neighbour differences are compared, observation by
    observation, against five times the IQR of the differences for the
    matching calendar month (each monthly IQR has a floor of 2).  Both flags
    and neighbour_count are updated in place.  Nothing is done unless at
    least 100 observation times have data at both stations.

    :param MetVar station: station object (target)
    :param MetVar neighbour: station object (neighbour)
    :param string variable: which variable to process
    :param array flags: array to store how many neighbours think each obs is bad
    :param array neighbour_count: how many neighbours present at each obs
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param int distance: separation of target and neighbour
    :param bool diagnostics: extra output (not used in this routine)
    :param bool plots: make figures

    :returns: None
    '''

    FILTERING_FLAG_COL = {"temperatures":[0,1,4,5,8,12,16,20,27,41,44,58],
                          "dewpoints":[0,2,4,6,8,9,13,17,21,28,30,31,32,42,45,59],
                          "slp":[0,3,4,7,11,15,19,23,29,43,46,60],
                          "windspeeds":[0,4,10,14,18,22,56,62,63,64]} # not used, but ready for it.


    st_var = getattr(station, variable)
    neigh_var = getattr(neighbour, variable)

    # filter by flags - not all (no Climatological or Odd cluster), T record check not in D,
    # NOTE(review): only a flag total of exactly 1 is masked, and the target's flags are
    #   also applied to the neighbour - left as found, confirm against the specification
    total_flags = np.sum(station.qc_flags[:,FILTERING_FLAG_COL[variable]], axis = 1)
    st_filtered = np.ma.masked_where(total_flags == 1, st_var.data)
    neigh_filtered = np.ma.masked_where(total_flags == 1, neigh_var.data)

    # match the observation times
    # NOTE(review): .data is compared against the missing data indicator, so the flag
    #   mask applied above plays no part in the matching - left as found
    match = np.where(np.logical_and((st_filtered.data != st_var.mdi), (neigh_filtered.data != neigh_var.mdi)))

    # too few common observations to say anything - leave both arrays untouched
    if len(match[0]) < 100:
        return # detect

    # only needed once there is enough data to process
    month_ranges = utils.month_starts_in_pairs(start, end).reshape(-1,12,2) # in year-long sets of pairs.

    neighbour_count[match] += 1 # number of neighbours with data present

    # differences start fully masked; only the matched times are filled and unmasked
    differences = np.ma.zeros(len(st_filtered))
    differences.fill(st_var.mdi)
    differences.mask = True

    differences[match] = st_filtered.data[match] - neigh_filtered.data[match]
    differences.mask[match] = False

    all_iqrs = np.zeros(len(differences))
    monthly_iqrs = []
    # get monthly IQR values
    for month in range(12):

        this_month, dummy1, dummy2 = utils.concatenate_months(month_ranges[:,month,:], differences, hours = False)

        if len(this_month.compressed()) > 4:

            iqr = utils.IQR(this_month.compressed())
            if iqr <= 2.: iqr = 2.
        else:
            iqr = 2.

        monthly_iqrs.append(iqr)

        # and copy back into the array
        for year in month_ranges[:,month,:]:
            all_iqrs[year[0]:year[1]] = iqr

    if plots:
        # smallest of the twelve monthly values (min() of a single scalar would raise)
        plot_target_neigh_diffs_dist(differences, min(monthly_iqrs))

    dubious = np.ma.where(np.ma.abs(differences) > 5. * all_iqrs)

    if len(dubious[0]) >= 1:

        if variable == "slp":
            # check if they are storms
            # use the per-observation IQRs so that positive + negative == dubious
            positive = np.ma.where(differences > 5. * all_iqrs)
            negative = np.ma.where(differences < -5. * all_iqrs)

            # if majority negative (2/3) and separation > 100

            if (distance > 100.) and (float(len(positive[0]))/len(dubious[0]) < 0.333):

                if len(positive[0]) > 0:
                    flags[positive] += 1
                if len(negative[0]) > 0:
                    # NOTE(review): this removes the neighbour from the count at every matched
                    #   time, not only at the negative outliers - left as found, confirm intent
                    neighbour_count[match] -= 1

            else:
                flags[dubious] += 1
        else:

            flags[dubious] += 1

    return # detect