コード例 #1
def rsc_hourly_repeats(st_var, times, n_hrs, n_wday, diagnostics = False, plots = False, start = None):
    '''
    Repeat of same value at given hour for >N days

    The filtered data are folded into (days, 24).  For each hour of the day
    the days are walked in order; whenever the value at that hour changes,
    the run that just ended is flagged if it held more than n_hrs
    observations.  Whole-day repeats are delegated to rsc_whole_day_repeats.

    :param object st_var: station variable object
    :param array times: timestamps
    :param int n_hrs: number of hours to exceed
    :param int n_wday: number of whole days to exceed (passed on)
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param datetime start: start of data, handed to the plotting routine;
                           plots of individual streaks are skipped if None

    :returns:
       flags - array of flags for the hourly repeats
       day_flags - whatever rsc_whole_day_repeats returns
    '''

    # n_hrs fixed threshold - incomplete year irrelevant

    flags = np.zeros(len(st_var.data))

    hourly_data = utils.apply_filter_flags(st_var)
    hourly_data = hourly_data.reshape(-1, 24)
    hourly_times = times.reshape(hourly_data.shape)

    # getmaskarray always gives a full boolean array, even when nothing is masked
    hourly_mask = np.ma.getmaskarray(hourly_data)

    for hour in range(24):

        match_values = -999.
        # flat positions (day * 24 + hour) of the observations in the current
        # run; these index `flags` directly because hourly_times is just
        # `times` reshaped to (days, 24)
        match_locs = []

        for day in range(hourly_data.shape[0]):
            # for each day at each given hour
            if hourly_mask[day, hour]:
                continue

            if hourly_data[day, hour] != match_values:
                # if different value, check if string/streak above threshold
                if len(match_locs) > n_hrs:
                    # same tuple-of-arrays form that np.where would return
                    bad = (np.array(match_locs),)
                    flags[bad] = 1

                    if plots and start is not None:
                        # NOTE(review): st_var.time is not set in this function -
                        # confirm the variable object really carries it
                        rsc_diagnostics_and_plot(st_var.time.data, st_var.data, bad, st_var.name, start, plots = plots)

                # start a new run at this observation
                match_values = hourly_data[day, hour]
                match_locs = [day * 24 + hour]

            else:
                # if same value, extend the current run
                match_locs += [day * 24 + hour]

        # NOTE(review): a run still open at the end of the record is never
        # tested against n_hrs - kept as in the original

    day_flags = rsc_whole_day_repeats(hourly_data, n_wday, st_var, diagnostics = diagnostics, plots = plots)

    return flags, day_flags # rsc_hourly_repeats
コード例 #2
def rsc(station, var_list, flag_col, start, end, logfile, diagnostics = False, plots = False, doMonth = False):
    '''
    Wrapper for the individual repeating streak check tests

    For every variable that has some unflagged data this runs the
    straight-string check and the hourly/whole-day repeat checks, stores
    their results in the three QC flag columns given for that variable,
    reports the number of flagged observations and copies the flags onto
    the variable itself.

    :param object station: station to process
    :param list var_list: names of the variables to process
    :param list flag_col: for each variable, the three flag columns to fill
    :param datetime start: start of data
    :param datetime end: end of data
    :param file logfile: logfile to store output
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param bool doMonth: passed on to the filtering and straight-string check
    '''
    times = station.time.data

    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        # nothing to test if every observation is already filtered out
        if len(utils.apply_filter_flags(st_var).compressed()) == 0:
            continue

        wind = (variable == "windspeeds")
        winddir = (variable == "winddirs")

        filtered = utils.apply_filter_flags(st_var, doMonth = doMonth, start = start, end = end)
        reporting_resolution = utils.reporting_accuracy(filtered, winddir = winddir, plots = plots)

        limits = limits_dict[variable][reporting_resolution]

        columns = flag_col[v]

        # need to apply flags to st_var.flags each time for filtering
        station.qc_flags[:, columns[0]] = rsc_straight_strings(
            st_var, times, limits[0], limits[1], start, end,
            reporting = reporting_resolution, wind = wind,
            diagnostics = diagnostics, plots = plots,
            dynamic = True, doMonth = doMonth)

        # no effect of final incomplete year ("month" option) as limits[2] and limits[3] fixed
        hour_flags, day_flags = rsc_hourly_repeats(
            st_var, times, limits[2], limits[3],
            diagnostics = diagnostics, plots = plots)
        station.qc_flags[:, columns[1]] = hour_flags
        station.qc_flags[:, columns[2]] = day_flags

        for streak_type in range(3):
            flag_locs = np.where(station.qc_flags[:, columns[streak_type]] != 0)
            n_flagged = len(flag_locs[0])
            utils.print_flagged_obs_number(logfile, "Streak Check", variable, n_flagged, noWrite = diagnostics)

            # copy flags into attribute
            st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Streak Check")
コード例 #3
def rsc_get_straight_string_threshold(st_var, start, end, reporting = 0., diagnostics = False, plots = False, doMonth = False, old_threshold = 0):
    '''
    Derive threshold number for strings/streaks of repeating values

    Counts the length of every run of identical consecutive unmasked
    observations and hands the list of lengths to utils.get_critical_values.

    :param object st_var: station variable object
    :param datetime start: start of data
    :param datetime end: end of data
    :param float reporting: reporting accuracy (not used in this routine)
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param bool doMonth: passed to utils.apply_filter_flags with start and end
    :param float old_threshold: old threshold to use as comparison

    :returns:
       threshold - value returned by utils.get_critical_values
    '''
    all_filtered = utils.apply_filter_flags(st_var, doMonth = doMonth, start = start, end = end)

    # getmaskarray always gives a full boolean array, even when nothing is masked
    masked = np.ma.getmaskarray(all_filtered)

    # find and count the length of all repeating strings
    prev_value = st_var.mdi
    run_length = 0 # only the length of the current string is ever needed
    string_lengths = []

    # run through all obs, the inefficient (non-pythonic) way
    for o, obs in enumerate(all_filtered):

        if masked[o]:
            continue

        if obs != prev_value:
            # if different value to before, close the previous string
            # (the very first entry is therefore a zero-length string)
            string_lengths += [run_length]
            run_length = 1
        else:
            # if same value as before, note and continue
            run_length += 1

        prev_value = obs

    # NOTE(review): the string still open at the end of the record is never
    # added to string_lengths - kept as in the original

    if plots:
        title = "Straight String Distribution"
        line_label = st_var.name
        xlabel = "String length"
    else:
        title, line_label, xlabel = "", "", ""

    threshold = utils.get_critical_values(string_lengths, binmin = 1, binwidth = 1, plots = plots, diagnostics = diagnostics, title = title, line_label = line_label, xlabel = xlabel, old_threshold = old_threshold)

    if diagnostics:
        # single-argument call form prints the same under Python 2 and 3
        print("threshold {}".format(threshold))

    return threshold # rsc_get_straight_string_threshold
コード例 #4
ファイル: streaks.py プロジェクト: rjhd2/HadISD_v2
def rsc_hourly_repeats(st_var, times, n_hrs, n_wday, diagnostics = False, plots = False, start = None):
    '''
    Repeat of same value at given hour for >N days

    The filtered data are folded into (days, 24).  For each hour of the day
    the days are walked in order; whenever the value at that hour changes,
    the run that just ended is flagged if it held more than n_hrs
    observations.  Whole-day repeats are delegated to rsc_whole_day_repeats.

    :param object st_var: station variable object
    :param array times: timestamps
    :param int n_hrs: number of hours to exceed
    :param int n_wday: number of whole days to exceed (passed on)
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param datetime start: start of data, handed to the plotting routine;
                           plots of individual streaks are skipped if None

    :returns:
       flags - array of flags for the hourly repeats
       day_flags - whatever rsc_whole_day_repeats returns
    '''

    flags = np.zeros(len(st_var.data))

    hourly_data = utils.apply_filter_flags(st_var)
    hourly_data = hourly_data.reshape(-1, 24)
    hourly_times = times.reshape(hourly_data.shape)

    # getmaskarray always gives a full boolean array, even when nothing is masked
    hourly_mask = np.ma.getmaskarray(hourly_data)

    for hour in range(24):

        match_values = -999.
        # flat positions (day * 24 + hour) of the observations in the current
        # run; these index `flags` directly because hourly_times is just
        # `times` reshaped to (days, 24)
        match_locs = []

        for day in range(hourly_data.shape[0]):
            # for each day at each given hour
            if hourly_mask[day, hour]:
                continue

            if hourly_data[day, hour] != match_values:
                # if different value, check if string/streak above threshold
                if len(match_locs) > n_hrs:
                    # same tuple-of-arrays form that np.where would return
                    bad = (np.array(match_locs),)
                    flags[bad] = 1

                    if plots and start is not None:
                        # NOTE(review): st_var.time is not set in this function -
                        # confirm the variable object really carries it
                        rsc_diagnostics_and_plot(st_var.time.data, st_var.data, bad, st_var.name, start, plots = plots)

                # start a new run at this observation
                match_values = hourly_data[day, hour]
                match_locs = [day * 24 + hour]

            else:
                # if same value, extend the current run
                match_locs += [day * 24 + hour]

        # NOTE(review): a run still open at the end of the record is never
        # tested against n_hrs - kept as in the original

    day_flags = rsc_whole_day_repeats(hourly_data, n_wday, st_var, diagnostics = diagnostics, plots = plots)

    return flags, day_flags # rsc_hourly_repeats
コード例 #5
ファイル: streaks.py プロジェクト: rjhd2/HadISD_v2
def rsc(station, var_list, flag_col, start, end, logfile, diagnostics = False, plots = False):
    '''
    Wrapper for the individual repeating streak check tests

    For every variable that has some unflagged data this runs the
    straight-string check and the hourly/whole-day repeat checks, stores
    their results in the three QC flag columns given for that variable,
    reports the number of flagged observations and copies the flags onto
    the variable itself.

    :param object station: station to process
    :param list var_list: names of the variables to process
    :param list flag_col: for each variable, the three flag columns to fill
    :param datetime start: start of data
    :param datetime end: end of data
    :param file logfile: logfile to store output
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    '''
    times = station.time.data

    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        if len(utils.apply_filter_flags(st_var).compressed()) > 0:

            reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

            limits = limits_dict[variable][reporting_resolution]

            wind = False
            if variable == "windspeeds": wind = True

            # need to apply flags to st_var.flags each time for filtering
            station.qc_flags[:,flag_col[v][0]] = rsc_straight_strings(st_var, times, limits[0], limits[1], start, end, reporting = reporting_resolution, wind = wind, diagnostics = diagnostics, plots = plots, dynamic = True)

            station.qc_flags[:, flag_col[v][1]], station.qc_flags[:, flag_col[v][2]]= rsc_hourly_repeats(st_var, times, limits[2], limits[3], diagnostics = diagnostics, plots = plots)

            for streak_type in range(3):
                flag_locs = np.where(station.qc_flags[:, flag_col[v][streak_type]] != 0)

                # report once per column: with noWrite when running with
                # plots/diagnostics, with the routine's default otherwise
                if plots or diagnostics:
                    utils.print_flagged_obs_number(logfile, "Streak Check", variable, len(flag_locs[0]), noWrite = True)
                else:
                    utils.print_flagged_obs_number(logfile, "Streak Check", variable, len(flag_locs[0]))

                # copy flags into attribute
                st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Streak Check")
コード例 #6
ファイル: streaks.py プロジェクト: rjhd2/HadISD_v2
def rsc_get_straight_string_threshold(st_var, start, end, reporting = 0., diagnostics = False, plots = False, old_threshold = 0):
    '''
    Derive threshold number for strings/streaks of repeating values

    Counts the length of every run of identical consecutive unmasked
    observations and hands the list of lengths to utils.get_critical_values.

    :param object st_var: station variable object
    :param datetime start: start of data (not used in this routine)
    :param datetime end: end of data (not used in this routine)
    :param float reporting: reporting accuracy (not used in this routine)
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param float old_threshold: old threshold to use as comparison

    :returns:
       threshold - value returned by utils.get_critical_values
    '''
    all_filtered = utils.apply_filter_flags(st_var)

    # getmaskarray always gives a full boolean array, even when nothing is masked
    masked = np.ma.getmaskarray(all_filtered)

    # find and count the length of all repeating strings
    prev_value = st_var.mdi
    run_length = 0 # only the length of the current string is ever needed
    string_lengths = []

    # run through all obs, the inefficient (non-pythonic) way
    for o, obs in enumerate(all_filtered):

        if masked[o]:
            continue

        if obs != prev_value:
            # if different value to before, close the previous string
            # (the very first entry is therefore a zero-length string)
            string_lengths += [run_length]
            run_length = 1
        else:
            # if same value as before, note and continue
            run_length += 1

        prev_value = obs

    # NOTE(review): the string still open at the end of the record is never
    # added to string_lengths - kept as in the original

    if plots:
        title = "Straight String Distribution"
        line_label = st_var.name
        xlabel = "String length"
    else:
        title, line_label, xlabel = "", "", ""

    threshold = utils.get_critical_values(string_lengths, binmin = 1, binwidth = 1, plots = plots, diagnostics = diagnostics, title = title, line_label = line_label, xlabel = xlabel, old_threshold = old_threshold)

    return threshold # rsc_get_straight_string_threshold
コード例 #7
ファイル: records.py プロジェクト: wk1984/HadISD_v2
def krc(station, var_list, flag_col, logfile, diagnostics = False, plots = False):
    '''
    Run the known records check for each variable in list

    Observations above maxes[variable][region] or below
    mins[variable][region] are flagged, where region is looked up from the
    station id.

    :param object station: station to process
    :param list var_list: list of variables to process
    :param list flag_col: which columns to use for which variable
    :param file logfile: logfile to store output
    :param bool diagnostics: diagnostic output (only selects how the flag count is reported)
    :param bool plots: do the plots (only selects how the flag count is reported)
    '''
    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        st_region = krc_get_wmo_region(station.id)

        all_filtered = utils.apply_filter_flags(st_var)

        too_high = np.where(all_filtered > maxes[variable][st_region])
        krc_set_flags(too_high, station.qc_flags, flag_col[v])

        # make sure that don't flag the missing values!
        too_low = np.where(np.logical_and(all_filtered < mins[variable][st_region], all_filtered.mask == False ))
        krc_set_flags(too_low, station.qc_flags, flag_col[v])

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # report once: with noWrite when running with plots/diagnostics,
        # with the routine's default otherwise
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "World Record", variable, len(flag_locs[0]), noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "World Record", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "World Record Check")

    return # krc
コード例 #8
ファイル: distributional_gap.py プロジェクト: rjhd2/HadISD_v2
def dgc_monthly(station, variable, flags, start, end, plots=False, diagnostics=False, idl = False):
    '''
    Original Distributional Gap Check

    Monthly averages are standardised against the climatology of their
    calendar month.  Months whose standardised value is at least LARGE_LIMIT
    are flagged, and the sorted distribution is then walked outwards from
    its centre to flag a tail that sits beyond a substantial asymmetry.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (not used in this routine)
    :param bool idl: run IDL equivalent routines for median

    :returns:
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)

    # get monthly averages - every element is assigned in the loop below
    month_average = np.empty(month_ranges.shape[0])
    month_average_filtered = np.empty(month_ranges.shape[0])

    all_filtered = utils.apply_filter_flags(st_var)

    for m, month in enumerate(month_ranges):

        data = st_var.data[month[0]:month[1]]
        filtered = all_filtered[month[0]:month[1]]

        month_average[m] = dgc_get_monthly_averages(data, OBS_LIMIT, st_var.mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(filtered, OBS_LIMIT, st_var.mdi, MEAN)

    # get overall monthly climatologies - use filtered data
    month_average = month_average.reshape(-1,12)
    month_average_filtered = month_average_filtered.reshape(-1,12)

    # start from "missing" everywhere: only months with a usable climatology
    # are overwritten below, and the result is compared against mdi later
    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(st_var.mdi)

    for m in range(12):

        valid_filtered = np.where(month_average_filtered[:,m] != st_var.mdi)

        if len(valid_filtered[0]) >= VALID_MONTHS:

            valid_data = month_average_filtered[valid_filtered,m][0]

            if MEAN:
                clim = np.mean(valid_data)
                spread = np.std(valid_data)

            else:
                if idl:
                    # np.ma.compressed accepts plain and masked arrays alike
                    clim = utils.idl_median(np.ma.compressed(valid_data).reshape(-1))
                else:
                    clim = np.median(valid_data)

                spread = utils.IQR(valid_data)
                if spread <= SPREAD_LIMIT:
                    spread = SPREAD_LIMIT

            standardised_months[valid_filtered,m] = (month_average[valid_filtered,m] - clim) / spread

    standardised_months = standardised_months.reshape(month_ranges.shape[0])

    good_months = np.where(standardised_months != st_var.mdi)

    # must be able to do this with masked arrays
    if plots:
        bins, bincenters = utils.create_bins(standardised_months[good_months], BIN_SIZE)
        dummy, plot_bincenters = utils.create_bins(standardised_months[good_months], BIN_SIZE/10.)

        hist, binEdges = np.histogram(standardised_months[good_months], bins = bins)

        fit = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(standardised_months[good_months]), sig = np.std(standardised_months[good_months]))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        dgc_set_up_plot(plot_gaussian, standardised_months[good_months], variable, sub_par = "Months")

    # remove all months with a large standardised offset
    if len(good_months[0]) >= MONTH_LIMIT:

        standardised_months = np.ma.masked_values(standardised_months, st_var.mdi)
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:

            for lo in large_offsets[0]:
                flags[month_ranges[lo,0]:month_ranges[lo,1]] = 1

            if plots:
                hist, binEdges = np.histogram(standardised_months[large_offsets], bins = bins)
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters, plot_hist, 'g-', label = '> %i' % LARGE_LIMIT, where = 'mid', zorder = 5)

        # walk distribution from centre and see if any asymmetry
        sort_order = standardised_months[good_months].argsort()

        # sorted values and the month index each one belongs to
        sorted_values = standardised_months[good_months][sort_order]
        sorted_months = good_months[0][sort_order]

        # floor division keeps this usable as an index on Python 3 as well
        mid_point = len(good_months[0]) // 2

        # steps 1 .. mid_point-1; no steps at all when mid_point <= 1, which
        # would otherwise walk off the ends of the arrays
        for step in range(1, mid_point):

            lower = sorted_values[mid_point - step]
            upper = sorted_values[mid_point + step]

            if lower == upper:
                continue

            # using IDL notation
            tempvals = [np.abs(lower), np.abs(upper)]

            if min(tempvals) == 0:
                continue

            if max(tempvals)/min(tempvals) >= 2. and min(tempvals) >= 1.5:
                # substantial asymmetry in distribution - at least 1.5 from centre and difference of 2.

                if tempvals[0] == max(tempvals):
                    # LHS
                    bad = sorted_months[:mid_point - step]
                    if plots: badplot = sorted_values[:mid_point - step]
                elif tempvals[1] == max(tempvals):
                    # RHS
                    bad = sorted_months[mid_point + step:]
                    if plots: badplot = sorted_values[mid_point + step:]

                for b in bad:
                    flags[month_ranges[b,0]:month_ranges[b,1]] = 1

                if plots:
                    hist, binEdges = np.histogram(badplot, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label = 'Gap', where = 'mid', zorder = 4)

                # only the first asymmetry found is acted on
                break

        if plots:
            plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13})

    return flags # dgc_monthly
コード例 #9
ファイル: odd_cluster.py プロジェクト: wk1984/HadISD_v2
def occ(station, variable_list, flag_col, datastart, logfile, diagnostics = False, plots = False, second = False):
    '''
    Check for odd clusters of data surrounded by missing
        up to 6hr/24hr surrounded by at least 48 on each side

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime datastart: dataset start time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: do extra verbose output
    :param bool plots: do plots
    :param bool second: run for second time; flags already present in the
                        column are then left out of the reported count
    '''

    # the four options of what to do with each observation
    #   the keys give values which are subroutines, and can be called
    #   all subroutines have to take the same set of inputs
    options = {0 : occ_normal, 1 : occ_start_cluster, 2 : occ_in_cluster, 3 : occ_after_cluster}

    for v,variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        filtered_data = utils.apply_filter_flags(st_var)

        var_flags = station.qc_flags[:,flag_col[v]]

        prev_flag_number = 0
        if second:
            # count currently existing flags:
            prev_flag_number = len(var_flags[var_flags != 0])

        # using IDL copy as method to ensure reproducibility (initially)
        oc_details = OddCluster(st_var.mdi, st_var.mdi, 0, st_var.mdi, st_var.mdi, -1)

        obs_type = 1

        # the time values are used directly as positions into the data mask
        for time in station.time.data:

            if filtered_data.mask[time] == False:
                # process observation point using subroutines, called from named tuple

                if plots and (obs_type == 3) and (time - oc_details.end >= 48):
                    # do plotting if matches flagging criteria
                    oc_plots(station, oc_details, time, datastart, filtered_data, variable)

                oc_details, obs_type = options[obs_type](oc_details, obs_type, time, var_flags)

            else:
                # have missing data, so move the state on without
                # calling any of the subroutines
                if obs_type == 2:
                    obs_type = 3
                elif obs_type == 0:
                    obs_type = 1

        station.qc_flags[:,flag_col[v]] = var_flags

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # report once: with noWrite when running with plots/diagnostics,
        # with the routine's default otherwise
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Odd Cluster", variable, len(flag_locs[0]) - prev_flag_number, noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Odd Cluster", variable, len(flag_locs[0]) - prev_flag_number)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 032070 temperature 26/8/2014

    station = utils.append_history(station, "Isolated Odd Cluster Check")

    return # occ
コード例 #10
def clean_up(st_var,
    Clean up the remaining observations if many flagged or few left in a month

    :param MetVar st_var: input station object
    :param array flags: QC flags array
    :param array input_flag_cols: which columns to check over
    :param int out_flag_col: in which column to set the flags
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param array times: hourly time stamps
    :param bool plots: show plots

    month_ranges = utils.month_starts_in_pairs(start, end)

    total_flags = np.sum(flags[:, input_flag_cols], axis=1)

    filtered = utils.apply_filter_flags(st_var)

    # test each month
    for month in month_ranges:

        this_month = filtered[month[0]:month[1]]

        # if less than 20 obs, then flag remaining
        if len(this_month.compressed()) < 20:

            locs = np.where(this_month.mask == False)[0]

            # only flag those observations that actually exist.
            if len(locs) > 0:

                month_range = np.arange(month[0], month[1], dtype=("int"))

                flags[month_range[locs], out_flag_col] = 1

                if plots:

        # if 40% of obs flagged, then flag remainig

            good_locs = np.where(this_month.mask == False)[0]
            flag_locs = np.where(total_flags[month[0]:month[1]] > 0)[0]

            # good_locs - internal and neighbour flags already applied, so need to add
            #      these back in.
            proportion = float(
                len(flag_locs)) / float(len(good_locs) + len(flag_locs))

            if proportion > 0.4:

                month_range = np.arange(month[0], month[1], dtype=("int"))
                flags[month_range[good_locs], out_flag_col] = 1

                if plots:
                             extra_text="lots flagged")

    return  # clean_up
コード例 #11
def dcc(station,
    The diurnal cycle check.
    :param object station: the station object to be processed
    :param list variable_list: the variables to be processed
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column in the qc_flags array to work on
    :param file logfile: logfile to store outputs
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output

    # list of flags for each variable
    diurnal_flags = []

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # is this needed 21/08/2014
        #        reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags, but discount incomplete year - so that test values against these later.
        all_data = utils.apply_filter_flags(st_var)
        all_data = all_data.reshape(-1, 24)  # working in fulltimes.
        # apply flags - also apply to final incomplete year so that best values only use complete years
        filtered_data = utils.apply_filter_flags(st_var,
        filtered_data = filtered_data.reshape(-1, 24)  # working in fulltimes.
        number_of_days = filtered_data.shape[0]

        if plots:
            import matplotlib.pyplot as plt
            plot_data = np.ma.zeros(filtered_data.shape)
            plot_data.mask = True
#            best_estimate_counter = np.zeros(HOURS)

        diurnal_filtered_fits = np.zeros(filtered_data.shape[0], dtype=(int))
        diurnal_best_fits = np.zeros(st_var.data.shape[0], dtype=(int))
        diurnal_uncertainties = np.zeros(filtered_data.shape[0])

        for d, day in enumerate(all_data):
            '''enough observations and have large enough diurnal range '''
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. *
                                       obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)
                        '''Find differences for each shifted sine --> cost function'''
                        for h in range(HOURS):
                            diffs[h] = np.sum(
                                np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine,
                                                  1)  # matched to IDL SHIFT()

                        # and keep this for testing against the average value later
                        diurnal_best_fits[d] = np.argmin(diffs)

        for d, day in enumerate(filtered_data):
            '''enough observations and have large enough diurnal range '''
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. *
                                       obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)
                        '''Find differences for each shifted sine --> cost function'''
                        for h in range(HOURS):
                            diffs[h] = np.sum(
                                np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine,
                                                  1)  # matched to IDL SHIFT()

                        diurnal_filtered_fits[d] = np.argmin(diffs)

                        # default uncertainty is the average time resolution of the data
                        diurnal_uncertainties[d] = round(
                            float(HOURS) / len(day.compressed()))

                        if DYNAMIC_DIURNAL:
                            critical_value = min(diffs) + (
                                (max(diffs) - min(diffs)) * 0.33)

                            # centre so minimum in middle
                            diffs = np.roll(diffs,
                                            11 - diurnal_filtered_fits[d])

                            uncertainty = 1
                            while uncertainty < 11:
                                if (diffs[11 - uncertainty] > critical_value) and\
                                        (diffs[11 + uncertainty] > critical_value):
                                    # break if both sides greater than critical difference
                                    # when counting outwards
                                    #    see diurnal_example.py

                                uncertainty += 1

                            # check if uncertainty greater than time resolution for day
                            if uncertainty > diurnal_uncertainties[d]:
                                diurnal_uncertainties[d] = uncertainty

                        if plots:
                            #                            best_estimate_counter[np.argmin(diffs)] += 1
                            # scale daily data to range -1 -> 1, plot with random scatter for clarity
                            plot_data[d] = ((2 *
                                             (day - min(day.compressed())) /
                                             obs_daily_range) - 1.)
                                np.arange(24) + np.random.randn(24) * 0.25,
                                plot_data[d] + np.random.randn(24) * 0.05,

        if plots:
                            diurnal_filtered_fits != INTMDI)]))), 'r-')
            plt.xlim([-1, 25])
            plt.ylim([-1.2, 1.2])

        # dumb copy of IDL
        '''For each uncertainty range (1-6h) find median of cycle offset'''
        filtered_fits = np.zeros(6)
        for h in range(6):
            locs = np.where(diurnal_uncertainties == h + 1)

            if len(locs[0]) > 300:
                # filtered_fits[h] = int(np.median(diurnal_filtered_fits[locs]))
                # Numpy median gives average of central two values which may not be integer
                # 25/11/2014 use IDL style which gives lower value
                filtered_fits[h] = utils.idl_median(
        '''Build up range of cycles incl, uncertainty to find where best of best located'''

        hours = np.arange(24)
        hour_matches = np.zeros(24)
        diurnal_peak = -9
        number_estimates = 0
        for h in range(6):
            if filtered_fits[h] != -9:
                '''Store lowest uncertainty best fit as first guess'''
                if diurnal_peak == -9:
                    diurnal_peak = filtered_fits[h]
                    hours = np.roll(hours, 11 - int(diurnal_peak))
                    hour_matches[11 - (h + 1):11 + (h + 2)] = 1
                    number_estimates += 1

                centre, = np.where(hours == filtered_fits[h])

                if (centre[0] - h + 1) >= 0:
                    if (centre[0] + h + 1) <= 23:
                        hour_matches[centre[0] - (h + 1):centre[0] + h +
                                     2] += 1
                        hour_matches[centre[0] - (h + 1):] += 1
                        hour_matches[:centre[0] + h + 2 - 24] += 1
                    hour_matches[:centre[0] + h + 2] += 1
                    hour_matches[centre[0] - (h + 1):] += 1

                number_estimates += 1
        '''If value at lowest uncertainty not found in all others, then see what value is found by all others '''
        if hour_matches[
                11] != number_estimates:  # central estimate at 12 o'clock
            all_match = np.where(hour_matches == number_estimates)

            # if one is, then use it
            if len(all_match[0]) > 0:
                diurnal_peak = all_match[0][0]
                diurnal_peak = -9
        '''Now have value for best fit diurnal offset'''

        potentially_spurious = np.zeros(number_of_days)

        if diurnal_peak != -9:
            hours = np.arange(24)
            hours = np.roll(hours, 11 - int(diurnal_peak))
            for d in range(number_of_days):
                # and now going back to the unfiltered data
                if diurnal_best_fits[d] != INTMDI:
                    '''Checks if global falls inside daily value+/-range
                    rather than seeing if each day falls in global value+/-range'''

                    min_range = 11 - diurnal_uncertainties[d]
                    max_range = 11 + diurnal_uncertainties[d]
                    maxloc = np.where(hours == diurnal_best_fits[d])[0][0]

                    if maxloc < min_range or maxloc > max_range:
                        potentially_spurious[d] = 1
                        potentially_spurious[d] = 0

            # count number of good, missing and not-bad days
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points = 0
            total_not_miss = 0
            to_flag = np.zeros(number_of_days)

            for d in range(number_of_days):

                if potentially_spurious[d] == 1:

                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points += 1
                    total_not_miss += 1


                    if potentially_spurious[d] == 0:

                        n_good += 1
                        n_not_bad += 1
                        if n_miss != 0:
                            n_miss = 0
                        total_not_miss += 1

                    if potentially_spurious[d] == -999:

                        n_miss += 1
                        n_not_bad += 1
                        if n_good != 0:
                            n_good = 0

                    total_points += 1

                    if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):

                        if total_points >= 30:
                            if float(total_not_miss) / total_points >= 0.5:
                                to_flag[d - total_points:d] = 1

                        n_good = 0
                        n_miss = 0
                        n_not_bad = 0
                        total_points = 0
                        total_not_miss = 0

            dcc_flags = np.zeros(filtered_data.shape)

            for d in range(number_of_days):

                if to_flag[d] == 1:
                    good = np.where(filtered_data.mask[d, :] == False)
                    if len(good[0]) >= 1:
                        dcc_flags[d, good] = 1

            if diagnostics:
                print len(np.where(dcc_flags == 1)[0])
                print "currently matches IDL, but should all hours in days have flags set, not just the missing/flagged ones?"

            diurnal_flags += [dcc_flags]
            diurnal_flags += [np.zeros(filtered_data.shape)]

        station.qc_flags[:, flag_col[v]] = np.array(diurnal_flags).reshape(-1)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
                                       "Diurnal Cycle",

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD

                                    flag_col[variable_list == "temperatures"],
                                    "Diurnal Cycle",

    station = utils.append_history(station, "Diurnal Cycle Check")

    return  # dcc
コード例 #12
# fvc - Frequent Values Check: looks for histogram bins that hold an
# implausibly large share of the observations and flags the data in them.
# NOTE(review): this copy of the function is damaged - the signature stops
# after `station,`, the docstring has lost its quote delimiters, and many
# statements are cut off part-way through a call or have lost their `else:`
# line, so it will not compile as shown.  The comments added below describe
# only what the visible lines do; restore the missing text from the upstream
# source before relying on them.
def fvc(station,
    Check for certain values occurring more frequently than would be expected
    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots
    :param bool month: ignore months after last complete year/season for distribution

    MIN_DATA_REQUIRED = 500  # to create histogram for complete record
    MIN_DATA_REQUIRED_YEAR = 100  # to create histogram

    month_ranges = utils.month_starts_in_pairs(start, end)

    # regroup as (year, month-of-year, pair) so that whole years and seasons
    # can be sliced out by month position below
    # (presumably each pair is a [start, end) index range - TODO confirm)
    month_ranges_years = month_ranges.reshape(-1, 12, 2)

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # NOTE(review): call truncated - argument list missing
        reporting_accuracy = utils.reporting_accuracy(

        # apply flags - for detection only
        # NOTE(review): call truncated - remaining arguments missing
        filtered_data = utils.apply_filter_flags(st_var,

        for season in range(5):  # Year,MAM,JJA,SON,JF+D

            # thresholds = minimum bin counts used by the tests further down:
            #   [0] whole-record test, [1] per-year test at >=50% of the window,
            #   [2] per-year test at >=90% of the window
            # NOTE(review): the `else:` separating the all-year branch from the
            # seasonal branch appears to be missing after the first assignment
            if season == 0:
                # all year
                season_data = np.ma.masked_values(filtered_data.compressed(),
                thresholds = [30, 20, 10]

                thresholds = [20, 15, 10]
                season_data = np.ma.array([])

                for y, year in enumerate(month_ranges_years):
                    # churn through months extracting data, accounting for fdi and concatenating together
                    # NOTE(review): every concatenate call below is truncated
                    if season == 1:
                        season_data = np.ma.concatenate([
                    elif season == 2:
                        season_data = np.ma.concatenate([
                    elif season == 3:
                        season_data = np.ma.concatenate([
                    elif season == 4:
                        season_data = np.ma.concatenate([
                        season_data = np.ma.concatenate([

            season_data = season_data.compressed()

            if len(season_data) > MIN_DATA_REQUIRED:

                # bin width follows the reporting accuracy: 0.5 when the
                # accuracy is in (0, 0.5], 1.0 otherwise
                # NOTE(review): the `else:` between the two calls is missing
                if 0 < reporting_accuracy <= 0.5:  # -1 used as missing value
                    bins, bincenters = utils.create_bins(season_data, 0.5)
                    bins, bincenters = utils.create_bins(season_data, 1.0)

                hist, binEdges = np.histogram(season_data, bins=bins)

                if plots:
                    plot_hist, bincenters = fvc_plot_setup(season_data,
                                                           title="%s" %

                # 1 marks a bin suspected of holding a frequent value
                bad_bin = np.zeros(len(hist))

                # scan through bin values and identify bad ones
                for e, element in enumerate(hist):
                    # NOTE(review): this test uses `e > 3` while the per-year
                    # loop below uses `e >= 3` - confirm which is intended
                    if e > 3 and e <= (len(hist) - 3):
                        # don't bother with first three or last three bins
                        # window of the bin itself plus three either side
                        seven_bins = hist[e - 3:e + 3 + 1]
                        if (seven_bins[3]
                                == seven_bins.max()) and (seven_bins[3] != 0):
                            # is local maximum and != zero
                            if (seven_bins[3] / float(seven_bins.sum()) >=
                                    0.5) and (seven_bins[3] >= thresholds[0]):
                                # contains >50% of data and is greater than threshold
                                bad_bin[e] = 1

                            # for plotting remove good bins
                            # NOTE(review): the `else:` lines that owned the
                            # three plot_hist resets below are missing
                                if plots: plot_hist[e] = 1e-1
                            if plots: plot_hist[e] = 1e-1
                        if plots: plot_hist[e] = 1e-1

                if plots:
                    import matplotlib.pyplot as plt
                    plt.step(bincenters, plot_hist, 'r-', where='mid')

                # having identified possible bad bins, check each year in turn, on unfiltered data
                for y, year in enumerate(month_ranges_years):

                    # slice this year's (or season's) data and matching flags;
                    # month positions used: 0-11 year, 2-4 MAM, 5-7 JJA,
                    # 8-10 SON, and 0-1 plus 11 for JF+D
                    # NOTE(review): each year_flags statement is truncated
                    if season == 0:
                        # year
                        year_data = np.ma.masked_values(
                            st_var.data[year[0][0]:year[-1][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[0][0]:year[-1][-1],
                    elif season == 1:
                        year_data = np.ma.masked_values(
                            st_var.data[year[2][0]:year[4][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[2][0]:year[4][-1],
                    elif season == 2:
                        year_data = np.ma.masked_values(
                            st_var.data[year[5][0]:year[7][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[5][0]:year[7][-1],
                    elif season == 3:
                        year_data = np.ma.masked_values(
                            st_var.data[year[8][0]:year[10][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[8][0]:year[10][-1],
                    elif season == 4:
                        year_data = np.ma.concatenate([np.ma.masked_values(st_var.data[year[0][0]:year[1][-1]], st_var.fdi),\
                                                       np.ma.masked_values(st_var.data[year[-1][0]:year[-1][-1]], st_var.fdi)])
                        year_flags = np.append(

                    if len(year_data.compressed()) > MIN_DATA_REQUIRED_YEAR:

                        # NOTE(review): call truncated - bins argument missing
                        hist, binEdges = np.histogram(year_data.compressed(),

                        if plots:
                            plot_hist, bincenters = fvc_plot_setup(
                                title="%s - %s" %
                                (y + start.year, SEASONS[season]))

                        for e, element in enumerate(hist):

                            if bad_bin[e] == 1:
                                # only look at pre-identified bins

                                if e >= 3 and e <= (len(hist) - 3):
                                    # don't bother with first three or last three bins
                                    seven_bins = hist[e - 3:e + 3 +
                                    if (seven_bins[3] == seven_bins.max()
                                        ) and (seven_bins[3] != 0):
                                        # is local maximum and != zero
                                        # NOTE(review): unlike the whole-record
                                        # test above, these ratios are not
                                        # wrapped in float(); if hist holds
                                        # integer counts and true division is
                                        # not enabled for this module the ratio
                                        # truncates - confirm
                                        if (seven_bins[3]/seven_bins.sum() >= 0.5 and seven_bins[3] >= thresholds[1]) \
                                            or (seven_bins[3]/seven_bins.sum() >= 0.9 and seven_bins[3] >= thresholds[2]):
                                            # contains >50% or >90% of data and is greater than appropriate threshold

                                            # Flag these data
                                            bad_points = np.where(
                                                (year_data >= binEdges[e]) &
                                                (year_data < binEdges[e + 1]))
                                            year_flags[bad_points] = 1

                                        # for plotting remove good bins
                                        # NOTE(review): owning `else:` lines missing
                                            if plots: plot_hist[e] = 1e-1
                                        if plots: plot_hist[e] = 1e-1
                                    if plots: plot_hist[e] = 1e-1
                                if plots: plot_hist[e] = 1e-1

                        if diagnostics or plots:
                            nflags = len(np.where(year_flags != 0)[0])
                            print "{} {}".format(y + start.year, nflags)

                        # NOTE(review): body of the inner `if` is missing
                        if plots:
                            if nflags > 0:

                    # copy flags back
                    # NOTE(review): the left-hand side of every assignment
                    # below is truncated; only the column index and the
                    # right-hand side survive

                    if season == 0:
                                         flag_col[v]] = year_flags
                    elif season == 1:
                                         flag_col[v]] = year_flags
                    elif season == 2:
                                         flag_col[v]] = year_flags
                    elif season == 3:
                                         flag_col[v]] = year_flags
                    elif season == 4:
                        # JF+D was built from two separate slices, so the flags
                        # are split at the length of the first (months 0-1)
                        split = len(station.qc_flags[year[0][0]:year[1][-1],
                                         flag_col[v]] = year_flags[:split]
                                         flag_col[v]] = year_flags[split:]

        # every observation flagged in this variable's column, across all seasons
        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
                                       "Frequent Value",

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Frequent Values Check")

    return  # fvc
コード例 #13
def wind_rose_check(station, flag_col, start, end, logfile, plots = False, diagnostics = False, doMonth = False):
    '''
    Checks for large differences in the year-to-year wind-rose shape.

    A normalised histogram of wind direction is built for the complete
    (filtered) record and for every year.  The RMSE between each annual
    histogram and the complete one is collected, a Rician distribution is
    fitted to those RMSE values to give an initial threshold, and the first
    two-bin-wide gap in the RMSE histogram beyond that threshold is located.
    Years whose RMSE lies beyond the gap, and which hold more than 100
    observations, are flagged.

    :param MetStation station: station object
    :param int flag_col: which column to store the flags in
    :param datetime start: start of data
    :param datetime end: end of data
    :param file logfile: logfile to store outputs
    :param bool plots: run the plots
    :param bool diagnostics: run the diagnostics
    :param bool doMonth: passed on to utils.apply_filter_flags

    :returns: None - station.qc_flags[:, flag_col] and the flags attributes
       of station.windspeeds and station.winddirs are updated in place
    '''

    st_var_spd = getattr(station, "windspeeds")
    st_var_dir = getattr(station, "winddirs")

    direction = st_var_dir.data
    speed = st_var_spd.data
    flags = station.qc_flags[:,flag_col]

    # .mask may be the scalar `nomask`, which cannot be indexed; getmaskarray
    # always returns a full boolean array
    direction_mask = np.ma.getmaskarray(direction)
    speed_mask = np.ma.getmaskarray(speed)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges_years = month_ranges.reshape(-1,12,2)

    # NOTE(review): each annual slice stops at the first index of the twelfth
    # month pair (year[-1][0]) rather than its last (year[-1][-1]), so the
    # final month of each year looks to be excluded.  Kept as found so that
    # results are unchanged - confirm whether this is intended.
    year_slices = [slice(year[0][0], year[-1][0]) for year in month_ranges_years]

    filtered_direction = utils.apply_filter_flags(st_var_dir, doMonth = doMonth, start = start, end = end)
    filtered_speed = utils.apply_filter_flags(st_var_spd, doMonth = doMonth, start = start, end = end)

    # histogram of wind directions ( ~ unravelled wind-rose)
    # density replaces the removed `normed` keyword; the bins are equal width
    # so the normalisation is the same
    dir_bins = np.arange(0, 360 + DEGREEBINS, DEGREEBINS)
    full_hist, full_binEdges = np.histogram(filtered_direction.compressed(), bins = dir_bins, density = True)
    if diagnostics:
        print(full_hist)

    # use rmse as this is known (Chi-sq remains just in case)
    n_years = month_ranges_years.shape[0]
    rmse, chisq = -np.ma.ones([n_years]), -np.ma.ones([n_years])

    # run through each year to extract RMSE's
    for y, year_slice in enumerate(year_slices):

        year_directions = direction[year_slice].compressed()

        if len(year_directions) > 0:

            hist, dummy = np.histogram(year_directions, bins = dir_bins, density = True)

            chisq[y] = np.sum((full_hist-hist)**2/(full_hist+hist))/2.
            rmse[y] = np.sqrt(np.mean((full_hist-hist)**2))

        else:
            # no data this year - mask it out (assigning the masked constant
            # works even when the mask has not been expanded to an array)
            rmse[y] = np.ma.masked

    # now to bin up the differences and see what the fit is.
    # need to have values spread so can bin!
    if len(np.unique(rmse.compressed())) > 1:
        rmse_binEdges, rmse_bincenters = wind_create_bins(rmse)
        hist, rmse_binEdges = np.histogram(rmse,  bins = rmse_binEdges)

        norm = get_histogram_norm(rmse, rmse_binEdges)

        # inputs for fit
        mu = np.mean(rmse)
        std = np.std(rmse)

        # try to get decent fit to bulk of obs.
        fit = stats.rice.fit(rmse.compressed(), loc = 0, scale = np.ma.std(rmse))
        dist_pdf = stats.rice.pdf(rmse_bincenters, fit[:-2], loc=fit[-2], scale=fit[-1]) * norm

        # the Gaussian is only used for the diagnostic plot
        gaussian = utils.fit_gaussian(rmse_bincenters, hist, max(hist), mu = mu, sig = std)

        # invert Rician to find initial threshold, then hunt for first gap beyond
        if dist_pdf[-1] < PROB_THRESHOLD:
            # then curve has dropped below the threshold, so can find some updated ones.
            # NOTE(review): this stores a (negated) offset from the end of
            # the array, which is later compared against bin-centre values -
            # confirm whether rmse_bincenters[threshold] was intended.
            threshold = -np.where(dist_pdf[::-1] > PROB_THRESHOLD)[0][0]
        else:
            threshold = rmse_bincenters[-1]

        gap = rmse_bincenters[-1] # nothing should be beyond this

        # walk outwards from the histogram peak; stopping one short of the
        # last bin keeps the look-ahead to the next bin in range, and running
        # off the end means there is nothing to flag
        for b in range(np.argmax(hist), len(rmse_bincenters) - 1):

            if rmse_bincenters[b] < threshold:
                # not yet beyond the threshold - continue moving outwards
                continue

            if hist[b] == 0 and hist[b + 1] == 0:
                # found a gap - has to be two bins wide
                gap = rmse_bincenters[b]
                break

        # run through each year and flag those beyond the gap
        rmse_mask = np.ma.getmaskarray(rmse)

        for y, year_slice in enumerate(year_slices):

            if rmse_mask[y]:
                # year had no data, so nothing to assess
                continue

            if rmse[y] > gap:

                # only flag where there are observations
                good, = np.where(np.logical_or(~direction_mask[year_slice], ~speed_mask[year_slice]))

                if len(good) > 100:

                    # flags is a view onto station.qc_flags, and so is the slice
                    flags[year_slice][good] = 1

                    if diagnostics or plots:
                        print("Flagging {}  RMSE {} > {}".format(y+start.year, rmse[y], gap))

                else:
                    if diagnostics or plots:
                        print("{} beyond threshold (RMSE {} > {}) but retained as only {} observations\n".format(y+start.year, rmse[y], gap, len(good)))
                    logfile.write("{} beyond threshold but retained as only {} observations\n".format(y+start.year, len(good)))

            else:
                if diagnostics or plots:
                    print("{}".format(y+start.year))

        if plots:
            import matplotlib.pyplot as plt
            # plot underlying histogram
            plot_hist = np.array([float(x) if x != 0 else 1e-1 for x in hist])
            plt.step(rmse_binEdges[1:], plot_hist, color = 'k')

            # plot the Rician distribution on top
            plt.plot(rmse_bincenters, dist_pdf, "r-", label = "Rician")

            # plot the gaussian on top
            plt.plot(rmse_binEdges[1:], utils.gaussian(rmse_bincenters, gaussian), color = 'b', ls = ":", label = "Gaussian")
            plt.ylim([0.001, 2*max(plot_hist)])

            # plot the thresholds
            plt.axvline(threshold, color = 'g')
            plt.axvline(gap, color = 'r')

            # plot flagged values in different colour
            if len(rmse[rmse > gap]) > 0:
                plt.step(rmse_binEdges[1:][rmse_bincenters >= gap], plot_hist[rmse_bincenters >= gap], color = 'r')

            # prettify
            plt.xlabel("RMSE between complete record and each year")
            plt.title(station.id + " annual wind rose differences")
            plt.xlim([0, 1.1*np.ma.max(rmse)])
            plt.legend(loc = "lower right", frameon = False)

            # display and clear, otherwise the next figure is drawn on top
            plt.show()
            plt.clf()

            # plot all the annual wind roses, flattened out.
            bincenters = (full_binEdges[:-1] + full_binEdges[1:])/2.
            plt.plot(bincenters, full_hist, "k-", lw = 2)

            for y, year_slice in enumerate(year_slices):
                if len(speed[year_slice].compressed()) > 0:
                    hist, binEdges = np.histogram(direction[year_slice].compressed(),  bins = dir_bins, density = True)
                    plt.plot(bincenters, hist)

            plt.xlabel("Direction (degrees)")
            plt.show()

            # plot wind roses as wind roses

            plot_wind_rose(speed, direction, "{} - {}".format(station.id, "all years"))

            for y, year in enumerate(month_ranges_years):
                year_slice = year_slices[y]
                if len(speed[year_slice].compressed()) > 0:
                    plot_wind_rose(speed[year_slice], direction[year_slice], "{} - {}".format(station.id, start.year + y), label = "RMSE {:6.4f}\nThreshold {:6.4f}".format(rmse[y], gap))
                else:
                    print("no data for {}".format(year))

    # and apply the flags and output text

    flag_locs, = np.where(flags != 0)

    utils.print_flagged_obs_number(logfile, "Wind Rose Check", "windspeeds/dirs", len(flag_locs), noWrite=diagnostics)
    station.qc_flags[:,flag_col] = flags

    # and flag the variables
    station.windspeeds.flags[flag_locs] = 1
    station.winddirs.flags[flag_locs] = 1

    return # wind_rose_check
コード例 #14
ファイル: neighbour_checks.py プロジェクト: wk1984/HadISD_v2
def neighbour_checks(station_info, restart_id = "", end_id = "", distances=np.array([]), angles=np.array([]), second = False, masking = False, doZip=False, plots = False, diagnostics = False):
    '''
    Run through neighbour checks on list of stations passed

    For every station the internally-checked netCDF file is read, neighbours
    are selected, each variable in FLAG_OUTLIER_DICT is compared against its
    neighbours (flagging and unflagging), the clean-up test is run and the
    result is written out, optionally with a masked copy.

    :param array station_info: 2-d array of strings, one row per station - [ID, lat, lon, elev]
    :param str restart_id: ID of first station to process (default: first row)
    :param str end_id: ID of last station to process (default: last row)
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations.
    :param bool doZip: gzip the netCDF files once all stations are done
    :param bool plots: do plots (output goes to screen rather than logfile)
    :param bool diagnostics: do diagnostic output (output goes to screen rather than logfile)

    :raises ValueError: if restart_id or end_id is not found in station_info
    '''

    def _station_index(station_id):
        ''' Row of station_id in the full (untruncated) station_info array '''
        matches, = np.where(station_info[:,0] == station_id)
        if len(matches) == 0:
            raise ValueError("station {} not found in station_info".format(station_id))
        # plain int so it can be used as a slice bound and an offset
        return int(matches[0])

    first = not second

    # files from the second pass carry a "2" after the stage name
    pass_suffix = "2" if second else ""

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print("calculating distances and bearings matrix")
        distances, angles = get_distances_angles(station_info)

    # extract before truncate the array
    neighbour_elevations = np.array(station_info[:,3], dtype=float)
    neighbour_ids        = np.array(station_info[:,0])
    neighbour_info       = np.array(station_info[:,:])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex = _station_index(restart_id)

    stop = None # run to the end unless an earlier end station was requested
    if end_id != "":
        endindex = _station_index(end_id)
        if endindex != len(station_info) - 1:
            stop = endindex + 1

    # only the rows are truncated - columns still index the full station list
    station_info = station_info[startindex:stop]
    distances = distances[startindex:stop,:]
    angles = angles[startindex:stop,:]

    # process each neighbour
    for st, stat in enumerate(station_info):

        print(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S"))
        print("Neighbour Check")
        print("{:35s} {}".format("Station Identifier :", stat[0]))

        if not plots and not diagnostics:
            logfile = open(LOG_OUTFILE_LOCS+stat[0]+'.log','a') # append to file if second iteration.
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            # output goes to screen; placeholder is passed on but never written to here
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # read in the output of the internal checks for this pass
        internal_file = os.path.join(NETCDF_DATA_LOCS, station.id + "_internal" + pass_suffix + ".nc")

        if os.path.exists(internal_file + ".gz"):
            # if gzip file, unzip here
            subprocess.call(["gunzip", internal_file + ".gz"])
            time.sleep(5) # make sure it is unzipped before proceeding

        ncdfp.read(internal_file, station, process_vars, carry_thru_vars, diagnostics = diagnostics)

        if plots or diagnostics:
            print("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))
        else:
            logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

        match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # select neighbours
        neighbour_distances  = distances[st,:]
        neighbour_bearings   = angles[st,:]

        # have to add in start index so that can use location in distance file.
        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st+startindex, float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print("{:14s} {:10s} {:10s}".format("Neighbour","Distance","Elevation"))
            for n in neighbours:
                print("{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n]))

        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour","Distance","Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                #      but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print("Length of {} record: {}".format(variable, len(st_var.data.compressed())))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second = second, diagnostics = diagnostics, plots = plots)

                    # now read in final set of neighbours and process

                    neigh_flags = np.zeros(len(station.time.data)) # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data)) # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data)) # running sum of the neighbours' qc_flags column 31
                    reporting_accuracies = np.zeros(len(neighbours)) # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)]) # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal" + pass_suffix + ".nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id = False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print(neigh_details)

                        # neigh_flags and neigh_count are handed in so that they
                        # accumulate over the neighbours
                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance = neighbour_distances[nn_loc], diagnostics = diagnostics, plots = plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh,variable).data)

                        dpd_flags += neigh.qc_flags[:,31]
                    # gone through all neighbours

                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3),(neigh_flags[some_flags].astype("float")/neigh_count[some_flags] > 2./3.)))

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # copy into attribute
                        st_var.flags[some_flags[outlier_locs]] = 1

                    # print number flagged (reported even when it is zero)
                    if plots or diagnostics:
                        utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True)
                    else:
                        utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))

                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots = plots, diagnostics = diagnostics)

                else:
                    if plots or diagnostics:
                        print("No observations to assess for {}".format(variable))
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))

            # variable loop
        else:
            if plots or diagnostics:
                print("Fewer than 3 neighbours")
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S"))
        print("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")
        # clean up months

        qc_tests.clean_up.clu(station, ["temperatures","dewpoints","slp","windspeeds","winddirs"], [44,45,46,47,48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots = plots, diagnostics = diagnostics)

        if diagnostics or plots: raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external" + pass_suffix + ".nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile, FLAG_COL_DICT)

            # write to file
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask" + pass_suffix + ".nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)

        if plots or diagnostics:
            print("Masking completed\n")
            print(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            print("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            # finished with this station's log - do not leak the handle
            logfile.close()
    # looped through all stations

    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):
            subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal"+pass_suffix+".nc")])
            if masking:
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external"+pass_suffix+".nc")])
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask"+pass_suffix+".nc")])

    print("Neighbour Checks completed\n")

    return # neighbour_checks 
コード例 #15
def dgc_monthly(station, variable, flags, start, end, plots = False, diagnostics = False, idl = False):
    '''
    Original Distributional Gap Check, applied to monthly averages.

    The average of every month is standardised against the climatology of
    its calendar month.  Months with a large standardised offset, and
    months on the long side of an asymmetric distribution, have all of
    their observations flagged.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (modified in place and returned)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (accepted for interface
        compatibility; no diagnostic output is produced here)
    :param bool idl: run IDL equivalent routines for median

    :returns:
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    n_months = month_ranges.shape[0]

    # get monthly averages (every element is assigned in the loop below)
    month_average = np.empty(n_months)
    month_average_filtered = np.empty(n_months)

    all_filtered = utils.apply_filter_flags(st_var)
    for m, month in enumerate(month_ranges):

        data = st_var.data[month[0]:month[1]]
        filtered = all_filtered[month[0]:month[1]]

        month_average[m] = dgc_get_monthly_averages(data, OBS_LIMIT, st_var.mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(filtered, OBS_LIMIT, st_var.mdi, MEAN)

    # get overall monthly climatologies - use filtered data
    month_average = month_average.reshape(-1, 12)
    month_average_filtered = month_average_filtered.reshape(-1, 12)

    # start from the missing-data indicator so that months which are never
    # standardised are excluded by the "!= mdi" test further down
    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(st_var.mdi)

    for m in range(12):

        valid_years = np.where(month_average_filtered[:, m] != st_var.mdi)[0]

        if len(valid_years) >= VALID_MONTHS:

            valid_data = month_average_filtered[valid_years, m]

            if MEAN:
                clim = np.mean(valid_data)
                spread = np.std(valid_data)
            else:
                if idl:
                    # NOTE(review): the argument of this call was truncated
                    # in the source; the same data as the numpy branch is
                    # assumed - confirm against utils.idl_median.
                    clim = utils.idl_median(valid_data)
                else:
                    clim = np.median(valid_data)
                spread = utils.IQR(valid_data)
                # avoid dividing by a vanishing spread
                if spread <= SPREAD_LIMIT:
                    spread = SPREAD_LIMIT

            standardised_months[valid_years, m] = (month_average[valid_years, m] - clim) / spread

    standardised_months = standardised_months.reshape(n_months)

    good_months = np.where(standardised_months != st_var.mdi)

    # must be able to do this with masked arrays
    if plots:
        good_values = standardised_months[good_months]

        bins, bincenters = utils.create_bins(good_values, BIN_SIZE)
        dummy, plot_bincenters = utils.create_bins(good_values, BIN_SIZE / 10.)

        hist, binEdges = np.histogram(good_values, bins = bins)

        fit = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(good_values), sig = np.std(good_values))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        # NOTE(review): this call was missing from the source and is
        # reconstructed by analogy with dgc_all_obs - confirm that
        # "threshold" and "GH" are optional in dgc_set_up_plot.
        dgc_set_up_plot(plot_gaussian, good_values, variable, sub_par = "Months")

    # remove all months with a large standardised offset

    if len(good_months[0]) >= MONTH_LIMIT:

        standardised_months = np.ma.masked_values(standardised_months, st_var.mdi)
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:

            for lo in large_offsets[0]:
                flags[month_ranges[lo, 0]:month_ranges[lo, 1]] = 1

            if plots:

                hist, binEdges = np.histogram(standardised_months[large_offsets], bins = bins)
                # 0.01 keeps empty bins drawable
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                # NOTE(review): line style reconstructed; only the label
                # survived in the source.
                plt.step(bincenters, plot_hist, 'g-', label = '> %i' % LARGE_LIMIT, where = 'mid')

                plt.axvline(5, c = 'g')
                plt.axvline(-5, c = 'g')

        # walk distribution from centre and see if any asymmetry
        sort_order = standardised_months[good_months].argsort()
        sorted_values = standardised_months[good_months][sort_order]
        sorted_months = good_months[0][sort_order]

        # integer division: mid_point is used as an array index
        mid_point = len(good_months[0]) // 2

        # compare the values at equal distances either side of the centre;
        # stops at the first substantial asymmetry
        for step in range(1, mid_point):

            lower_value = sorted_values[mid_point - step]
            upper_value = sorted_values[mid_point + step]

            if lower_value != upper_value:
                # using IDL notation
                tempvals = [np.abs(lower_value), np.abs(upper_value)]

                if min(tempvals) != 0:
                    if max(tempvals) / min(tempvals) >= 2. and min(tempvals) >= 1.5:
                        # substantial asymmetry in distribution - at least 1.5 from centre and difference of 2.

                        if tempvals[0] == max(tempvals):
                            # LHS
                            bad = sorted_months[:mid_point - step]
                            if plots:
                                badplot = sorted_values[:mid_point - step]
                        elif tempvals[1] == max(tempvals):
                            # RHS
                            bad = sorted_months[mid_point + step:]
                            if plots:
                                badplot = sorted_values[mid_point + step:]

                        for b in bad:
                            flags[month_ranges[b, 0]:month_ranges[b, 1]] = 1

                        if plots:

                            hist, binEdges = np.histogram(badplot, bins = bins)
                            plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                            # NOTE(review): drawing call missing from the
                            # source; style copied from dgc_all_obs.
                            plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where = 'mid')

                        break

        if plots:
            plt.legend(loc = 'lower center',
                       bbox_to_anchor = (0.5, -0.2),
                       prop = {'size': 13})

    return flags  # dgc_monthly
コード例 #16
ファイル: streaks.py プロジェクト: rjhd2/HadISD_v2
def rsc_straight_strings(st_var, times, n_obs, n_days, start, end, wind = False, reporting = 0., diagnostics = False, plots = False, dynamic = True):
    '''
    Check for strings/streaks of repeating values

    :param object st_var: station variable object
    :param array times: timestamps, indexed like the observations; their
        differences are compared against n_days * 24, i.e. treated as hours
    :param int n_obs: number of observations to exceed
    :param int n_days: number of days to exceed
    :param datetime start: start of data
    :param datetime end: end of data
    :param float reporting: reporting accuracy
    :param bool wind: whether there is wind data to account for - extra minimum value
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param bool dynamic: calculate threshold of number of observations dynamically rather than using n_obs

    :returns: flags array as returned by rsc_annual_string_expectance
    '''

    # January 2015 - changed to dynamically calculating the thresholds, but only use if less than current ^RJHD

    if dynamic:
        threshold = rsc_get_straight_string_threshold(st_var, start, end, reporting = reporting, diagnostics = diagnostics, plots = plots, old_threshold = n_obs)

        if threshold < n_obs: n_obs = threshold

    all_filtered = utils.apply_filter_flags(st_var)
    flags = np.zeros(len(all_filtered))

    # Look for continuous straight strings
    prev_value = st_var.mdi
    string_points = []

    # storage for excess over years
    value_starts = []
    value_lengths = []

    # full boolean mask, so per-element lookup also works when nothing is masked
    is_masked = np.ma.getmaskarray(all_filtered)

    for o, obs in enumerate(all_filtered):

        if is_masked[o]:
            # masked observations neither extend nor break a streak
            continue

        if obs != prev_value:
            # if different value to before, which is long enough (and large enough for Wind)
            if len(string_points) >= 10:
                if wind == False or (wind == True and prev_value > WIND_MIN_VALUE[reporting]):
                    # note start and length for the annual excess test
                    value_starts.append(string_points[0])
                    value_lengths.append(len(string_points))

                    time_diff = times[string_points[-1]] - times[string_points[0]]

                    # if length above threshold and spread over sufficient time frame, flag
                    if (len(string_points) >= n_obs) or (time_diff >= (n_days * 24)): # measuring time in hours
                        flags[string_points] = 1

                        if plots or diagnostics:
                            rsc_diagnostics_and_plot(times, all_filtered, string_points, st_var.name, start, plots = plots)

            # the new value starts a fresh streak
            string_points = [o]

        else:
            # if same value as before, note and continue
            string_points.append(o)

        prev_value = obs

    # NOTE(review): a streak still open when the data end is never
    # evaluated by the loop above.

    # matches value_lengths 030660-99999, 1/7/2014 - seems to flag more though - compressed vs full time?

    flags = rsc_annual_string_expectance(all_filtered, np.array(value_starts), np.array(value_lengths), flags, start, end, st_var, times, diagnostics = diagnostics, plots = plots)

    return flags # rsc_straight_strings
コード例 #17
ファイル: diurnal_cycle.py プロジェクト: rjhd2/HadISD_v2
def dcc(station, variable_list, full_variable_list, flag_col, logfile, plots = False, diagnostics = False):
    '''
    The diurnal cycle check.

    For every day with enough observations a shifted sine curve is fitted
    to find the hour offset of the diurnal cycle.  A station-wide offset is
    then derived, and runs of days whose own fit disagrees with it are
    flagged.

    :param object station: the station object to be processed
    :param list variable_list: the variables to be processed
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column in the qc_flags array to work on
    :param file logfile: logfile to store outputs
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output

    :returns: None - station.qc_flags and the variables' flags are updated
    '''

    # sentinels: day without a usable verdict / uncertainty class without a fit
    no_verdict = -999
    no_fit = -9

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var)

        filtered_data = filtered_data.reshape(-1, 24) # working in fulltimes.
        number_of_days = filtered_data.shape[0]

        if plots:
            import matplotlib.pyplot as plt
            plot_data = np.ma.zeros(filtered_data.shape)
            plot_data.mask = True

        # INTMDI marks days for which no fit was made (tested below)
        diurnal_best_fits = np.zeros(number_of_days, dtype = int)
        diurnal_best_fits.fill(INTMDI)
        diurnal_uncertainties = np.zeros(number_of_days)

        for d, day in enumerate(filtered_data):

            day_obs = day.compressed()

            # enough observations and have large enough diurnal range
            if len(day_obs) >= OBS_PER_DAY:

                obs_daily_range = max(day_obs) - min(day_obs)
                if obs_daily_range >= DAILY_RANGE:
                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day_obs)
                        diffs = np.zeros(HOURS)

                        # Find differences for each shifted sine --> cost function
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1) # matched to IDL SHIFT()
                        diurnal_best_fits[d] = np.argmin(diffs)

                        # default uncertainty is the average time resolution of the data
                        diurnal_uncertainties[d] = round(float(HOURS) / len(day_obs))

                        if DYNAMIC_DIURNAL:
                            critical_value = min(diffs) + ((max(diffs) - min(diffs)) * 0.33)

                            # centre so minimum in middle
                            diffs = np.roll(diffs, 11 - diurnal_best_fits[d])
                            uncertainty = 1
                            while uncertainty < 11:
                                if (diffs[11 - uncertainty] > critical_value) and\
                                        (diffs[11 + uncertainty] > critical_value):
                                    # break if both sides greater than critical difference
                                    # when counting outwards
                                    #    see diurnal_example.py
                                    break

                                uncertainty += 1

                            # check if uncertainty greater than time resolution for day
                            if uncertainty > diurnal_uncertainties[d]:
                                diurnal_uncertainties[d] = uncertainty

                        if plots:
                            # scale daily data to range -1 -> 1, plot with random scatter for clarity
                            plot_data[d] = ((2 * (day - min(day_obs)) / obs_daily_range) - 1.)
                            plt.plot(np.arange(24)+np.random.randn(24)*0.25, plot_data[d]+np.random.randn(24)*0.05, 'k,')

        if plots:
            plt.plot(np.arange(24),np.roll(dcc_make_sine(), np.argmax(np.bincount(diurnal_best_fits[np.where(diurnal_best_fits != INTMDI)]))),'r-')

        # dumb copy of IDL

        # For each uncertainty range (1-6h) find median of cycle offset
        best_fits = np.zeros(6)
        best_fits.fill(no_fit)
        for h in range(6):
            locs = np.where(diurnal_uncertainties == h+1)

            if len(locs[0]) > 300:
                # Numpy median gives average of central two values which may not be integer
                # 25/11/2014 use IDL style which gives lower value
                best_fits[h] = utils.idl_median(diurnal_best_fits[locs])

        # Build up range of cycles incl, uncertainty to find where best of best located
        hours = np.arange(24)
        hour_matches = np.zeros(24)
        diurnal_peak = no_fit
        number_estimates = 0
        for h in range(6):
            if best_fits[h] != no_fit:

                # Store lowest uncertainty best fit as first guess
                if diurnal_peak == no_fit:
                    diurnal_peak = best_fits[h]
                    hours = np.roll(hours,11-int(diurnal_peak))
                    hour_matches[11-(h+1):11+(h+2)] = 1
                    number_estimates += 1

                # plain integer so that it is usable as a slice bound
                centre = int(np.where(hours == best_fits[h])[0][0])

                # NOTE(review): "centre - h + 1" is kept as in the source;
                # the slices below use centre - (h + 1) - confirm against
                # the IDL original before changing.
                if (centre - h + 1) >= 0:
                    if (centre + h + 1) <= 23:
                        hour_matches[centre - (h + 1) : centre + h + 2] += 1
                    else:
                        # window runs off the end - wrap to the start
                        hour_matches[centre - (h + 1) : ] += 1
                        hour_matches[ : centre + h + 2 - 24] += 1
                else:
                    # window starts before zero - negative start wraps to the end
                    hour_matches[: centre + h + 2] += 1
                    hour_matches[centre - (h + 1) :] += 1

                number_estimates += 1

        # If value at lowest uncertainty not found in all others, then see what value is found by all others
        if hour_matches[11] != number_estimates:  # central estimate at 12 o'clock
            all_match = np.where(hour_matches == number_estimates)

            # if one is, then use it
            if len(all_match[0]) > 0:
                diurnal_peak = all_match[0][0]
            else:
                diurnal_peak = no_fit

        # Now have value for best fit diurnal offset

        potentially_spurious = np.zeros(number_of_days)
        potentially_spurious.fill(no_verdict)

        if diurnal_peak != no_fit:
            hours = np.arange(24)
            hours = np.roll(hours,11-int(diurnal_peak))
            for d in range(number_of_days):
                if diurnal_best_fits[d] != INTMDI:

                    # Checks if global falls inside daily value+/-range
                    # rather than seeing if each day falls in global value+/-range

                    min_range = 11 - diurnal_uncertainties[d]
                    max_range = 11 + diurnal_uncertainties[d]
                    maxloc = np.where(hours == diurnal_best_fits[d])[0][0]

                    if maxloc < min_range or maxloc > max_range:
                        potentially_spurious[d] = 1
                    else:
                        potentially_spurious[d] = 0

            # count number of good, missing and not-bad days
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points = 0
            total_not_miss = 0
            to_flag = np.zeros(number_of_days)

            for d in range(number_of_days):

                if potentially_spurious[d] == 1:
                    # a bad day extends the current run and resets the
                    # counters that would end it
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points += 1
                    total_not_miss += 1

                else:

                    if potentially_spurious[d] == 0:

                        n_good += 1
                        n_not_bad += 1
                        if n_miss != 0:
                            n_miss = 0
                        total_not_miss += 1

                    if potentially_spurious[d] == no_verdict:

                        n_miss += 1
                        n_not_bad += 1
                        if n_good != 0:
                            n_good = 0

                    total_points += 1

                    # run ends after 3 good, 3 missing or 6 not-bad days
                    if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):

                        if total_points >= 30:
                            if float(total_not_miss)/total_points >= 0.5:
                                to_flag[d - total_points : d ] = 1
                        n_good = 0
                        n_miss = 0
                        n_not_bad = 0
                        total_points = 0
                        total_not_miss = 0

            dcc_flags = np.zeros(filtered_data.shape)

            for d in range(number_of_days):

                if to_flag[d] == 1:
                    good = np.where(filtered_data.mask[d,:] == False)
                    if len(good[0]) >= 1:
                        # NOTE(review): body was missing from the source;
                        # flagging the unmasked hours of the day is assumed.
                        dcc_flags[d, good[0]] = 1

            if diagnostics:
                # single-argument print() gives the same output as the print statement
                print(len(np.where(dcc_flags == 1)[0]))
                print("currently matches IDL, but should all hours in days have flags set, not just the missing/flagged ones?")

        else:
            # no station-wide diurnal offset found - nothing to flag
            dcc_flags = np.zeros(filtered_data.shape)

        # only this variable's flags, so the shape matches the qc_flags column
        station.qc_flags[:, flag_col[v]] = dcc_flags.reshape(-1)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]), noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD

    # NOTE(review): for a plain list, variable_list == "temperatures" is
    # False, which indexes flag_col[0] - confirm this is intended.
    utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Diurnal Cycle", plots = plots, diagnostics = diagnostics)

    station = utils.append_history(station, "Diurnal Cycle Check")
    return # dcc
コード例 #18
ファイル: distributional_gap.py プロジェクト: rjhd2/HadISD_v2
def dgc_all_obs(station, variable, flags, start, end, plots = False, diagnostics = False, idl = False, windspeeds = False, GH = False):
    '''
    RJHD addition working on all observations

    For each calendar month, all observations are standardised by the
    month's median and IQR.  A Gaussian (optionally Gauss-Hermite) fit to
    their histogram gives thresholds beyond which the first gap in the
    distribution is searched for; observations beyond that gap are flagged.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (modified in place and returned)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics
    :param bool idl: run IDL equivalent routines for median
    :param bool windspeeds: also compute wind speed statistics for the
        (unfinished) storm test
    :param bool GH: use a Gauss-Hermite fit instead of a plain Gaussian

    :returns:
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)
    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1,12,2)
    all_filtered = utils.apply_filter_flags(st_var)

    # defined up front so the closing diagnostic cannot hit an unbound name
    gap_plot_values = np.array([])

    for month in range(12):

        this_month_ranges = month_ranges[:,month,:]

        if windspeeds == True:
            st_var_wind = getattr(station, "windspeeds")

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(this_month_ranges):
                if y == 0:
                    windspeeds_month = np.ma.array(st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate([windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(windspeeds_month, median=True)

        this_month_data, dummy, dummy = utils.concatenate_months(this_month_ranges, st_var.data, hours = False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(this_month_ranges, all_filtered, hours = False)

        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(), percentile = 0.05)
                # single-argument print() gives the same output as the print statement
                print("Spurious_stations file not yet sorted")

            if iqr != 0.0:
                monthly_values = np.ma.array((this_month_data.compressed() - monthly_median) / iqr)

                bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(monthly_values, BIN_SIZE/10.)
                hist, binEdges = np.histogram(monthly_values, bins = bins)

                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    initial_values = [np.max(hist), np.mean(monthly_values), np.std(monthly_values), stats.skew(monthly_values), stats.kurtosis(monthly_values)] # norm, mean, std, skew, kurtosis
                    fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics = diagnostics)
                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD/10.)
                    if len(bad) > 0: plot_gaussian[mid_point:][bad[0]:] = FREQUENCY_THRESHOLD/10.

                    bad, = np.where(plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD/10.)
                    if len(bad) > 0: plot_gaussian[:mid_point][:bad[-1]] = FREQUENCY_THRESHOLD/10.

                    # extract threshold values (1-D indices give scalar thresholds)
                    good_values = np.where(plot_gaussian > FREQUENCY_THRESHOLD)[0]

                    l_minimum_threshold = round(plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(plot_bincenters[good_values[-1]])

                else:
                    gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(monthly_values), sig = np.std(monthly_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                if diagnostics:
                    if GH:
                        print(hist)
                        print(res)
                        print("{} {} {}".format(iqr, l_minimum_threshold, u_minimum_threshold))
                    else:
                        print(hist)
                        print(gaussian)
                        print("{} {} {}".format(iqr, u_minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)))

                if plots:
                    dgc_set_up_plot(plot_gaussian, monthly_values, variable, threshold = (u_minimum_threshold, l_minimum_threshold), sub_par = "observations", GH = GH)
                    if GH:
                        plt.figtext(0.15, 0.67, 'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %(res['mean'], res['dispersion'], res['skewness'], res['kurtosis']), color='k', size='small')

                uppercount = len(np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                # restarted for every month; only filled when plotting or diagnosing
                if plots or diagnostics: gap_plot_values = np.array([])

                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold)

                    if gap_start != 0:
                        for y, year in enumerate(this_month_ranges):
                            this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(((this_year_data - monthly_median) / iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics: gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold)

                    if gap_start != 0:
                        for y, year in enumerate(this_month_ranges):
                            this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(np.logical_and(((this_year_data - monthly_median) / iqr) < gap_start, this_year_data.mask != True))

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics: gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)

                            if windspeeds:
                                # NOTE(review): unfinished storm test - the
                                # tentative flags are set on a local copy
                                # after "flags" was written, and the values
                                # computed below are not used yet.
                                this_year_flags[gap_cleaned_locations] = 2 # tentative flags
                                slp_average = dgc_get_monthly_averages(this_month_data, OBS_LIMIT, st_var.mdi, MEAN)
                                slp_mad = utils.mean_absolute_deviation(this_month_data, median=True)
                                storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\
                                                   (((this_month_data - slp_average) / slp_mad) > 4.5))
                                if len(storms[0]) >= 2:
                                    storm_1diffs = np.diff(storms)
                                    separations = np.where(storm_1diffs != 1)

                                    #for sep in separations:

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins = bins)
                    # 0.01 keeps empty bins drawable
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid')
                    import calendar
                    plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes)
                    plt.legend(loc='lower center',ncol=3, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13})

    if diagnostics:
        # reports the values collected for the last month processed
        utils.print_flagged_obs_number("", "Distributional Gap", variable, len(gap_plot_values), noWrite=True)

    return flags # dgc_all_obs
コード例 #19
def dgc_all_obs(station, variable, flags, start, end, plots = False, diagnostics = False, idl = False, windspeeds = False, GH = False):
    '''
    RJHD addition working on all observations

    For each calendar month, all observations are standardised by the
    month's median and IQR.  A Gaussian (optionally Gauss-Hermite) fit to
    their histogram gives thresholds beyond which the first gap in the
    distribution is searched for; observations beyond that gap are flagged.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (modified in place and returned)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics
    :param bool idl: run IDL equivalent routines for median
    :param bool windspeeds: also compute wind speed statistics for the
        (unfinished) storm test
    :param bool GH: use a Gauss-Hermite fit instead of a plain Gaussian

    :returns:
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    all_filtered = utils.apply_filter_flags(st_var)

    # defined up front so the closing diagnostic cannot hit an unbound name
    gap_plot_values = np.array([])

    for month in range(12):

        this_month_ranges = month_ranges[:, month, :]

        if windspeeds == True:
            st_var_wind = getattr(station, "windspeeds")

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(this_month_ranges):

                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        this_month_data, dummy, dummy = utils.concatenate_months(
            this_month_ranges, st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            this_month_ranges, all_filtered, hours=False)

        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(
                    this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(),
                                percentile=0.05)

                # single-argument print() gives the same output as the print statement
                print("Spurious_stations file not yet sorted")

            if iqr != 0.0:
                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)

                bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(
                    monthly_values, BIN_SIZE / 10.)

                hist, binEdges = np.histogram(monthly_values, bins=bins)

                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    initial_values = [
                        np.max(hist),
                        np.mean(monthly_values),
                        np.std(monthly_values),
                        stats.skew(monthly_values),
                        stats.kurtosis(monthly_values),
                    ]  # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist,
                                   np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values (1-D indices give scalar thresholds)
                    good_values = np.where(
                        plot_gaussian > FREQUENCY_THRESHOLD)[0]

                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(
                        plot_bincenters[good_values[-1]])

                else:
                    gaussian = utils.fit_gaussian(bincenters,
                                                  hist,
                                                  max(hist),
                                                  mu=np.mean(monthly_values),
                                                  sig=np.std(monthly_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                if diagnostics:
                    if GH:
                        print(hist)
                        print(res)
                        print("{} {} {}".format(iqr, l_minimum_threshold,
                                                u_minimum_threshold))
                    else:
                        print(hist)
                        print(gaussian)
                        print("{} {} {}".format(
                            iqr, u_minimum_threshold,
                            1. + utils.invert_gaussian(FREQUENCY_THRESHOLD,
                                                       gaussian)))

                if plots:
                    dgc_set_up_plot(plot_gaussian,
                                    monthly_values,
                                    variable,
                                    threshold=(u_minimum_threshold,
                                               l_minimum_threshold),
                                    sub_par="observations",
                                    GH=GH)

                    if GH:
                        plt.figtext(
                            0.15,
                            0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],
                             res['kurtosis']),
                            color='k',
                            size='small')

                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                # restarted for every month; only filled when plotting or diagnosing
                if plots or diagnostics: gap_plot_values = np.array([])

                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             u_minimum_threshold)

                    if gap_start != 0:

                        for y, year in enumerate(this_month_ranges):

                            this_year_data = np.ma.array(
                                all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(
                                ((this_year_data - monthly_median) /
                                 iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             l_minimum_threshold)

                    if gap_start != 0:

                        for y, year in enumerate(this_month_ranges):

                            this_year_data = np.ma.array(
                                all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(
                                np.logical_and(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start, this_year_data.mask != True))

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                            if windspeeds:
                                # NOTE(review): unfinished storm test - the
                                # tentative flags are set on a local copy
                                # after "flags" was written, and the values
                                # computed below are not used yet.
                                this_year_flags[
                                    gap_cleaned_locations] = 2  # tentative flags

                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                    MEAN)
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)
                                storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\
                                                   (((this_month_data - slp_average) / slp_mad) > 4.5))

                                if len(storms[0]) >= 2:

                                    storm_1diffs = np.diff(storms)

                                    separations = np.where(storm_1diffs != 1)

                                    #for sep in separations:

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    # 0.01 keeps empty bins drawable
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             'r-',
                             label='flagged',
                             where='mid')
                    import calendar
                    plt.text(0.1,
                             0.9,
                             calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    plt.legend(loc='lower center',
                               ncol=3,
                               bbox_to_anchor=(0.5, -0.2),
                               frameon=False,
                               prop={'size': 13})

    if diagnostics:
        # reports the values collected for the last month processed
        utils.print_flagged_obs_number("",
                                       "Distributional Gap",
                                       variable,
                                       len(gap_plot_values),
                                       noWrite=True)

    return flags  # dgc_all_obs
コード例 #20
ファイル: climatological.py プロジェクト: rjhd2/HadISD_v2
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    '''
    Climatological check.

    For every variable and calendar month, build a winsorised hourly
    climatology, normalise the anomalies by half their IQR, low-pass filter
    them, fit a Gaussian to their histogram and hand the resulting threshold
    and gap positions to coc_find_and_apply_flags.  Flags are written to
    station.qc_flags[:, flag_col[v]] and then copied (as 1) into st_var.flags.

    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array (one per variable)
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots
    :param bool idl: use the calculations written to match the IDL version

    :returns: None - the station object is modified in place
    '''
    for v, variable in enumerate(variable_list):
        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)
        # is this needed 13th Nov 2014 RJHD
        #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1,12,2)

        for month in range(12):
            hourly_climatologies = np.zeros(24)

            # append all e.g. Januaries together
            this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], all_filtered, hours = True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs.
            this_month = np.ma.array(this_month)
            this_month = this_month.reshape(-1,24)

            this_month_filtered = np.ma.array(this_month_filtered)
            this_month_filtered = this_month_filtered.reshape(-1,24)

            # get hourly climatology for each month
            for hour in range(24):
                this_hour = this_month[:,hour]

                # need to have data if this is going to work!
                if len(this_hour.compressed()) > 0:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        # an extra dummy value is appended before winsorizing,
                        #   hence the (len - 1) divisor
                        this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.sum(this_hour)/(len(this_hour) - 1)
                    else:
                        this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.mean(this_hour)

            if len(this_month.compressed()) > 0:
                # can get stations with few obs in a particular variable.

                # anomalise each hour over month appropriately
                anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0],1))
                anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0],1))

                if len(anomalies.compressed()) >= 10:
                    iqr = utils.IQR(anomalies.compressed().reshape(-1))/2.  # to match IDL
                    if iqr < 1.5: iqr = 1.5
                else:
                    # too few anomalies to estimate a spread
                    iqr = st_var.mdi

                normed_anomalies = anomalies / iqr
                normed_anomalies_filtered = anomalies_filtered / iqr

                # get average anomaly for year
                year_ids = np.array(year_ids)
                monthly_vqvs = np.ma.zeros(month_ranges.shape[0])
                monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])]
                for year in range(month_ranges.shape[0]):
                    year_locs = np.where(year_ids == year)
                    this_year = normed_anomalies_filtered[year_locs,:]

                    if len(this_year.compressed()) > 0:
                        # need to have data for this to work!
                        if idl:
                            monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1))
                        else:
                            monthly_vqvs[year] = np.ma.median(this_year)
                    else:
                        # no data in this year - mask its entry
                        monthly_vqvs.mask[year] = True

                # low pass filter
                normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0])

                # copy from distributional_gap.py - refactor!
                # get the threshold value
                bins, bincenters = utils.create_bins(normed_anomalies, 1.)

                hist, binEdges = np.histogram(normed_anomalies, bins = bins)

                gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(normed_anomalies), sig = np.std(normed_anomalies))
                minimum_threshold = round(1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

                if diagnostics:
                    print("{} {} {}".format(iqr, minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)))
                    print(gaussian)
                    print(hist)

                if plots:
                    coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = minimum_threshold, sub_par = "observations")

                uppercount = len(np.where(normed_anomalies > minimum_threshold)[0])
                lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0])

                these_flags = station.qc_flags[:, flag_col[v]]
                gap_plot_values, tentative_plot_values = [], []

                # find the gaps and apply the flags

                gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size = 1) # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size = 1) # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                station.qc_flags[:, flag_col[v]] = these_flags

                if uppercount + lowercount > 1000:
                    #print "not sorted spurious stations yet"
                    pass

                if plots:
                    import matplotlib.pyplot as plt
                    hist, binEdges = np.histogram(tentative_plot_values, bins = bins)
                    # empty bins drawn at 0.01 rather than 0
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, c='orange', ls='-', label = 'tentative', where='mid')

                    hist, binEdges = np.histogram(gap_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid')
                    import calendar
                    plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes)
                    leg=plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13},labelspacing=0.15,columnspacing=0.5)
                    plt.setp(leg.get_title(), fontsize=14)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # report totals, then the split between firm (1) and tentative (2) flags
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite = True)
            print("where\n")
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags, noWrite = True)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags, noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]))
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags)

        # firm flags match 030220
    station = utils.append_history(station, "Climatological Check")

    return # coc
コード例 #21
ファイル: variance.py プロジェクト: wk1984/HadISD_v2
def evc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    '''
    Excess variance check.

    For every variable and calendar month, the spread (via
    utils.mean_absolute_deviation) of the normalised anomalies is calculated
    for each year and compared with the median over all years.  A month of a
    year whose spread departs from that median by more than a threshold number
    of half-IQRs has all its entries in station.qc_flags[:, flag_col[v]] set
    to 1.  For "slp" and "windspeeds" the month is left unflagged when
    coincident high wind / low pressure is found, or when the first-difference
    series contains a run of 10 or more same-signed steps.

    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array (one per variable)
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots
    :param bool idl: use the calculations written to match the IDL version

    :returns: None - the station object is modified in place
    '''
    if plots or diagnostics:
        import matplotlib.pyplot as plt
        import calendar

    # very similar to climatological check - ensure that not duplicating
    for v, variable in enumerate(variable_list):
        st_var = getattr(station, variable)
        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        reporting_freq = utils.reporting_frequency(utils.apply_filter_flags(st_var))
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1,12,2)

        month_data_count = np.zeros(month_ranges.shape[0:2])

        # for each month
        for month in range(12):

            # set up hourly climatologies
            hourly_clims = np.zeros(24)

            # all years of this calendar month appended together, plus a
            #   per-row year index and the per-year data counts
            this_month, year_ids, month_data_count[:,month] = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True)

            # winsorize and get hourly climatology
            for h in range(24):
                this_hour = this_month[:,h]
                if len(this_hour.compressed()) > 100:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour_winsorized = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl)
                        hourly_clims[h] = np.ma.sum(this_hour_winsorized)/(len(this_hour_winsorized) - 1)
                    else:
                        this_hour_winsorized = utils.winsorize(this_hour.compressed(), 0.05, idl = idl)
                        hourly_clims[h] = np.ma.mean(this_hour_winsorized)

            # NOTE(review): hourly_clims starts as zeros, so this mask only bites
            #   if something sets entries to fill_value - confirm against upstream
            hourly_clims = np.ma.masked_where(hourly_clims == st_var.data.fill_value, hourly_clims)
            anomalies = this_month - np.tile(hourly_clims, (this_month.shape[0], 1))

            # extract IQR of anomalies (using 1/2 value to match IDL)
            if len(anomalies.compressed()) >= 10:
                iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2. # to match IDL
                if iqr < 1.5: iqr = 1.5
            else:
                # too few anomalies to estimate a spread
                iqr = st_var.mdi

            normed_anomalies = anomalies / iqr

            variances = np.ma.zeros(month_ranges.shape[0])
            variances.mask = [False for i in range(month_ranges.shape[0])]
            rep_accuracies = np.zeros(month_ranges.shape[0])
            rep_freqs = np.zeros(month_ranges.shape[0])
            year_ids = np.array(year_ids)

            # extract variance of normalised anomalies for each year
            for y, year in enumerate(range(month_ranges.shape[0])):
                year_locs = np.where(year_ids == y)
                this_year = normed_anomalies[year_locs,:]
                this_year = this_year.reshape(-1)
            # end of similarity with Climatological check
                if len(this_year.compressed()) >= 30:
                    variances[y] = utils.mean_absolute_deviation(this_year, median = True)
                    rep_accuracies[y] = utils.reporting_accuracy(this_year)
                    rep_freqs[y] = utils.reporting_frequency(this_year)
                else:
                    # not enough data in this year - exclude it
                    variances.mask[y] = True

            good = np.where(month_data_count[:,month] >= 100)

            # get median and IQR of variance for all years for this month
            if len(good[0]) >= 10:
                median_variance = np.median(variances[good])
                iqr_variance = utils.IQR(variances[good]) / 2. # to match IDL
                if iqr_variance < 0.01: iqr_variance = 0.01
            else:
                # too few usable years - the mdi values disable the test below
                median_variance = st_var.mdi
                iqr_variance = st_var.mdi

            # if SLP, then get median and MAD of SLP and windspeed for month
            if variable in ["slp", "windspeeds"]:
                winds = getattr(station, "windspeeds")
                slp = getattr(station, "slp")

                # refactor this as similar in style to how target data extracted
                for y, year in enumerate(range(month_ranges.shape[0])):
                    if y == 0:
                        winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        winds_month = winds_year.reshape(-1,24)
                        slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        slp_month = slp_year.reshape(-1,24)
                    else:
                        winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        winds_year = winds_year.reshape(-1,24)
                        winds_month = np.ma.concatenate((winds_month, winds_year), axis = 0)
                        slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        slp_year =  slp_year.reshape(-1,24)
                        slp_month = np.ma.concatenate((slp_month, slp_year), axis = 0)

                median_wind = np.ma.median(winds_month)
                median_slp  = np.ma.median(slp_month)
                wind_MAD = utils.mean_absolute_deviation(winds_month.compressed())
                slp_MAD = utils.mean_absolute_deviation(slp_month.compressed())
                if diagnostics:
                    print("median windspeed {} m/s, MAD = {}".format(median_wind, wind_MAD))
                    print("median slp {} hPa, MAD = {}".format(median_slp, slp_MAD))

            # now test to see if variance exceeds expected range
            for y, year in enumerate(range(month_ranges.shape[0])):

                if (variances[y] != st_var.mdi) and (iqr_variance != st_var.mdi) and \
                    (median_variance != st_var.mdi) and (month_data_count[y,month] >= DATA_COUNT_THRESHOLD):

                    # if SLP, then need to test if deep low pressure ("hurricane/storm") present
                    #   as this will increase the variance for this month + year
                    if variable in ["slp", "windspeeds"]:
                        iqr_threshold = 6.
                        # increase threshold if reporting frequency and resolution of this
                        #   year doesn't match average
                        if (rep_accuracies[y] != reporting_resolution) and \
                            (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 8.

                        if diagnostics:
                            print("{} {} {} {} {} {} {}".format(np.abs(variances[y] - median_variance) / iqr_variance, variances[y], median_variance, iqr_variance, iqr_threshold, month+1, year+start.year))

                        if np.abs((variances[y] - median_variance) / iqr_variance) > iqr_threshold:
                            # check for storms - from here on winds_month/slp_month
                            #   hold this single month only
                            winds_month = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                            slp_month = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                            storm = False
                            if (len(winds_month.compressed()) >= 1) and (len(slp_month.compressed()) >= 1):
                                # locations where winds greater than threshold
                                high_winds, = np.where((winds_month - median_wind)/wind_MAD > MAD_THRESHOLD)
                                # and where SLP less than threshold
                                low_slps, = np.where((median_slp - slp_month)/slp_MAD > MAD_THRESHOLD)

                                # if any locations match, then it's a storm
                                match_loc = high_winds[np.in1d(high_winds, low_slps)]
                                if len(match_loc) > 0:
                                    storm = True
                            else:
                                print("write spurious")

                            # check the SLP first difference series
                            #   to ensure a drop down and climb out of minimum SLP/or climb up and down from maximum wind speed
                            if variable == "slp":
                                diffs = np.diff(slp_month.compressed())
                            elif variable == "windspeeds":
                                diffs = np.diff(winds_month.compressed())

                            # longest runs of consecutive falls (negs) and rises (poss);
                            #   a run is only recorded when the sign changes
                            negs, poss = 0,0
                            biggest_neg, biggest_pos = 0,0
                            for diff in diffs:
                                if diff > 0:
                                    if negs > biggest_neg: biggest_neg = negs
                                    negs = 0
                                    poss += 1
                                else:
                                    if poss > biggest_pos: biggest_pos = poss
                                    poss = 0
                                    negs += 1

                            if (biggest_neg < 10) and (biggest_pos < 10) and not storm:
                                # not a hurricane, so mask
                                station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1
                                if plots or diagnostics:
                                    print("No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year))
                                else:
                                    logfile.write("No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year))
                            else:
                                # hurricane
                                if plots or diagnostics:
                                    print("Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year))
                                else:
                                    logfile.write("Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year))

                            if plots:
                                # plot showing the pressure, pressure first differences and the wind speeds
                                plot_times = utils.times_hours_to_datetime(station.time.data[month_ranges[year,month][0]:month_ranges[year,month][1]], start)

                                evc_plot_slp_wind(plot_times, slp_month, diffs, median_slp, slp_MAD, winds_month, median_wind, wind_MAD)

                    else:
                        iqr_threshold = 8.
                        if (rep_accuracies[y] != reporting_resolution) and \
                            (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 10.

                        if np.abs(variances[y] - median_variance) / iqr_variance > iqr_threshold:
                            if diagnostics:
                                print("flagging {} {}".format(year+start.year,calendar.month_name[month+1]))
                            # remove the data
                            station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1

            if plots:
                plot_variances = (variances - median_variance) / iqr_variance

                plot_variances = np.ma.masked_where(month_data_count[:,month] < DATA_COUNT_THRESHOLD,plot_variances)
                # NOTE(review): iqr_threshold is only assigned inside the year loop
                #   above, so it is whatever the last tested year left behind
                evc_plot_hist(plot_variances, iqr_threshold, "Variance Check - %s - %s" % (variable, calendar.month_name[month+1]))

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]), noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    # matches 030660 for T, D and SLP 21/8/2014

    station = utils.append_history(station, "Excess Variance Check")

    return # evc
コード例 #22
ファイル: clean_up.py プロジェクト: rjhd2/HadISD_v2
def clean_up(st_var, flags, input_flag_cols, out_flag_col, start, end, times, plots=False):
    # NOTE(review): the lines down to ":param bool plots" read as the body of a
    #   docstring whose opening and closing triple quotes are missing here -
    #   restore the delimiters from the upstream file before running this.
    Clean up the remaining observations if many flagged or few left in a month

    :param MetVar st_var: input station object
    :param array flags: QC flags array
    :param array input_flag_cols: which columns to check over
    :param int out_flag_col: in which column to set the flags
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param array times: hourly time stamps
    :param bool plots: show plots

    # (start, end) index pairs, one per month of the record
    month_ranges = utils.month_starts_in_pairs(start, end)

    # per-observation sum of the flags over the columns being checked
    total_flags = np.sum(flags[:, input_flag_cols], axis=1)

    # the variable's data after utils.apply_filter_flags
    filtered = utils.apply_filter_flags(st_var)

    # test each month
    for month in month_ranges:

        this_month = filtered[month[0] : month[1]]

        # if less than 20 obs, then flag remaining
        if len(this_month.compressed()) < 20:

            # positions within the month that are still unmasked
            locs = np.where(this_month.mask == False)[0]

            # only flag those observations that actually exist.
            if len(locs) > 0:

                # absolute indices of this month, so the month-relative locs
                #   can be mapped back onto the full flags array
                month_range = np.arange(month[0], month[1], dtype=("int"))

                flags[month_range[locs], out_flag_col] = 1

                # NOTE(review): the line that opens the plotting call (and the
                #   one that closes it) is missing here; only its arguments remain.
                if plots:
                        times, st_var.data, month_range[0], month_range[-1], start, st_var.name, extra_text="few_obs"

        # if 40% of obs flagged, then flag remaining
        # NOTE(review): the indentation below suggests this section was the
        #   "else:" branch of the "< 20 obs" test above and that the "else:"
        #   line itself has been lost - confirm against upstream.

            good_locs = np.where(this_month.mask == False)[0]
            flag_locs = np.where(total_flags[month[0] : month[1]] > 0)[0]

            # good_locs - internal and neighbour flags already applied, so need to add
            #      these back in.
            proportion = float(len(flag_locs)) / float(len(good_locs) + len(flag_locs))

            if proportion > 0.4:

                month_range = np.arange(month[0], month[1], dtype=("int"))
                flags[month_range[good_locs], out_flag_col] = 1

                # NOTE(review): as above, only one argument line of the plotting
                #   call remains here.
                if plots:
                        extra_text="lots flagged",

    # flags is modified in place; nothing is returned
    return  # clean_up
コード例 #23
def spc_diff(sfc, stn, flags, month_ranges, start, end, logfile, plots = False, diagnostics = False, doMonth = False):
    '''
    Pressure difference check, on individual obs.  Remove very silly stnlp

    Observations whose (sfc - stn) difference lies more than MAD_THRESHOLD
    median-absolute-deviations away from the median difference are flagged.

    :param array sfc: SLP
    :param array stn: STNLP
    :param array flags: flags_array
    :param array month_ranges: array of month start and end times
        (recalculated from start and end inside the function)
    :param datetime start: DATASTART
    :param datetime end: DATAEND
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose output
    :param bool doMonth: account for spare month

    :returns: flags - locations where flags have been set
    '''
    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1,12,2)

    # apply flags (and mask incomplete year if appropriate)
    sfc_filtered = utils.apply_filter_flags(sfc, doMonth = doMonth, start = start, end = end)
    stn_filtered = utils.apply_filter_flags(stn, doMonth = doMonth, start = start, end = end)

    # get the differences
    diffs = sfc.data - stn.data
    # NOTE(review): the filtered differences are calculated but the statistics
    #   below use the unfiltered ones - confirm which was intended
    diffs_filtered = sfc_filtered - stn_filtered

    # robust statistics
    median_difference = np.ma.median(diffs)
    mad_difference = utils.mean_absolute_deviation(diffs, median = True)

    # where exceed
    upper_limit = median_difference + MAD_THRESHOLD*mad_difference
    lower_limit = median_difference - MAD_THRESHOLD*mad_difference
    high, = np.ma.where(diffs > upper_limit)
    low, = np.ma.where(diffs < lower_limit)

    # set flags
    if len(high) != 0:
        if diagnostics: print("Number of high differences {}".format(len(high)))
        flags[high] = 1
    if len(low) != 0:
        if diagnostics: print("Number of low differences {}".format(len(low)))
        flags[low] = 1

    if plots:
        import matplotlib.pyplot as plt
        plt.hist(diffs.compressed(), bins = np.arange(np.round(median_difference)-10, np.round(median_difference)+10, 0.1))
        # draw the limits actually used for flagging
        plt.axvline(x = upper_limit, ls = "--", c = "r")
        plt.axvline(x = lower_limit, ls = "--", c = "r")
        plt.xlim([median_difference - 11, median_difference + 11])
        plt.xlabel("Difference (hPa)")

    # How to set the range of allowable values.
    nflags, = np.where(flags != 0)
    utils.print_flagged_obs_number(logfile, "Station Level Pressure", "stnlp", len(nflags), noWrite=diagnostics)

    return flags # spc_diff
コード例 #24
ファイル: frequent_values.py プロジェクト: rjhd2/HadISD_v2
def _fvc_season_slices(year, season):
    '''
    Return the (start, end) index pairs covering one season of one year.

    :param array year: the twelve (start, end) month ranges of a single year
    :param int season: 0 = whole year, 1 = MAM, 2 = JJA, 3 = SON, 4 = JF+D

    :returns: list of (start, end) tuples; season 4 gives two (Jan-Feb, Dec)
    '''
    # (first month, last month) of each contiguous stretch, as indices into year
    month_spans = {0 : [(0, -1)],
                   1 : [(2, 4)],
                   2 : [(5, 7)],
                   3 : [(8, 10)],
                   4 : [(0, 1), (-1, -1)]}

    return [(year[first][0], year[last][-1]) for first, last in month_spans[season]]


def fvc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False):
    '''
    Check for certain values occurring more frequently than would be expected

    A histogram of the whole record (and of each season) is scanned for bins
    that are a non-zero local maximum of their seven-bin neighbourhood and
    hold at least half of the observations in it.  Each year is then examined
    for those bins only, and observations falling in a bin that dominates that
    year as well are flagged in station.qc_flags[:, flag_col[v]].

    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots

    :returns: None - the station object is modified in place
    '''
    MIN_DATA_REQUIRED = 500 # to create histogram for complete record
    MIN_DATA_REQUIRED_YEAR = 100 # to create histogram

    month_ranges = utils.month_starts_in_pairs(start, end)

    month_ranges_years = month_ranges.reshape(-1,12,2)

    for v,variable in enumerate(variable_list):
        st_var = getattr(station, variable)
        reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var)

        for season in range(5): # Year,MAM,JJA,SON,JF+D
            if season == 0:
                # all year
                season_data = np.ma.masked_values(filtered_data.compressed(), st_var.fdi)
                thresholds = [30,20,10]
            else:
                thresholds = [20,15,10]
                season_data = np.ma.array([])
                for y,year in enumerate(month_ranges_years):
                    # churn through months extracting data, accounting for fdi and concatenating together
                    for first, last in _fvc_season_slices(year, season):
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[first:last], st_var.fdi)])

            season_data = season_data.compressed()

            if len(season_data) > MIN_DATA_REQUIRED:

                if 0 < reporting_accuracy <= 0.5: # -1 used as missing value
                    bins, bincenters = utils.create_bins(season_data, 0.5)
                else:
                    bins, bincenters = utils.create_bins(season_data, 1.0)

                hist, binEdges = np.histogram(season_data, bins = bins)
                if plots:
                    plot_hist, bincenters = fvc_plot_setup(season_data, hist, binEdges, st_var.name, title = "%s" % (SEASONS[season]))

                bad_bin = np.zeros(len(hist))

                # scan through bin values and identify bad ones
                for e in range(len(hist)):
                    is_bad = False
                    # don't bother with first three or last three bins
                    # NOTE(review): this test uses e > 3 whereas the per-year
                    #   scan below uses e >= 3 - left as found
                    if e > 3 and e <= (len(hist) - 3):
                        seven_bins = hist[e-3:e+3+1]
                        if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                            # is local maximum and != zero
                            # bad if contains >50% of data and is greater than threshold
                            is_bad = (seven_bins[3]/float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= thresholds[0])

                    if is_bad:
                        bad_bin[e] = 1
                    elif plots:
                        # for plotting remove good bins
                        plot_hist[e]=1e-1

                if plots:
                    plt.step(bincenters, plot_hist, 'r-', where='mid')

                # having identified possible bad bins, check each year in turn
                for y,year in enumerate(month_ranges_years):

                    year_slices = _fvc_season_slices(year, season)

                    year_data = np.ma.concatenate([np.ma.masked_values(st_var.data[first:last], st_var.fdi) for first, last in year_slices])
                    # working copy of this season's flags - written back below
                    year_flags = np.concatenate([station.qc_flags[first:last, flag_col[v]] for first, last in year_slices])

                    if len(year_data.compressed()) > MIN_DATA_REQUIRED_YEAR:

                        hist, binEdges = np.histogram(year_data.compressed(), bins = bins)

                        if plots:
                            # data passed first, as in the whole-record call above
                            plot_hist, bincenters = fvc_plot_setup(year_data.compressed(), hist, binEdges, st_var.name, title = "%s - %s" % (y+start.year, SEASONS[season]))

                        for e in range(len(hist)):
                            flag_bin = False

                            # only look at pre-identified bins, and
                            #   don't bother with first three or last three bins
                            if (bad_bin[e] == 1) and (e >= 3 and e <= (len(hist) - 3)):
                                seven_bins = hist[e-3:e+3+1].astype('float')
                                if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                                    # is local maximum and != zero
                                    # contains >50% or >90% of data and is greater than appropriate threshold
                                    flag_bin = (seven_bins[3]/seven_bins.sum() >= 0.5 and seven_bins[3] >= thresholds[1]) \
                                        or (seven_bins[3]/seven_bins.sum() >= 0.9 and seven_bins[3] >= thresholds[2])

                            if flag_bin:
                                # Flag these data
                                bad_points = np.where((year_data >= binEdges[e]) & (year_data < binEdges[e+1]))
                                year_flags[bad_points] = 1
                            elif plots:
                                # for plotting remove good bins
                                plot_hist[e]=1e-1

                        if diagnostics or plots:
                            nflags = len(np.where(year_flags != 0)[0])
                            print("{} {}".format(y + start.year, nflags))

                        if plots:
                            if nflags > 0:
                                plt.step(bincenters, plot_hist, 'r-', where='mid')

                    # copy flags back, one contiguous stretch at a time
                    offset = 0
                    for first, last in year_slices:
                        length = len(station.qc_flags[first:last, flag_col[v]])
                        station.qc_flags[first:last, flag_col[v]] = year_flags[offset:offset + length]
                        offset += length

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]), noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Frequent Values Check")

    return # fvc
コード例 #25
ファイル: neighbour_checks.py プロジェクト: rjhd2/HadISD_v2
def neighbour_checks(station_info, restart_id = "", end_id = "", distances=np.array([]), angles=np.array([]), second = False, masking = False, doZip=False, plots = False, diagnostics = False):
    Run through neighbour checks on list of stations passed
    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations.

    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncate the array
    neighbour_elevations = np.array(station_info[:,3], dtype=float) 
    neighbour_ids        = np.array(station_info[:,0])
    neighbour_info       = np.array(station_info[:,:])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:,0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:,0] == end_id)
        if endindex != len(station_info) -1:
            station_info = station_info[startindex: endindex+1]
            distances = distances[startindex:endindex+1,:]
            angles = angles[startindex:endindex+1,:]
            station_info = station_info[startindex:]
            distances = distances[startindex:,:]
            angles = angles[startindex:,:]
        station_info = station_info[startindex:]
        distances = distances[startindex:,:]
        angles = angles[startindex:,:]

    # process each neighbour
    for st, stat in enumerate(station_info):       

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS+stat[0]+'.log','a') # append to file if second iteration.
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:

            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics)

            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
                logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics)
            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
                logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # select neighbours
        neighbour_distances  = distances[st,:]
        neighbour_bearings   = angles[st,:]

        # have to add in start index so that can use location in distance file.
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour","Distance","Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n])

            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour","Distance","Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                #      but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second = second, diagnostics = diagnostics, plots = plots)

                    # now read in final set of neighbours and process

                    neigh_flags = np.zeros(len(station.time.data)) # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data)) # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data)) # number of neighbours at each time stamp
                    reporting_accuracies = np.zeros(len(neighbours)) # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)]) # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id = False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance = neighbour_distances[nn_loc], diagnostics = diagnostics, plots = plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh,variable).data)

                        dpd_flags += neigh.qc_flags[:,31]
                    # gone through all neighbours

                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)            
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3),(neigh_flags[some_flags].astype("float")/neigh_count[some_flags] > 2./3.)))

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True)
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1

                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True)
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))

                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots = plots, diagnostics = diagnostics)

                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                        logfile.write("No observations to assess for {}\n".format(variable))

            # variable loop
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
	utils.append_history(station, "Neighbour Outlier Check")

        # clean up months 

        qc_tests.clean_up.clu(station, ["temperatures","dewpoints","windspeeds","winddirs","slp"], [44,45,46,47,48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots = plots)

        if diagnostics or plots: raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            # gzip the raw file

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile)

        # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):       
            if first:
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal.nc")])
                if masking:
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external.nc")])
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask.nc")])

            elif second:
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal2.nc")])
                if masking:
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external2.nc")])
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask2.nc")])

    print "Neighbour Checks completed\n"

    return # neighbour_checks 
コード例 #26
ファイル: spike.py プロジェクト: rjhd2/HadISD_v2
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, second = False):
    # NOTE(review): the description lines below are not wrapped in string
    # quotes in this listing, so they are not a valid docstring as written;
    # presumably the triple-quote delimiters were lost - confirm against the
    # upstream file.
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    :param bool second: run for second time

    print "refactor"
    for v, variable in enumerate(variable_list):

        # QC flag column belonging to this variable
        flags = station.qc_flags[:, flag_col[v]]

        prev_flag_number = 0
        if second:
            # count currently existing flags:
            prev_flag_number = len(flags[flags != 0])
        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)
        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1: reporting_resolution = 1 

        # reshaped so the axes are indexed below as [year, month, start/end]
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1,12,2)
        # first differences of the time axis between consecutive unmasked
        # (filtered) observations, stored at the unmasked positions
        good = np.where(all_filtered.mask == False)
        full_time_diffs = np.ma.zeros(len(all_filtered))
        full_time_diffs.mask = all_filtered.mask
        full_time_diffs[good] = station.time.data[good][1:] - station.time.data[good][:-1]
        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        print "sort the differencing if values were flagged rather than missing"

        # first differences of the filtered values themselves
        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = all_filtered.mask
        full_filtered_diffs[good] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1]
        # test all values
        # same differencing, but on the unfiltered data (mask of st_var.data)
        good_to_uncompress = np.where(st_var.data.mask == False)
        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = st_var.data.mask
        full_value_diffs[good_to_uncompress] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        time_diffs = full_time_diffs.compressed()
        filtered_diffs = full_filtered_diffs.compressed()
        flags = flags[good_to_uncompress]

        # critical_values[delta-1, month]: threshold per time difference and calendar month
        critical_values = np.zeros([9,12])
        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape)
        for month in range(12):
            for year in range(month_ranges.shape[0]):
                # NOTE(review): as listed, all four assignments below run only
                # when year == 0, and the two np.ma.concatenate lines re-append
                # the slice that was just assigned; presumably the concatenate
                # lines were the `else:` branch for later years - confirm
                # against the upstream file.
                if year == 0:
                    this_month_time_diff = full_time_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]]
                    this_month_filtered_diff = full_filtered_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]]
                    this_month_time_diff = np.ma.concatenate([this_month_time_diff, full_time_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]]])
                    this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff, full_filtered_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]]])

                month_locs[month_ranges[year,month,0]:month_ranges[year,month,1]] = month
            for delta in range(1,9):
                locs = np.ma.where(this_month_time_diff == delta)
                # only derive a threshold when at least 100 differences are available
                if len(locs[0]) >= 100:
                    iqr = utils.IQR(this_month_filtered_diff[locs])

                    # NOTE(review): as listed, the `6. * iqr` assignment sits
                    # inside the `elif iqr == 0` branch and overwrites the mdi
                    # assignment directly above it; presumably an `else:` line
                    # is missing - confirm against the upstream file.
                    if iqr == 0. and delta == 1:
                        critical_values[delta-1,month] = 6.
                    elif iqr == 0: 
                        critical_values[delta-1,month] = st_var.mdi
                        critical_values[delta-1,month] = 6. * iqr      

                    # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD

                    # NOTE(review): as listed, the last line of this block
                    # resets title, line_label and xlabel to empty strings even
                    # when plots is set; presumably it was the `else:` branch -
                    # confirm against the upstream file.
                    if plots:
                        import calendar
                        title = "{}, {}-hr differences".format(calendar.month_name[month+1], delta)                  
                        line_label = st_var.name
                        xlabel = "First Difference Magnitudes"
                        title, line_label, xlabel = "","",""

                    threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin = 0, binwidth = 0.5, plots = plots, diagnostics = diagnostics, title = title, line_label = line_label, xlabel = xlabel, old_threshold = critical_values[delta-1,month])

                    # keep whichever of the two thresholds is smaller
                    if threshold < critical_values[delta-1,month]: critical_values[delta-1,month] = threshold

                    if plots or diagnostics:

                        print critical_values[delta-1,month] , iqr, 6 * iqr

        month_locs = month_locs[good_to_uncompress]
        if diagnostics:
            print critical_values[0,:]
        # not less than 5x reporting accuracy
        # (entries equal to st_var.mdi are left untouched)
        good_critical_values = np.where(critical_values != st_var.mdi)
        low_critical_values = np.where(critical_values[good_critical_values] <= 5.*reporting_resolution)
        temporary = critical_values[good_critical_values]
        temporary[low_critical_values] = 5.*reporting_resolution
        critical_values[good_critical_values] = temporary
        if diagnostics:
            print critical_values[0,:], 5.*reporting_resolution

        # check hourly against 2 hourly, if <2/3 the increase to avoid crazy rejection rate
        for month in range(12):
            if critical_values[0,month] != st_var.mdi and critical_values[1,month] != st_var.mdi:
                if critical_values[0,month]/critical_values[1,month] <= 0.66:
                    critical_values[0,month] = 0.66 * critical_values[1,month]
        if diagnostics:
            print critical_values[0,:]

        # get time differences for unfiltered data

        full_time_diffs = np.ma.zeros(len(st_var.data))
        full_time_diffs.mask = st_var.data.mask
        full_time_diffs[good_to_uncompress] = station.time.data[good_to_uncompress][1:] - station.time.data[good_to_uncompress][:-1]
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds 
        # spikes at the beginning or ends of sections
        # NOTE: at t == 0, time_diffs[t - 1] is time_diffs[-1], i.e. the last element
        for t in np.arange(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3):
                # 10 days before but short gap thereafter
                # median of (up to) the next 10 unmasked values is compared with this value
                next_values = st_var.data[good_to_uncompress[0][t + 1:]] 
                good, = np.where(next_values.mask == False)
                next_median = np.ma.median(next_values[good[:10]])
                next_diff = np.abs(value_diffs[t]) # out of spike
                median_diff = np.abs(next_median - st_var.data[good_to_uncompress[0][t]]) # are the remaining onees
                if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi):
                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and\
                        (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]) :
                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t+1], start, variable, plots = plots)
            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240):
                # 10 days after but short gap before
                # median of (up to) the previous 10 unmasked values is compared with this value
                prev_values = st_var.data[good_to_uncompress[0][:t - 1]]
                good, = np.where(prev_values.mask == False)
                prev_median = np.ma.median(prev_values[good[-10:]])
                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[0][t]])
                if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi):
                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and\
                        (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]) :
                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t+1], start, variable, plots = plots)
        ''' this isn't the nicest way, but a direct copy from IDL
            masked arrays might help remove some of the lines

            Also, this is relatively slow'''
        # main spike search: for every difference t, try spike lengths of 1, 2 and 3 observations
        for t in np.arange(len(time_diffs)):
            for spk_len in [1,2,3]:
                # only test where the indices used below (t - spk_len, t + spk_len) stay in range
                if t >= spk_len and t < len(time_diffs) - spk_len:
                    # check if time differences are appropriate, for multi-point spikes
                    if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and\
                    (np.abs(time_diffs[t]) <= spk_len * 3) and\
                    (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and\
                    (time_diffs[t + 1] - 1 < spk_len * 3) and \
                    ((spk_len == 1) or \
                    ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or \
                    ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))):
                        # check if differences are valid                        
                        if (value_diffs[t - spk_len] != st_var.mdi) and \
                        (value_diffs[t - spk_len] != st_var.fdi) and \
                        (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi):
                            # if exceed critical values
                            if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]):

                                # are signs of two differences different
                                if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])):
                                    # are within spike differences small
                                    if (spk_len == 1) or\
                                    ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.)) or \
                                    ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.) and\
                                      (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] -1, month_locs[t]] / 2.)):
                                        # check if following value is valid
                                        if (value_diffs[t] != st_var.mdi) and (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and\
                                            (value_diffs[t] != st_var.fdi):
                                            # and if at least critical value                                            
                                            if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]):
                                                # test if surrounding differences below 1/2 critical value
                                                if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.): 
                                                    if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.): 
                                                        # set the flags
                                                        flags[ t - spk_len + 1 : t +1] = 1   

                                                        if plots or diagnostics:
                                                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t-spk_len+1], good_to_uncompress[0][t+1], start, variable, plots = plots)

        # copy the compressed flags back to the unmasked positions of the full QC array
        station.qc_flags[good_to_uncompress, flag_col[v]] = flags
        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # NOTE(review): as listed, both reporting calls run when plots or
        # diagnostics is set and neither runs otherwise; presumably the second
        # call was the `else:` branch - confirm against the upstream file.
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number, noWrite = True) # additional flags
            utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number) # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1
        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        # do_interactive is hard-coded to False, so the plotting block below never runs
        do_interactive = False
        if plots and do_interactive == True:
            import matplotlib.pyplot as plt
            plot_times = utils.times_hours_to_datetime(station.time.data, start)
            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            flg = np.where(flags[:, flag_col[v]] == 1)
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)
    station = utils.append_history(station, "Spike Check")  

    return # sc
コード例 #27
ファイル: climatological.py プロジェクト: wk1984/HadISD_v2
def coc(station,
    # NOTE(review): this listing looks truncated - the signature stops after
    # `station,` and several calls further down are left with unclosed
    # parentheses, so continuation lines (and possibly `else:` lines) appear
    # to be missing. The comments added here describe only what is visible;
    # confirm everything against the upstream climatological.py.

    # one pass per variable; each variable has its own column in station.qc_flags
    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)

        # is this needed 13th Nov 2014 RJHD
        #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # reshaped so the axes are indexed below as [year, month, start/end]
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        for month in range(12):

            hourly_climatologies = np.zeros(24)

            # append all e.g. Januaries together

            this_month, year_ids, dummy = utils.concatenate_months(
                month_ranges[:, month, :], st_var.data, hours=True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(
                month_ranges[:, month, :], all_filtered, hours=True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs.
            this_month = np.ma.array(this_month)
            this_month = this_month.reshape(-1, 24)

            this_month_filtered = np.ma.array(this_month_filtered)
            this_month_filtered = this_month_filtered.reshape(-1, 24)

            # get hourly climatology for each month
            for hour in range(24):

                this_hour = this_month[:, hour]

                # need to have data if this is going to work!
                if len(this_hour.compressed()) > 0:

                    # winsorize & climatologies - done to match IDL
                    # NOTE(review): the two winsorize calls below are cut off
                    # in this listing; presumably the second pair of
                    # statements was the non-IDL `else:` branch - confirm.
                    if idl:
                        this_hour = utils.winsorize(np.append(
                            this_hour.compressed(), -999999),
                        hourly_climatologies[hour] = np.ma.sum(this_hour) / (
                            len(this_hour) - 1)

                        this_hour = utils.winsorize(this_hour.compressed(),
                        hourly_climatologies[hour] = np.ma.mean(this_hour)

            if len(this_month.compressed()) > 0:
                # can get stations with few obs in a particular variable.

                # anomalise each hour over month appropriately

                anomalies = this_month - np.tile(hourly_climatologies,
                                                 (this_month.shape[0], 1))
                anomalies_filtered = this_month_filtered - np.tile(
                    hourly_climatologies, (this_month_filtered.shape[0], 1))

                # NOTE(review): as listed, iqr is overwritten with st_var.mdi
                # straight after being computed; presumably that assignment
                # was the `else:` branch for fewer than 10 anomalies - confirm.
                if len(anomalies.compressed()) >= 10:
                    iqr = utils.IQR(anomalies.compressed().reshape(
                        -1)) / 2.  # to match IDL
                    if iqr < 1.5: iqr = 1.5
                    iqr = st_var.mdi

                normed_anomalies = anomalies / iqr
                normed_anomalies_filtered = anomalies_filtered / iqr

                # get average anomaly for year
                year_ids = np.array(year_ids)
                monthly_vqvs = np.ma.zeros(month_ranges.shape[0])
                monthly_vqvs.mask = [
                    False for x in range(month_ranges.shape[0])
                for year in range(month_ranges.shape[0]):
                    year_locs = np.where(year_ids == year)
                    this_year = normed_anomalies_filtered[year_locs, :]

                    if len(this_year.compressed()) > 0:
                        # need to have data for this to work!
                        if idl:
                            monthly_vqvs[year] = utils.idl_median(
                            monthly_vqvs[year] = np.ma.median(this_year)
                        monthly_vqvs.mask[year] = True

                # low pass filter
                normed_anomalies = coc_low_pass_filter(normed_anomalies,
                                                       year_ids, monthly_vqvs,

                # copy from distributional_gap.py - refactor!
                # get the threshold value
                bins, bincenters = utils.create_bins(normed_anomalies, 1.)

                hist, binEdges = np.histogram(normed_anomalies, bins=bins)

                gaussian = utils.fit_gaussian(bincenters,
                minimum_threshold = round(
                    1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

                if diagnostics:
                    print iqr, minimum_threshold, 1. + utils.invert_gaussian(
                        FREQUENCY_THRESHOLD, gaussian)
                    print gaussian
                    print hist

                if plots:

                # number of normalised anomalies beyond +/- minimum_threshold
                uppercount = len(
                    np.where(normed_anomalies > minimum_threshold)[0])
                lowercount = len(
                    np.where(normed_anomalies < -minimum_threshold)[0])

                these_flags = station.qc_flags[:, flag_col[v]]
                gap_plot_values, tentative_plot_values = [], []

                # find the gaps and apply the flags

                # first call handles the upper tail (upper = True)
                gap_start = dgc.dgc_find_gap(hist,
                                             gap_size=1)  # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                # second call handles the lower tail (upper = False)
                gap_start = dgc.dgc_find_gap(hist,
                                             gap_size=1)  # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                station.qc_flags[:, flag_col[v]] = these_flags

                if uppercount + lowercount > 1000:
                    #print "not sorted spurious stations yet"
                if plots:
                    import matplotlib.pyplot as plt
                    hist, binEdges = np.histogram(tentative_plot_values,
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])

                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    import calendar
                             calendar.month_name[month + 1],
                    leg = plt.legend(loc='lower center',
                                     bbox_to_anchor=(0.5, -0.2),
                                     prop={'size': 13},
                    plt.setp(leg.get_title(), fontsize=14)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # flag value 1 is reported as "Firm Clim", flag value 2 as "Tentative Clim"
        if plots or diagnostics:
            print "where\n"
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
                                           "  Firm Clim",
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
                                           "  Tentative Clim",
            utils.print_flagged_obs_number(logfile, "Climatological", variable,
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable,
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim",
                                           variable, nflags)

        # firm flags match 030220
    station = utils.append_history(station, "Climatological Check")

コード例 #28
def rsc_straight_strings(st_var, times, n_obs, n_days, start, end, wind = False, reporting = 0., diagnostics = False, plots = False, dynamic = True, doMonth = False):
    '''
    Check for strings/streaks of repeating values

    A streak of identical consecutive (unmasked) values is flagged when it
    contains at least n_obs observations, or when the time between its first
    and last observation is at least n_days (times are compared in hours).
    Every streak of ten or more observations is also recorded and handed to
    rsc_annual_string_expectance, whose result is returned.

    A streak is only evaluated when a different value follows it, so a streak
    that is still open at the end of the record is not assessed here.

    :param object st_var: station variable object
    :param array times: timestamps (hours), indexed like the filtered data
    :param int n_obs: number of observations to exceed
    :param int n_days: number of days to exceed
    :param datetime start: start of data
    :param datetime end: end of data
    :param bool wind: whether there is wind data to account for - extra minimum value
    :param float reporting: reporting accuracy
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param bool dynamic: calculate threshold of number of observations dynamically rather than using n_obs
    :param bool doMonth: passed on to the dynamic threshold calculation

    :returns: flag array as returned by rsc_annual_string_expectance
    '''

    # January 2015 - changed to dynamically calculating the thresholds, but only use if less than current ^RJHD

    is_winddirs = (st_var.name == "winddirs")

    if is_winddirs:
        # remove calm periods for this check - work on a copy so the
        # caller's variable is left untouched.
        check_var = copy.deepcopy(st_var)
        calms, = np.ma.where(st_var.data == 0) # True calms have direction set to 0, northerlies to 360
        check_var.data[calms] = check_var.mdi
    else:
        check_var = st_var

    if dynamic:
        threshold = rsc_get_straight_string_threshold(check_var, start, end, reporting = reporting, diagnostics = diagnostics, plots = plots, doMonth = doMonth, old_threshold = n_obs)

        # the dynamic value may only tighten the supplied threshold
        if threshold < n_obs:
            n_obs = threshold

    # threshold has been set applying "month" where appropriate.
    # For the detection, want flagged data removed, but final incomplete year included.
    all_filtered = utils.apply_filter_flags(check_var)

    # getmaskarray always yields a full boolean array, even when the input
    # carries no mask at all, so per-element indexing below is safe.
    is_masked = np.ma.getmaskarray(all_filtered)

    flags = np.zeros(len(all_filtered))

    # Look for continuous straight strings
    prev_value = st_var.mdi
    string_points = []

    # storage for excess over years
    value_starts = []
    value_lengths = []

    for o, obs in enumerate(all_filtered):

        if is_masked[o]:
            continue

        if obs == prev_value:
            # if same value as before, note and continue
            string_points.append(o)

        else:
            if is_winddirs and (prev_value == 0):
                # this was a calm as a string of zeros.
                # shouldn't be necessary - but just in case!
                pass

            # if different value to before, which is long enough (and large enough for Wind)
            elif len(string_points) >= 10 and ((not wind) or prev_value > WIND_MIN_VALUE[reporting]):

                # note start and length for the annual excess test
                value_starts.append(string_points[0])
                value_lengths.append(len(string_points))

                time_diff = times[string_points[-1]] - times[string_points[0]]

                # if length above threshold and spread over sufficient time frame, flag
                if (len(string_points) >= n_obs) or (time_diff >= (n_days * 24)): # measuring time in hours
                    flags[string_points] = 1
                    if plots or diagnostics:
                        rsc_diagnostics_and_plot(times, all_filtered, string_points, st_var.name, start, plots = plots)

            # the current observation starts a new candidate streak
            string_points = [o]

        prev_value = obs

    # matches value_lengths 030660-99999, 1/7/2014 - seems to flag more though - compressed vs full time?

    flags = rsc_annual_string_expectance(all_filtered, np.array(value_starts), np.array(value_lengths), flags, start, end, st_var, times, diagnostics = diagnostics, plots = plots)

    return flags # rsc_straight_strings
コード例 #29
def sc(station,
    # NOTE(review): the rest of the signature and the docstring delimiters are
    # absent from this copy.  The body reads variable_list, flag_col, start,
    # end, logfile, diagnostics, plots and doMonth - presumably the remaining
    # parameters; confirm against the original file.
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    :param bool doMonth: account for incomplete months

    print "refactor"

    # Each variable is processed independently, using its own QC flag column.
    for v, variable in enumerate(variable_list):

        flags = station.qc_flags[:, flag_col[v]]

        st_var = getattr(station, variable)

        # if incomplete year, mask all obs for the incomplete bit
        # NOTE(review): the arguments of the next two calls are cut off here.
        all_filtered = utils.apply_filter_flags(st_var,

        reporting_resolution = utils.reporting_accuracy(
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1:
            reporting_resolution = 1

        # Month boundaries are reshaped to [year, calendar month, (start, end)];
        # the pairs are used below as slice bounds into the per-observation arrays.
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        # Indices of the observations that survive the filtering.
        good, = np.where(all_filtered.mask == False)

        # Time gap from each unmasked observation to the next unmasked one,
        # stored at the index of the earlier observation.
        full_time_diffs = np.ma.zeros(len(all_filtered), dtype=int)
        full_time_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_time_diffs[good[:-1]] = station.time.data[
            good[1:]] - station.time.data[good[:-1]]

        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        print "sort the differencing if values were flagged rather than missing"

        # First differences of the filtered values, laid out like the time gaps.
        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_filtered_diffs[good[:-1]] = all_filtered.compressed(
        )[1:] - all_filtered.compressed()[:-1]

        # test all values
        good_to_uncompress, = np.where(st_var.data.mask == False)
        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_value_diffs[good_to_uncompress[:-1]] = st_var.data.compressed(
        )[1:] - st_var.data.compressed()[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        time_diffs = full_time_diffs.compressed()
        filtered_diffs = full_filtered_diffs.compressed()
        flags = flags[good_to_uncompress]

        # Critical values are indexed [time gap - 1, calendar month]; the loop
        # below fills the rows for gaps 1 to 8.
        critical_values = np.zeros([9, 12])

        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape, dtype=int)

        for month in range(12):
            for year in range(month_ranges.shape[0]):

                # NOTE(review): the closing brackets of the year-0 branch and
                # the `else:` introducing the concatenation for later years
                # appear to be missing from this copy.
                if year == 0:
                    this_month_time_diff = full_time_diffs[month_ranges[
                        year, month, 0]:month_ranges[year, month, 1]]
                    this_month_filtered_diff = full_filtered_diffs[
                        month_ranges[year, month, 0]:month_ranges[year, month,
                    this_month_time_diff = np.ma.concatenate([
                        full_time_diffs[month_ranges[year, month,
                                                                     month, 1]]
                    this_month_filtered_diff = np.ma.concatenate([
                        full_filtered_diffs[month_ranges[year, month,

                # Record which calendar month every observation in this span belongs to.
                month_locs[month_ranges[year, month,
                                        0]:month_ranges[year, month,
                                                        1]] = month

            # One critical value per time gap (the plot title labels the gap in hours).
            for delta in range(1, 9):

                locs = np.ma.where(this_month_time_diff == delta)

                # Only derive a threshold when at least 100 differences exist at this gap.
                if len(locs[0]) >= 100:

                    iqr = utils.IQR(this_month_filtered_diff[locs])

                    # NOTE(review): as written the final assignment always
                    # overrides the two branches above it - presumably an
                    # `else:` line was lost before it.
                    if iqr == 0. and delta == 1:
                        critical_values[delta - 1, month] = 6.
                    elif iqr == 0:
                        critical_values[delta - 1, month] = st_var.mdi
                        critical_values[delta - 1, month] = 6. * iqr

                    # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD

                    # NOTE(review): the blank-label assignment below overrides
                    # the labels set just above it - presumably the `else:`
                    # for the no-plot case is missing.
                    if plots:
                        import calendar
                        title = "{}, {}-hr differences".format(
                            calendar.month_name[month + 1], delta)
                        line_label = st_var.name
                        xlabel = "First Difference Magnitudes"
                        title, line_label, xlabel = "", "", ""

                    # NOTE(review): the leading arguments of this call are cut off.
                    threshold = utils.get_critical_values(
                        old_threshold=critical_values[delta - 1, month])

                    # Keep whichever of the two thresholds is smaller.
                    if threshold < critical_values[delta - 1, month]:
                        critical_values[delta - 1, month] = threshold

                    if plots or diagnostics:

                        print critical_values[delta - 1, month], iqr, 6 * iqr

        # Restrict the month lookup to observations unmasked in the raw data,
        # so it lines up with the compressed difference arrays.
        month_locs = month_locs[good_to_uncompress]
        if diagnostics:
            print critical_values[0, :]

        # not less than 5x reporting accuracy
        good_critical_values = np.where(critical_values != st_var.mdi)
        low_critical_values = np.where(
            critical_values[good_critical_values] <= 5. * reporting_resolution)
        temporary = critical_values[good_critical_values]
        temporary[low_critical_values] = 5. * reporting_resolution
        critical_values[good_critical_values] = temporary

        if diagnostics:
            print critical_values[0, :], 5. * reporting_resolution

        # check hourly against 2 hourly, if <2/3 the increase to avoid crazy rejection rate
        # NOTE(review): the start of the assignment target on the last line of
        # this loop is missing from this copy.
        for month in range(12):
            if critical_values[0, month] != st_var.mdi and critical_values[
                    1, month] != st_var.mdi:
                if critical_values[0, month] / critical_values[1,
                                                               month] <= 0.66:
                                    month] = 0.66 * critical_values[1, month]

        if diagnostics:
            print "critical values"
            print critical_values[0, :]

        # get time differences for unfiltered data

        full_time_diffs = np.ma.zeros(len(st_var.data), dtype=int)
        full_time_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_time_diffs[good_to_uncompress[:-1]] = station.time.data[
            good_to_uncompress[1:]] - station.time.data[
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds

        # spikes at the beginning or ends of sections
        # NOTE(review): for t == 0 the index t - 1 is -1, which reads the last
        # element of time_diffs rather than a preceding gap - confirm intended.
        # NOTE(review): several conditions and call lines in this loop are
        # truncated in this copy.
        for t in np.arange(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) <
                # 10 days before but short gap thereafter

                next_values = st_var.data[good_to_uncompress[t + 1:]]
                good, = np.where(next_values.mask == False)

                # Median of up to ten following unmasked values.
                next_median = np.ma.median(next_values[good[:10]])

                next_diff = np.abs(value_diffs[t])  # out of spike
                median_diff = np.abs(next_median -
                                     )  # are the remaining onees

                if (critical_values[time_diffs[t] - 1, month_locs[t]] !=

                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and\
                        (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]) :

                        flags[t] = 1
                        if plots or diagnostics:
                                                     good_to_uncompress[t + 1],

            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) >
                # 10 days after but short gap before

                prev_values = st_var.data[good_to_uncompress[:t - 1]]
                good, = np.where(prev_values.mask == False)

                # Median of up to ten preceding unmasked values.
                prev_median = np.ma.median(prev_values[good[-10:]])

                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(prev_median -

                if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] !=

                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and\
                        (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]) :

                        flags[t] = 1
                        if plots or diagnostics:
                                                     good_to_uncompress[t + 1],
        ''' this isn't the nicest way, but a direct copy from IDL
            masked arrays might help remove some of the lines

            Also, this is relatively slow'''

        # Main scan: at every position t, test for a spike of 1, 2 or 3
        # observations ending at t.  When all nested conditions hold the
        # observations t - spk_len + 1 .. t are flagged.
        for t in np.arange(len(time_diffs)):
            for spk_len in [1, 2, 3]:
                # Stay far enough from both ends for the differences at
                # t - spk_len and t + 1 to exist.
                if t >= spk_len and t < len(time_diffs) - spk_len:

                    # check if time differences are appropriate, for multi-point spikes
                    if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and\
                    (np.abs(time_diffs[t]) <= spk_len * 3) and\
                    (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and\
                    (time_diffs[t + 1] - 1 < spk_len * 3) and \
                    ((spk_len == 1) or \
                    ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or \
                    ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))):

                        # check if differences are valid
                        if (value_diffs[t - spk_len] != st_var.mdi) and \
                        (value_diffs[t - spk_len] != st_var.fdi) and \
                        (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi):

                            # if exceed critical values
                            if (np.abs(value_diffs[t - spk_len]) >=
                                    critical_values[time_diffs[t - spk_len] -
                                                    1, month_locs[t]]):

                                # are signs of two differences different
                                if (math.copysign(1, value_diffs[t])
                                        != math.copysign(
                                            1, value_diffs[t - spk_len])):

                                    # are within spike differences small
                                    if (spk_len == 1) or\
                                    ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.)) or \
                                    ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.) and\
                                      (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] -1, month_locs[t]] / 2.)):

                                        # check if following value is valid
                                        if (value_diffs[t] != st_var.mdi) and (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and\
                                            (value_diffs[t] != st_var.fdi):

                                            # and if at least critical value
                                            # NOTE(review): parts of this comparison and of the
                                            # plotting call further down are truncated in this copy.
                                            if (np.abs(value_diffs[t]) >=
                                                        time_diffs[t] - 1,

                                                # test if surrounding differences below 1/2 critical value
                                                if (np.abs(
                                                        value_diffs[t - spk_len
                                                                    - 1]
                                                ) <= critical_values[
                                                        time_diffs[t -
                                                                   spk_len -
                                                                   1] - 1,
                                                        month_locs[t]] / 2.):
                                                    if (np.abs(
                                                            value_diffs[t + 1]
                                                    ) <= critical_values[
                                                            time_diffs[t + 1] -
                                                            1, month_locs[t]] /

                                                        # set the flags
                                                        flags[t - spk_len +
                                                              1:t + 1] = 1

                                                        if plots or diagnostics:

                                                                    t -
                                                                    spk_len +
                                                                    t + 1],

        # Scatter the compressed flags back to their positions in the full QC array.
        station.qc_flags[good_to_uncompress, flag_col[v]] = flags

        flag_locs, = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # NOTE(review): the start of the logging call that owned the next line is missing.
                                       noWrite=diagnostics)  # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        # NOTE(review): do_interactive is hard-coded to False, so this block
        # never runs.  At this point `flags` is the 1-D compressed array, so
        # the 2-D indexing `flags[:, flag_col[v]]` looks like it would fail if
        # the block were enabled - confirm before switching it on.
        do_interactive = False
        if plots and do_interactive == True:
            import matplotlib.pyplot as plt

            plot_times = utils.times_hours_to_datetime(station.time.data,

            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            flg = np.where(flags[:, flag_col[v]] == 1)
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)

    station = utils.append_history(station, "Spike Check")

    # Nothing is returned: results are written to station.qc_flags and to each
    # variable's flags attribute above.
    return  # sc
コード例 #30
def dgc_all_obs(station,
    '''RJHD addition working on all observations'''
    # NOTE(review): the rest of the signature is absent from this copy.  The
    # body reads variable, flags, start, end, logfile, plots, diagnostics,
    # idl, windspeeds, GH and doMonth - presumably the remaining parameters;
    # confirm against the original file.  Many multi-line calls and `else:`
    # lines below are likewise truncated or missing.

    if plots:
        import matplotlib.pyplot as plt

    # Month boundaries reshaped to [year, calendar month, (start, end)]; the
    # pairs are used below as slice bounds.
    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    # extract variable
    st_var = getattr(station, variable)
    # apply flags (and mask incomplete year if appropriate)
    all_filtered = utils.apply_filter_flags(st_var,

    # Work on a deep copy so that masking the incomplete year does not touch st_var.
    st_var_complete_year = copy.deepcopy(st_var)
    if doMonth:
        # restrict the incomplete year if appropriate - keep other flagged obs.
        full_year_end = utils.get_first_hour_this_year(start, end)
        st_var_complete_year.data.mask[full_year_end:] = True

    # Each calendar month is treated separately, pooling that month over all years.
    for month in range(12):

        # if requiring wind data, extract data and find monthly averages
        if windspeeds == True:
            # NOTE(review): unlike st_var above, the wind variable is not
            # copied, so the mask assignment below appears to alter the
            # station's own windspeed data - confirm this is intended.
            st_var_wind = getattr(station, "windspeeds")

            if doMonth:
                # restrict the incomplete year if appropriate
                st_var_wind.data.mask[full_year_end:] = True

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):

                # NOTE(review): the year-0 assignment is truncated and the
                # `else:` before the concatenation appears to be missing.
                if y == 0:
                    windspeeds_month = np.ma.array(
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        # pull data from each calendar month together
        # Three views of the same month: raw data, filtered data, and data
        # with the incomplete year masked.
        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)
        this_month_complete, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var_complete_year.data, hours=False)

        # if enough clean and complete data for this calendar month find the median and IQR
        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            # NOTE(review): the idl_median call is truncated and the `else:`
            # before the numpy median appears to be missing, so as written the
            # numpy median always wins.
            if idl:
                monthly_median = utils.idl_median(
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed(),)

            if iqr == 0.0:
                # to get some spread if IQR too small
                # NOTE(review): the extra argument(s) of this second IQR call are cut off.
                iqr = utils.IQR(this_month_filtered.compressed(),
                print "Spurious_stations file not yet sorted"

            # if have an IQR, anomalise using median and standardise using IQR
            if iqr != 0.0:

                # monthly_values uses all unmasked raw data; complete_values
                # excludes the incomplete year when doMonth is set.
                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)
                complete_values = np.ma.array(
                    (this_month_complete.compressed() - monthly_median) / iqr)

                # use complete years only for the histogram - aiming to find outliers.
                bins, bincenters = utils.create_bins(complete_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(
                    complete_values, BIN_SIZE / 10.)
                hist, binEdges = np.histogram(complete_values, bins=bins)
                # NOTE(review): the four lines of prose below presumably sat
                # inside a triple-quoted string whose delimiters are missing.
                Change to monthly updates Oct 2017
                Thought about changing distribution to use filtered values
                But this changes the test beyond just dealing with additional months
                Commented out lines below would be alternative.
                # bins, bincenters = utils.create_bins(filtered_values, BIN_SIZE)
                # dummy, plot_bincenters = utils.create_bins(filtered_values, BIN_SIZE/10.)
                # hist, binEdges = np.histogram(filtered_values, bins = bins)

                # used filtered (incl. incomplete year mask) to determine the distribution.
                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    # Feb 2019 - if large amounts off centre, can affect initial values
                    # switched to median and MAD
                    # NOTE(review): the contents of initial_values and the tail
                    # of the leastsq call are missing from this copy.
                    initial_values = [
                    ]  # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist,
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    # Evaluate the fitted curve on the finer plotting grid.
                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    # Beyond the first point on each side of the peak where the
                    # curve drops below FREQUENCY_THRESHOLD / 10 the curve is
                    # pinned to that value (assignment targets are truncated).
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    good_values = np.argwhere(
                        plot_gaussian > FREQUENCY_THRESHOLD)

                    # Thresholds sit one unit outside the rounded ends of the
                    # region where the fit exceeds FREQUENCY_THRESHOLD.
                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(

                    if diagnostics:
                        print hist
                        print res
                        print iqr, l_minimum_threshold, u_minimum_threshold

                # or just a standard Gaussian
                # NOTE(review): the `else:` for this branch and the arguments
                # of fit_gaussian are missing from this copy.
                    gaussian = utils.fit_gaussian(

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                    if diagnostics:
                        print hist
                        print gaussian
                        print iqr, u_minimum_threshold, 1. + utils.invert_gaussian(
                            FREQUENCY_THRESHOLD, gaussian)

                if plots:

                    # NOTE(review): the plotting calls that used these lines are truncated.
                    if GH:
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],

                # now trying to find gaps in the distribution
                # Count standardised values lying beyond each threshold.
                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics: gap_plot_values = np.array([])

                # do one side of distribution and then other
                if uppercount > 0:
                    # NOTE(review): remaining arguments of dgc_find_gap are cut off.
                    gap_start = dgc_find_gap(hist, binEdges,

                    # A gap_start of 0 is treated as "no gap found".
                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            # not using filtered - checking all available data
                            this_year_data = np.ma.array(
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            # Observations whose standardised value lies above the gap.
                            gap_cleaned_locations = np.ma.where(
                                ((this_year_data - monthly_median) /
                                 iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Upper {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                    print gap_cleaned_locations, this_year_data[

                if lowercount > 0:
                    # NOTE(review): remaining arguments of dgc_find_gap are cut off.
                    gap_start = dgc_find_gap(hist, binEdges,

                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            # Unmasked observations whose standardised value lies below the gap.
                            gap_cleaned_locations = np.ma.where(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start, this_year_data.mask != True))
                            # add flag requirement for low pressure bit if appropriate

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Lower {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                    print gap_cleaned_locations, this_year_data[

                            # if doing SLP then do extra checks for storms
                            if windspeeds:
                                windspeeds_year = np.ma.array(

                                # NOTE(review): the array being assigned to on
                                # the next line is truncated; the value 2 marks
                                # these observations as tentative.
                                    gap_cleaned_locations] = 2  # tentative flags

                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)

                                # need to ensure that this_year_data is less than slp_average, hence order of test
                                # A storm needs both unusually high wind speed and
                                # unusually low pressure, each measured in MADs.
                                storms, = np.ma.where((((windspeeds_year - windspeeds_month_average) / windspeeds_month_mad) > MAD_THRESHOLD) &\
                                                   (((slp_average - this_year_data) / slp_mad) > MAD_THRESHOLD))

                                # using IDL terminology
                                # NOTE(review): several `else:` lines appear to be
                                # missing in this section; as written the final
                                # `final_storms = storms` overrides the expansions.
                                if len(storms) >= 2:
                                    # use the first difference series to find when there are gaps in
                                    # contiguous sequences of storm observations - want to split up into
                                    # separate storm events
                                    storm_1diffs = np.diff(storms)
                                    separations, = np.where(storm_1diffs != 1)

                                    # expand around storm signal so that all low SLP values covered, and unflagged
                                    if len(separations) >= 1:
                                        print "  multiple storms in {} {}".format(
                                            y + start.year, month)

                                        # if more than one storm signal that month, then use intervals
                                        #    in the first difference series to expand around the first interval alone
                                        storm_start = 0
                                        storm_finish = separations[0] + 1
                                        first_storm = dgc_expand_storms(
                                        final_storms = copy.deepcopy(

                                        for j in range(len(separations)):
                                            # then do the rest in a loop

                                            if j + 1 == len(separations):
                                                # final one
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                           1:separations[j +
                                                                         1] +

                                            final_storms = np.append(
                                                final_storms, this_storm)

                                        # else just expand around the signal by 6 hours either way
                                        final_storms = dgc_expand_storms(
                                            storms, len(this_year_data))

                                    final_storms = storms

                                if len(storms) >= 1:
                                    print "Tropical Storm signal in {} {}".format(
                                        y + start.year, month)
                                    # Clear the flags on observations attributed to a storm.
                                    this_year_flags[final_storms] = 0

                            # and write flags back into array
                            flags[year[0]:year[1]] = this_year_flags

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    import calendar
                             calendar.month_name[month + 1],
                    plt.legend(loc='lower center',
                               bbox_to_anchor=(0.5, -0.2),
                               prop={'size': 13})

    # nflags holds the indices of all non-zero flags (not a count).
    # NOTE(review): the logging call that used it is truncated below.
    nflags, = np.where(flags != 0)
                                   "Distributional Gap All",

    return flags  # dgc_all_obs