def coc(station, variable_list, flag_col, start, end, logfile,
        diagnostics=False, plots=False, idl=False):
    '''
    Climatological Outlier Check.

    For every variable and calendar month: build a winsorised climatology
    for each hour of the day, form anomalies normalised by half their IQR
    (floored at 1.5), low-pass filter them, fit a Gaussian to their histogram
    to obtain a threshold, then look for a gap in each tail of the histogram
    and let coc_find_and_apply_flags() set the flags.

    :param obj station: station object; ``qc_flags`` and the flags of each variable are updated in place
    :param list variable_list: names of the station attributes to process
    :param list flag_col: column of ``station.qc_flags`` used for each entry of variable_list
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: print diagnostics; the summary is printed rather than written to logfile
    :param bool plots: do plots; the summary is printed rather than written to logfile
    :param bool idl: use the IDL-matching winsorize/median calculations

    :returns: None
    '''
    if plots:
        # only needed for plotting, so imported once rather than once per month
        import calendar
        import matplotlib.pyplot as plt

    # with plots/diagnostics the summary goes to screen instead of the logfile
    to_screen = plots or diagnostics
    report_kwargs = {"noWrite": True} if to_screen else {}

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)

        # is this needed 13th Nov 2014 RJHD
        # (reporting resolution is currently not used by this check)

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)
        n_years = month_ranges.shape[0]

        for month in range(12):

            hourly_climatologies = np.zeros(24)
            hourly_climatologies.fill(st_var.mdi)

            # append all e.g. Januaries together
            this_month, year_ids, dummy = utils.concatenate_months(
                month_ranges[:, month, :], st_var.data, hours=True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(
                month_ranges[:, month, :], all_filtered, hours=True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs.
            this_month = np.ma.array(this_month).reshape(-1, 24)
            this_month_filtered = np.ma.array(this_month_filtered).reshape(-1, 24)

            # get hourly climatology for each month
            for hour in range(24):
                hour_values = this_month[:, hour].compressed()

                # need to have data if this is going to work!
                if len(hour_values) == 0:
                    continue

                # winsorize & climatologies - done to match IDL
                if idl:
                    # an extra -999999 is appended before winsorizing and the
                    # divisor is reduced by one, as in the IDL version
                    winsorized = utils.winsorize(
                        np.append(hour_values, -999999), 0.05, idl=idl)
                    hourly_climatologies[hour] = np.ma.sum(winsorized) / (len(winsorized) - 1)
                else:
                    winsorized = utils.winsorize(hour_values, 0.05, idl=idl)
                    hourly_climatologies[hour] = np.ma.mean(winsorized)

            # can get stations with few obs in a particular variable.
            if len(this_month.compressed()) == 0:
                continue

            # anomalise each hour over month appropriately
            anomalies = this_month - np.tile(
                hourly_climatologies, (this_month.shape[0], 1))
            anomalies_filtered = this_month_filtered - np.tile(
                hourly_climatologies, (this_month_filtered.shape[0], 1))

            if len(anomalies.compressed()) >= 10:
                iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2.  # to match IDL
                if iqr < 1.5:
                    iqr = 1.5
            else:
                iqr = st_var.mdi

            normed_anomalies = anomalies / iqr
            normed_anomalies_filtered = anomalies_filtered / iqr

            # get average anomaly for year
            year_ids = np.array(year_ids)
            monthly_vqvs = np.ma.array(np.zeros(n_years),
                                       mask=np.zeros(n_years, dtype=bool))

            for year in range(n_years):
                year_locs = np.where(year_ids == year)
                this_year = normed_anomalies_filtered[year_locs, :]

                if len(this_year.compressed()) > 0:
                    # need to have data for this to work!
                    if idl:
                        monthly_vqvs[year] = utils.idl_median(
                            this_year.compressed().reshape(-1))
                    else:
                        monthly_vqvs[year] = np.ma.median(this_year)
                else:
                    monthly_vqvs.mask[year] = True

            # low pass filter
            normed_anomalies = coc_low_pass_filter(
                normed_anomalies, year_ids, monthly_vqvs, n_years)

            # copy from distributional_gap.py - refactor!
            # get the threshold value
            bins, bincenters = utils.create_bins(normed_anomalies, 1.)
            # np.histogram ignores masks, so hand it the valid values only
            hist, binEdges = np.histogram(np.ma.compressed(normed_anomalies), bins=bins)

            gaussian = utils.fit_gaussian(bincenters, hist, max(hist),
                                          mu=np.mean(normed_anomalies),
                                          sig=np.std(normed_anomalies))
            minimum_threshold = round(
                1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

            if diagnostics:
                print("{} {} {}".format(
                    iqr, minimum_threshold,
                    1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)))
                print(gaussian)
                print(hist)

            if plots:
                coc_set_up_plot(bincenters, hist, gaussian, variable,
                                threshold=minimum_threshold,
                                sub_par="observations")

            these_flags = station.qc_flags[:, flag_col[v]]
            gap_plot_values, tentative_plot_values = [], []

            # find the gaps and apply the flags, upper tail first then lower.
            # gap_size is 1 here; in DGC it is 2.
            for signed_threshold, is_upper in ((minimum_threshold, True),
                                               (-minimum_threshold, False)):
                gap_start = dgc.dgc_find_gap(hist, binEdges, signed_threshold,
                                             gap_size=1)
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :],
                                             normed_anomalies, these_flags,
                                             year_ids, minimum_threshold,
                                             gap_start, upper=is_upper,
                                             plots=plots, gpv=gap_plot_values,
                                             tpv=tentative_plot_values)

            station.qc_flags[:, flag_col[v]] = these_flags

            # TODO: stations with a very large number of values beyond the
            # thresholds ("spurious stations") are not yet handled

            if plots:
                hist, binEdges = np.histogram(tentative_plot_values, bins=bins)
                # empty bins drawn at 0.01 rather than 0
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters, plot_hist, c='orange', ls='-',
                         label='tentative', where='mid')

                hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters, plot_hist, 'r-', label='flagged',
                         where='mid')

                plt.text(0.1, 0.9, calendar.month_name[month + 1],
                         transform=plt.gca().transAxes)
                leg = plt.legend(loc='lower center', ncol=4,
                                 bbox_to_anchor=(0.5, -0.2), frameon=False,
                                 prop={'size': 13}, labelspacing=0.15,
                                 columnspacing=0.5)
                plt.setp(leg.get_title(), fontsize=14)
                plt.show()
                #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # summary: total, then the counts for flag values 1 and 2
        utils.print_flagged_obs_number(logfile, "Climatological", variable,
                                       len(flag_locs[0]), **report_kwargs)
        if to_screen:
            print("where\n")
        else:
            logfile.write("where\n")

        for label, flag_value in ((" Firm Clim", 1), (" Tentative Clim", 2)):
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == flag_value)[0])
            utils.print_flagged_obs_number(logfile, label, variable, nflags,
                                           **report_kwargs)

        # firm flags match 030220

    station = utils.append_history(station, "Climatological Check")

    return
def sc(station, variable_list, flag_col, start, end, logfile,
       diagnostics=False, plots=False, doMonth=False):
    '''
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    Critical values are derived per calendar month and per time separation
    (1 to 8, in the units of ``station.time.data``) from the first differences
    of the filtered data; the first differences of all unmasked observations
    are then tested against them.

    :param MetVar station: the station object; ``qc_flags`` and variable flags are updated in place
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: print diagnostic output
    :param bool plots: do plots
    :param bool doMonth: account for incomplete months

    :returns: None
    '''
    print("refactor")

    for v, variable in enumerate(variable_list):

        flags = station.qc_flags[:, flag_col[v]]

        st_var = getattr(station, variable)

        # if incomplete year, mask all obs for the incomplete bit
        all_filtered = utils.apply_filter_flags(st_var, doMonth=doMonth,
                                                start=start, end=end)

        reporting_resolution = utils.reporting_accuracy(
            utils.apply_filter_flags(st_var))
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1:
            reporting_resolution = 1

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        # getmaskarray always gives a full boolean array, even if nothing is masked
        filtered_mask = np.ma.getmaskarray(all_filtered)
        good, = np.where(~filtered_mask)

        # time difference from each clean observation to the next clean one
        full_time_diffs = np.ma.zeros(len(all_filtered), dtype=int)
        full_time_diffs.mask = filtered_mask.copy()
        full_time_diffs[good[:-1]] = station.time.data[good[1:]] - \
            station.time.data[good[:-1]]

        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        print("sort the differencing if values were flagged rather than missing")

        filtered_values = all_filtered.compressed()
        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = filtered_mask.copy()
        full_filtered_diffs[good[:-1]] = filtered_values[1:] - filtered_values[:-1]

        # test all values
        data_mask = np.ma.getmaskarray(st_var.data)
        good_to_uncompress, = np.where(~data_mask)

        unmasked_values = st_var.data.compressed()
        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = data_mask.copy()
        full_value_diffs[good_to_uncompress[:-1]] = unmasked_values[1:] - \
            unmasked_values[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        # fancy indexing: a working copy, written back after the tests
        flags = flags[good_to_uncompress]

        # rows are time separations 1..9 (row 8 is never filled below), columns months
        critical_values = np.zeros([9, 12])
        critical_values.fill(st_var.mdi)

        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape, dtype=int)

        for month in range(12):

            # gather this calendar month from every year, concatenate once
            time_chunks, value_chunks = [], []
            for year in range(month_ranges.shape[0]):
                first = month_ranges[year, month, 0]
                last = month_ranges[year, month, 1]
                time_chunks.append(full_time_diffs[first:last])
                value_chunks.append(full_filtered_diffs[first:last])
                month_locs[first:last] = month

            this_month_time_diff = np.ma.concatenate(time_chunks)
            this_month_filtered_diff = np.ma.concatenate(value_chunks)

            for delta in range(1, 9):

                locs = np.ma.where(this_month_time_diff == delta)

                if len(locs[0]) < 100:
                    continue

                iqr = utils.IQR(this_month_filtered_diff[locs])

                if iqr == 0. and delta == 1:
                    critical_values[delta - 1, month] = 6.
                elif iqr == 0:
                    critical_values[delta - 1, month] = st_var.mdi
                else:
                    critical_values[delta - 1, month] = 6. * iqr

                # January 2015 - changed to dynamically calculating the
                # thresholds if less than IQR method ^RJHD
                if plots:
                    import calendar
                    title = "{}, {}-hr differences".format(
                        calendar.month_name[month + 1], delta)
                    line_label = st_var.name
                    xlabel = "First Difference Magnitudes"
                else:
                    title, line_label, xlabel = "", "", ""

                threshold = utils.get_critical_values(
                    this_month_filtered_diff[locs], binmin=0, binwidth=0.5,
                    plots=plots, diagnostics=diagnostics, title=title,
                    line_label=line_label, xlabel=xlabel,
                    old_threshold=critical_values[delta - 1, month])

                if threshold < critical_values[delta - 1, month]:
                    critical_values[delta - 1, month] = threshold

                if plots or diagnostics:
                    print("{} {} {}".format(critical_values[delta - 1, month],
                                            iqr, 6 * iqr))

        month_locs = month_locs[good_to_uncompress]

        if diagnostics:
            print(critical_values[0, :])

        # not less than 5x reporting accuracy
        floor = 5. * reporting_resolution
        too_low = (critical_values != st_var.mdi) & (critical_values <= floor)
        critical_values[too_low] = floor

        if diagnostics:
            print("{} {}".format(critical_values[0, :], floor))

        # check hourly against 2 hourly, if <2/3 the increase to avoid crazy rejection rate
        for month in range(12):
            if critical_values[0, month] != st_var.mdi and \
                    critical_values[1, month] != st_var.mdi:
                if critical_values[0, month] / critical_values[1, month] <= 0.66:
                    critical_values[0, month] = 0.66 * critical_values[1, month]

        if diagnostics:
            print("critical values")
            print(critical_values[0, :])

        # get time differences for unfiltered data
        full_time_diffs = np.ma.zeros(len(st_var.data), dtype=int)
        full_time_diffs.mask = data_mask.copy()
        full_time_diffs[good_to_uncompress[:-1]] = \
            station.time.data[good_to_uncompress[1:]] - \
            station.time.data[good_to_uncompress[:-1]]
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds

        # spikes at the beginning or ends of sections
        for t in range(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3):
                # 10 days before but short gap thereafter

                next_values = st_var.data[good_to_uncompress[t + 1:]]
                good, = np.where(~np.ma.getmaskarray(next_values))

                next_median = np.ma.median(next_values[good[:10]])

                next_diff = np.abs(value_diffs[t])  # out of spike
                median_diff = np.abs(
                    next_median - st_var.data[good_to_uncompress[t]])  # are the remaining ones

                critical = critical_values[time_diffs[t] - 1, month_locs[t]]
                if critical != st_var.mdi:
                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical / 2.) and \
                            (np.abs(next_diff) > critical):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(
                                station.time.data, st_var.data,
                                good_to_uncompress[t],
                                good_to_uncompress[t + 1], start, variable,
                                plots=plots)

            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240):
                # 10 days after but short gap before

                prev_values = st_var.data[good_to_uncompress[:t - 1]]
                good, = np.where(~np.ma.getmaskarray(prev_values))

                prev_median = np.ma.median(prev_values[good[-10:]])

                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(
                    prev_median - st_var.data[good_to_uncompress[t]])

                critical = critical_values[time_diffs[t - 1] - 1, month_locs[t]]
                if critical != st_var.mdi:
                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical / 2.) and \
                            (np.abs(prev_diff) > critical):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(
                                station.time.data, st_var.data,
                                good_to_uncompress[t],
                                good_to_uncompress[t + 1], start, variable,
                                plots=plots)

        # this isn't the nicest way, but a direct copy from IDL
        #   masked arrays might help remove some of the lines
        #   Also, this is relatively slow
        # The tests below are applied in the same order as the original
        # nested conditions; failing any one moves on to the next spike length.
        n_diffs = len(time_diffs)
        for t in range(n_diffs):

            # thresholds for the calendar month of this observation
            critical = critical_values[:, month_locs[t]]

            for spk_len in [1, 2, 3]:

                if not (t >= spk_len and t < n_diffs - spk_len):
                    continue

                lead = t - spk_len  # difference that jumps into the spike
                max_gap = spk_len * 3

                # check if time differences are appropriate, for multi-point spikes
                if not ((np.abs(time_diffs[lead]) <= max_gap) and
                        (np.abs(time_diffs[t]) <= max_gap) and
                        (time_diffs[lead - 1] - 1 < max_gap) and
                        (time_diffs[t + 1] - 1 < max_gap) and
                        all(np.abs(time_diffs[lead + k]) <= max_gap
                            for k in range(1, spk_len))):
                    continue

                # check if differences are valid
                if not ((value_diffs[lead] != st_var.mdi) and
                        (value_diffs[lead] != st_var.fdi) and
                        (critical[time_diffs[lead] - 1] != st_var.mdi)):
                    continue

                # if exceed critical values
                if not (np.abs(value_diffs[lead]) >= critical[time_diffs[lead] - 1]):
                    continue

                # are signs of two differences different
                if math.copysign(1, value_diffs[t]) == \
                        math.copysign(1, value_diffs[lead]):
                    continue

                # are within spike differences small
                if not all(np.abs(value_diffs[lead + k]) <
                           critical[time_diffs[lead + k] - 1] / 2.
                           for k in range(1, spk_len)):
                    continue

                # check if following value is valid
                if not ((value_diffs[t] != st_var.mdi) and
                        (critical[time_diffs[t] - 1] != st_var.mdi) and
                        (value_diffs[t] != st_var.fdi)):
                    continue

                # and if at least critical value
                if not (np.abs(value_diffs[t]) >= critical[time_diffs[t] - 1]):
                    continue

                # test if surrounding differences below 1/2 critical value
                if not (np.abs(value_diffs[lead - 1]) <=
                        critical[time_diffs[lead - 1] - 1] / 2.):
                    continue
                if not (np.abs(value_diffs[t + 1]) <=
                        critical[time_diffs[t + 1] - 1] / 2.):
                    continue

                # set the flags
                flags[lead + 1:t + 1] = 1

                if plots or diagnostics:
                    sc_diagnostics_and_plots(
                        station.time.data, st_var.data,
                        good_to_uncompress[lead + 1],
                        good_to_uncompress[t + 1], start, variable,
                        plots=plots)

        station.qc_flags[good_to_uncompress, flag_col[v]] = flags

        flag_locs, = np.where(station.qc_flags[:, flag_col[v]] != 0)

        utils.print_flagged_obs_number(logfile, "Spike", variable,
                                       len(flag_locs),
                                       noWrite=diagnostics)  # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        do_interactive = False
        if plots and do_interactive:
            import matplotlib.pyplot as plt

            plot_times = utils.times_hours_to_datetime(station.time.data, start)

            plt.clf()
            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            # use the full-length flag column: the local `flags` is 1-D and
            # only covers the unmasked observations
            flg = np.where(station.qc_flags[:, flag_col[v]] == 1)
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)
            plt.show()

    station = utils.append_history(station, "Spike Check")

    return  # sc
def evc_longest_sign_runs(diffs):
    '''
    Lengths of the longest runs of falling and of rising first differences.

    A difference > 0 counts as rising; anything else (including 0) counts as
    falling.  The run still in progress at the end of the series is included.

    :param iterable diffs: first differences

    :returns: (longest falling run, longest rising run)
    '''
    longest_neg, longest_pos = 0, 0
    negs, poss = 0, 0

    for diff in diffs:
        if diff > 0:
            # a rise ends any falling run
            longest_neg = max(longest_neg, negs)
            negs = 0
            poss += 1
        else:
            # a fall (or no change) ends any rising run
            longest_pos = max(longest_pos, poss)
            poss = 0
            negs += 1

    # the final run is never ended by a sign change, so record it here
    longest_neg = max(longest_neg, negs)
    longest_pos = max(longest_pos, poss)

    return longest_neg, longest_pos


def evc(station, variable_list, flag_col, start, end, logfile,
        diagnostics=False, plots=False, idl=False):
    '''
    Excess Variance Check.

    For every variable and calendar month: normalise the anomalies from a
    winsorised hourly climatology, take the spread of each year's normalised
    anomalies and flag whole months whose spread lies too far from the median
    over all years.  For "slp" and "windspeeds" a month is not flagged if it
    looks like a storm (coincident high winds and low pressure, or a long
    monotonic run in the first differences).

    :param obj station: station object; ``qc_flags`` and variable flags are updated in place
    :param list variable_list: names of the station attributes to process
    :param list flag_col: column of ``station.qc_flags`` used for each entry of variable_list
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: print diagnostic output instead of writing to logfile
    :param bool plots: do plots
    :param bool idl: use the IDL-matching winsorize calculation

    :returns: None
    '''
    if plots or diagnostics:
        import matplotlib.pyplot as plt
        import calendar

    # very similar to climatological check - ensure that not duplicating

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        reporting_freq = utils.reporting_frequency(utils.apply_filter_flags(st_var))

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)
        n_years = month_ranges.shape[0]

        month_data_count = np.zeros(month_ranges.shape[0:2])

        # storm screening only applies to pressure and wind speed
        storm_variable = variable in ["slp", "windspeeds"]

        # for each month
        for month in range(12):

            # set up hourly climatologies
            hourly_clims = np.zeros(24)
            hourly_clims.fill(st_var.data.fill_value)

            this_month, year_ids, month_data_count[:, month] = \
                utils.concatenate_months(month_ranges[:, month, :],
                                         st_var.data, hours=True)

            # winsorize and get hourly climatology
            for h in range(24):

                hour_values = this_month[:, h].compressed()

                if len(hour_values) > 100:
                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour_winsorized = utils.winsorize(
                            np.append(hour_values, -999999), 0.05, idl=idl)
                        hourly_clims[h] = np.ma.sum(this_hour_winsorized) / \
                            (len(this_hour_winsorized) - 1)
                    else:
                        this_hour_winsorized = utils.winsorize(
                            hour_values, 0.05, idl=idl)
                        hourly_clims[h] = np.ma.mean(this_hour_winsorized)

            # hours without a climatology are masked out of the anomalies
            hourly_clims = np.ma.masked_where(
                hourly_clims == st_var.data.fill_value, hourly_clims)
            anomalies = this_month - np.tile(hourly_clims,
                                             (this_month.shape[0], 1))

            # extract IQR of anomalies (using 1/2 value to match IDL)
            if len(anomalies.compressed()) >= 10:
                iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2.  # to match IDL
                if iqr < 1.5:
                    iqr = 1.5
            else:
                iqr = st_var.mdi

            normed_anomalies = anomalies / iqr

            variances = np.ma.zeros(n_years)
            variances.mask = [False for i in range(n_years)]
            rep_accuracies = np.zeros(n_years)
            rep_freqs = np.zeros(n_years)

            variances.fill(st_var.mdi)
            rep_accuracies.fill(st_var.mdi)
            rep_freqs.fill(st_var.mdi)

            year_ids = np.array(year_ids)

            # extract variance of normalised anomalies for each year
            for year in range(n_years):

                year_locs = np.where(year_ids == year)

                this_year = normed_anomalies[year_locs, :]
                this_year = this_year.reshape(-1)

                # end of similarity with Climatological check
                if len(this_year.compressed()) >= 30:
                    variances[year] = utils.mean_absolute_deviation(this_year, median=True)
                    rep_accuracies[year] = utils.reporting_accuracy(this_year)
                    rep_freqs[year] = utils.reporting_frequency(this_year)
                else:
                    variances.mask[year] = True

            good = np.where(month_data_count[:, month] >= 100)

            # get median and IQR of variance for all years for this month
            if len(good[0]) >= 10:
                # np.median would ignore the mask and include the mdi fill
                # of years without a variance
                median_variance = np.ma.median(variances[good])
                iqr_variance = utils.IQR(variances[good]) / 2.  # to match IDL
                if iqr_variance < 0.01:
                    iqr_variance = 0.01
            else:
                median_variance = st_var.mdi
                iqr_variance = st_var.mdi

            # if SLP, then get median and MAD of SLP and windspeed for month
            if storm_variable:
                winds = getattr(station, "windspeeds")
                slp = getattr(station, "slp")

                # this calendar month from every year, one row per day
                winds_month = np.ma.concatenate(
                    [winds.data[first:last].reshape(-1, 24)
                     for first, last in month_ranges[:, month, :]], axis=0)
                slp_month = np.ma.concatenate(
                    [slp.data[first:last].reshape(-1, 24)
                     for first, last in month_ranges[:, month, :]], axis=0)

                median_wind = np.ma.median(winds_month)
                median_slp = np.ma.median(slp_month)

                wind_MAD = utils.mean_absolute_deviation(winds_month.compressed())
                slp_MAD = utils.mean_absolute_deviation(slp_month.compressed())

                if diagnostics:
                    print("median windspeed {} m/s, MAD = {}".format(median_wind, wind_MAD))
                    print("median slp {} hPa, MAD = {}".format(median_slp, slp_MAD))

            # now test to see if variance exceeds expected range
            for year in range(n_years):

                if not ((variances[year] != st_var.mdi) and
                        (iqr_variance != st_var.mdi) and
                        (median_variance != st_var.mdi) and
                        (month_data_count[year, month] >= DATA_COUNT_THRESHOLD)):
                    continue

                first = month_ranges[year, month, 0]
                last = month_ranges[year, month, 1]

                # threshold is raised if reporting frequency and resolution of
                # this year doesn't match average
                mismatched_reporting = (rep_accuracies[year] != reporting_resolution) and \
                    (rep_freqs[year] != reporting_freq)

                # if SLP, then need to test if deep low pressure
                # ("hurricane/storm") present as this will increase the
                # variance for this month + year
                if storm_variable:

                    iqr_threshold = 8. if mismatched_reporting else 6.

                    if diagnostics:
                        print("{} {} {} {} {} {} {}".format(
                            np.abs(variances[year] - median_variance) / iqr_variance,
                            variances[year], median_variance, iqr_variance,
                            iqr_threshold, month + 1, year + start.year))

                    if np.abs((variances[year] - median_variance) / iqr_variance) > iqr_threshold:

                        # check for storms
                        winds_this_year = winds.data[first:last]
                        slp_this_year = slp.data[first:last]

                        storm = False
                        if (len(winds_this_year.compressed()) >= 1) and \
                                (len(slp_this_year.compressed()) >= 1):

                            # locations where winds greater than threshold
                            high_winds, = np.where(
                                (winds_this_year - median_wind) / wind_MAD > MAD_THRESHOLD)
                            # and where SLP less than threshold
                            low_slps, = np.where(
                                (median_slp - slp_this_year) / slp_MAD > MAD_THRESHOLD)

                            # if any locations match, then it's a storm
                            match_loc = high_winds[np.in1d(high_winds, low_slps)]

                            if len(match_loc) > 0:
                                storm = True
                        else:
                            # NOTE(review): taken to be the no-data branch;
                            # confirm against the original indentation
                            print("write spurious")

                        # check the SLP first difference series
                        #   to ensure a drop down and climb out of minimum SLP/or climb
                        #   up and down from maximum wind speed
                        if variable == "slp":
                            diffs = np.diff(slp_this_year.compressed())
                        elif variable == "windspeeds":
                            diffs = np.diff(winds_this_year.compressed())

                        biggest_neg, biggest_pos = evc_longest_sign_runs(diffs)

                        if (biggest_neg < 10) and (biggest_pos < 10) and not storm:

                            # not a hurricane, so mask
                            station.qc_flags[first:last, flag_col[v]] = 1
                            message = "No Storm or Hurricane in %i %i - flagging\n" % \
                                (month + 1, year + start.year)
                        else:
                            # hurricane
                            message = "Storm or Hurricane in %i %i - not flagging\n" % \
                                (month + 1, year + start.year)

                        if plots or diagnostics:
                            print(message)
                        else:
                            logfile.write(message)

                        if plots:
                            # plot showing the pressure, pressure first
                            # differences and the wind speeds
                            plot_times = utils.times_hours_to_datetime(
                                station.time.data[first:last], start)

                            evc_plot_slp_wind(plot_times, slp_this_year, diffs,
                                              median_slp, slp_MAD,
                                              winds_this_year, median_wind,
                                              wind_MAD)

                else:

                    iqr_threshold = 10. if mismatched_reporting else 8.

                    if np.abs(variances[year] - median_variance) / iqr_variance > iqr_threshold:

                        if diagnostics:
                            print("flagging {} {}".format(
                                year + start.year, calendar.month_name[month + 1]))
                        # remove the data
                        station.qc_flags[first:last, flag_col[v]] = 1

            if plots:
                plot_variances = (variances - median_variance) / iqr_variance
                plot_variances = np.ma.masked_where(
                    month_data_count[:, month] < DATA_COUNT_THRESHOLD,
                    plot_variances)

                evc_plot_hist(plot_variances, iqr_threshold,
                              "Variance Check - %s - %s" % (variable, calendar.month_name[month + 1]))

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Variance", variable,
                                           len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Variance", variable,
                                           len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 for T, D and SLP 21/8/2014

    station = utils.append_history(station, "Excess Variance Check")

    return  # evc
def dgc_all_obs(station, variable, flags, start, end, plots=False,
                diagnostics=False, idl=False, windspeeds=False, GH=False):
    '''
    RJHD addition working on all observations

    For each calendar month the observations are standardised by the median
    and IQR of the filtered data, a Gaussian (or Gauss-Hermite, if GH) is
    fitted to their histogram to set thresholds, and observations beyond a
    gap in either tail are flagged.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array, updated in place and returned
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics
    :param bool idl: run IDL equivalent routines for median
    :param bool windspeeds: also derive wind speed statistics for the (unfinished) storm screening
    :param bool GH: fit Gauss-Hermite polynomials rather than a plain Gaussian

    :returns: flags - updated flag array
    '''
    if plots:
        import calendar
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    all_filtered = utils.apply_filter_flags(st_var)

    for month in range(12):

        if windspeeds:
            st_var_wind = getattr(station, "windspeeds")

            # get monthly averages
            windspeeds_month = np.ma.concatenate(
                [st_var_wind.data[year[0]:year[1]]
                 for year in month_ranges[:, month, :]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)

        if len(this_month_filtered.compressed()) <= OBS_LIMIT:
            continue

        if idl:
            monthly_median = utils.idl_median(
                this_month_filtered.compressed().reshape(-1))
        else:
            monthly_median = np.ma.median(this_month_filtered)

        iqr = utils.IQR(this_month_filtered.compressed())

        if iqr == 0.0:
            # to get some spread if IQR too small
            iqr = utils.IQR(this_month_filtered.compressed(), percentile=0.05)
            print("Spurious_stations file not yet sorted")

        if iqr == 0.0:
            continue

        monthly_values = np.ma.array(
            (this_month_data.compressed() - monthly_median) / iqr)

        bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
        # finer bins, used for the fitted curve
        dummy, plot_bincenters = utils.create_bins(monthly_values, BIN_SIZE / 10.)

        hist, binEdges = np.histogram(monthly_values, bins=bins)

        if GH:
            # Use Gauss-Hermite polynomials to add skew and kurtosis to
            # Gaussian fit - January 2015 ^RJHD

            initial_values = [np.max(hist), np.mean(monthly_values),
                              np.std(monthly_values),
                              stats.skew(monthly_values),
                              stats.kurtosis(monthly_values)]  # norm, mean, std, skew, kurtosis

            fit = leastsq(utils.residualsGH, initial_values,
                          [bincenters, hist, np.ones(len(hist))])
            res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

            plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

            # adjust to remove the rising bumps seen in some fits -
            # artefacts of GH fitting?
            mid_point = np.argmax(plot_gaussian)
            bad, = np.where(plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
            if len(bad) > 0:
                plot_gaussian[mid_point:][bad[0]:] = FREQUENCY_THRESHOLD / 10.

            bad, = np.where(plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
            if len(bad) > 0:
                plot_gaussian[:mid_point][:bad[-1]] = FREQUENCY_THRESHOLD / 10.

            # extract threshold values
            good_values = np.argwhere(plot_gaussian > FREQUENCY_THRESHOLD)

            l_minimum_threshold = round(plot_bincenters[good_values[0]]) - 1
            u_minimum_threshold = 1 + round(plot_bincenters[good_values[-1]])
        else:
            gaussian = utils.fit_gaussian(bincenters, hist, max(hist),
                                          mu=np.mean(monthly_values),
                                          sig=np.std(monthly_values))

            # assume the same threshold value
            u_minimum_threshold = 1 + round(
                utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
            l_minimum_threshold = -u_minimum_threshold

            plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

        if diagnostics:
            if GH:
                print(hist)
                print(res)
                print("{} {} {}".format(iqr, l_minimum_threshold,
                                        u_minimum_threshold))
            else:
                print(hist)
                print(gaussian)
                print("{} {} {}".format(
                    iqr, u_minimum_threshold,
                    1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)))

        if plots:
            dgc_set_up_plot(plot_gaussian, monthly_values, variable,
                            threshold=(u_minimum_threshold, l_minimum_threshold),
                            sub_par="observations", GH=GH)

            if GH:
                plt.figtext(0.15, 0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' % (res['mean'], res['dispersion'], res['skewness'], res['kurtosis']),
                            color='k', size='small')

        uppercount = len(np.where(monthly_values > u_minimum_threshold)[0])
        lowercount = len(np.where(monthly_values < l_minimum_threshold)[0])

        # this needs refactoring - but lots of variables to pass in
        if plots or diagnostics:
            gap_plot_values = np.array([])

        if uppercount > 0:
            gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold)

            if gap_start != 0:

                for year in month_ranges[:, month, :]:

                    this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                    this_year_flags = np.array(flags[year[0]:year[1]])

                    # np.where ignores masks: masked observations are
                    # explicitly excluded, as in the lower tail below
                    gap_cleaned_locations = np.where(np.ma.filled(
                        ((this_year_data - monthly_median) / iqr) > gap_start,
                        False))

                    this_year_flags[gap_cleaned_locations] = 1
                    flags[year[0]:year[1]] = this_year_flags

                    if plots or diagnostics:
                        gap_plot_values = np.append(
                            gap_plot_values,
                            (this_year_data[gap_cleaned_locations].compressed() - monthly_median) / iqr)

        if lowercount > 0:
            gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold)

            if gap_start != 0:

                for year in month_ranges[:, month, :]:

                    this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                    this_year_flags = np.array(flags[year[0]:year[1]])

                    gap_cleaned_locations = np.where(np.ma.filled(
                        ((this_year_data - monthly_median) / iqr) < gap_start,
                        False))

                    this_year_flags[gap_cleaned_locations] = 1
                    flags[year[0]:year[1]] = this_year_flags

                    if plots or diagnostics:
                        gap_plot_values = np.append(
                            gap_plot_values,
                            (this_year_data[gap_cleaned_locations].compressed() - monthly_median) / iqr)

                    if windspeeds:
                        # NOTE(review): this_year_flags is a local copy that
                        # has already been written back, so the tentative
                        # value 2 never reaches `flags`, and nothing below
                        # uses `separations` - storm screening looks unfinished
                        this_year_flags[gap_cleaned_locations] = 2  # tentative flags

                        slp_average = dgc_get_monthly_averages(
                            this_month_data, OBS_LIMIT, st_var.mdi, MEAN)
                        slp_mad = utils.mean_absolute_deviation(
                            this_month_data, median=True)

                        storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &
                                          (((this_month_data - slp_average) / slp_mad) > 4.5))

                        if len(storms[0]) >= 2:

                            storm_1diffs = np.diff(storms)

                            separations = np.where(storm_1diffs != 1)

                            #for sep in separations:

        if plots:
            hist, binEdges = np.histogram(gap_plot_values, bins=bins)
            # empty bins drawn at 0.01 rather than 0
            plot_hist = np.array([0.01 if h == 0 else h for h in hist])
            plt.step(bincenters, plot_hist, 'r-', label='flagged', where='mid')

            plt.text(0.1, 0.9, calendar.month_name[month + 1],
                     transform=plt.gca().transAxes)
            plt.legend(loc='lower center', ncol=3,
                       bbox_to_anchor=(0.5, -0.2), frameon=False,
                       prop={'size': 13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

        if diagnostics:
            utils.print_flagged_obs_number("", "Distributional Gap", variable,
                                           len(gap_plot_values), noWrite=True)

    return flags  # dgc_all_obs
def dgc_monthly(station, variable, flags, start, end, plots=False,
                diagnostics=False, idl=False):
    '''
    Original Distributional Gap Check

    Works on monthly averages rather than individual observations.  Each
    calendar month's averages are standardised by a climatology and spread
    taken from the filtered data.  Months whose standardised value is at or
    above LARGE_LIMIT are flagged, then the sorted distribution is walked
    outwards from its centre and the tail beyond the first substantial
    asymmetry is flagged as well.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (updated in place and returned)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (not used in this routine)
    :param bool idl: run IDL equivalent routines for median

    :returns: flags - updated flag array
    '''
    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)

    # get monthly averages
    month_average = np.empty(month_ranges.shape[0])
    month_average.fill(st_var.mdi)
    month_average_filtered = np.empty(month_ranges.shape[0])
    month_average_filtered.fill(st_var.mdi)

    all_filtered = utils.apply_filter_flags(st_var)
    for m, month in enumerate(month_ranges):
        data = st_var.data[month[0]:month[1]]
        filtered = all_filtered[month[0]:month[1]]

        month_average[m] = dgc_get_monthly_averages(
            data, OBS_LIMIT, st_var.mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(
            filtered, OBS_LIMIT, st_var.mdi, MEAN)

    # get overall monthly climatologies - use filtered data
    month_average = month_average.reshape(-1, 12)
    month_average_filtered = month_average_filtered.reshape(-1, 12)

    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(st_var.mdi)

    for m in range(12):
        valid_filtered = np.where(month_average_filtered[:, m] != st_var.mdi)

        if len(valid_filtered[0]) >= VALID_MONTHS:
            # indexing with the np.where tuple adds a leading axis, hence [0]
            valid_data = month_average_filtered[valid_filtered, m][0]

            if MEAN:
                clim = np.mean(valid_data)
                # numpy has no "stdev"; np.std is the standard deviation
                spread = np.std(valid_data)
            else:
                if idl:
                    # valid_data is a plain ndarray with the missing values
                    # already removed, so there is nothing to compress
                    clim = utils.idl_median(valid_data.reshape(-1))
                else:
                    clim = np.median(valid_data)
                spread = utils.IQR(valid_data)
                if spread <= SPREAD_LIMIT:
                    spread = SPREAD_LIMIT

            standardised_months[valid_filtered, m] = (
                month_average[valid_filtered, m] - clim) / spread

    standardised_months = standardised_months.reshape(month_ranges.shape[0])

    good_months = np.where(standardised_months != st_var.mdi)

    # must be able to do this with masked arrays
    if plots:
        bins, bincenters = utils.create_bins(
            standardised_months[good_months], BIN_SIZE)
        dummy, plot_bincenters = utils.create_bins(
            standardised_months[good_months], BIN_SIZE / 10.)

        hist, binEdges = np.histogram(standardised_months[good_months],
                                      bins=bins)

        fit = utils.fit_gaussian(
            bincenters, hist, max(hist),
            mu=np.mean(standardised_months[good_months]),
            sig=np.std(standardised_months[good_months]))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        dgc_set_up_plot(plot_gaussian, standardised_months[good_months],
                        variable, sub_par="Months")

    # remove all months with a large standardised offset
    if len(good_months[0]) >= MONTH_LIMIT:

        standardised_months = np.ma.masked_values(standardised_months,
                                                  st_var.mdi)
        # NOTE(review): only positive offsets are selected here although the
        # plot marks both +5 and -5 - confirm whether abs() was intended.
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:

            for lo in large_offsets[0]:
                flags[month_ranges[lo, 0]:month_ranges[lo, 1]] = 1

            if plots:
                hist, binEdges = np.histogram(
                    standardised_months[large_offsets], bins=bins)
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters, plot_hist, 'g-',
                         label='> %i' % LARGE_LIMIT, where='mid', zorder=5)

                plt.axvline(5, c='g')
                plt.axvline(-5, c='g')

        # walk distribution from centre and see if any assymetry
        sort_order = standardised_months[good_months].argsort()
        # sorted values and their month indices, computed once for the walk
        sorted_values = standardised_months[good_months][sort_order]
        sorted_locs = good_months[0][sort_order]

        # integer division - mid_point is used as an index
        mid_point = len(good_months[0]) // 2

        good = True
        step = 1
        while good:
            lower_value = sorted_values[mid_point - step]
            upper_value = sorted_values[mid_point + step]

            if lower_value != upper_value:
                # using IDL notation
                tempvals = [np.abs(lower_value), np.abs(upper_value)]

                if min(tempvals) != 0:
                    if max(tempvals) / min(tempvals) >= 2. and min(
                            tempvals) >= 1.5:
                        # substantial asymmetry in distribution - at least 1.5
                        # from centre and difference of 2.

                        if tempvals[0] == max(tempvals):
                            # LHS
                            bad = sorted_locs[:mid_point - step]
                            if plots:
                                badplot = sorted_values[:mid_point - step]
                        elif tempvals[1] == max(tempvals):
                            # RHS
                            bad = sorted_locs[mid_point + step:]
                            if plots:
                                badplot = sorted_values[mid_point + step:]

                        for b in bad:
                            flags[month_ranges[b, 0]:month_ranges[b, 1]] = 1

                        if plots:
                            hist, binEdges = np.histogram(badplot, bins=bins)
                            plot_hist = np.array(
                                [0.01 if h == 0 else h for h in hist])
                            plt.step(bincenters, plot_hist, 'r-', label='Gap',
                                     where='mid', zorder=4)

                        good = False

            step += 1
            # ">=" rather than "==" so the walk always terminates, even when
            # mid_point is 0 or 1 and step has already passed it
            if step >= mid_point:
                break

    if plots:
        plt.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.2),
                   frameon=False, prop={'size': 13})
        plt.show()
        #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap.png')

    return flags  # dgc_monthly
def detect(station, neighbour, variable, flags, neighbour_count, start, end,
           distance=0, diagnostics=False, plots=False):
    '''
    Detect which observations are outliers

    Differences between target and neighbour are compared with five times
    the IQR of the differences for that calendar month (with a floor of 2).
    Both flags and neighbour_count are updated in place.

    :param MetVar station: station object (target)
    :param MetVar neighbour: station object (neighbour)
    :param string variable: which variable to process
    :param array flags: array to store how many neighbours think each obs is bad
    :param array neighbour_count: how many neighbours present at each obs
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param int distance: separation of target and neighbour
    :param bool diagnostics: extra output (not used in this routine)
    :param bool plots: make figures

    :returns: None
    '''
    FILTERING_FLAG_COL = {
        "temperatures": [0, 1, 4, 5, 8, 12, 16, 20, 27, 41, 44, 58],
        "dewpoints": [0, 2, 4, 6, 8, 9, 13, 17, 21, 28, 30, 31, 32, 42, 45, 59],
        "slp": [0, 3, 4, 7, 11, 15, 19, 23, 29, 43, 46, 60],
        "windspeeds": [0, 4, 10, 14, 18, 22, 56, 62, 63, 64]
    }  # not used, but ready for it.

    st_var = getattr(station, variable)
    neigh_var = getattr(neighbour, variable)

    # filter by flags - not all (no Climatological [24,25], or Odd cluster [54,55,56,57]), T record check not in D,
    total_flags = np.sum(station.qc_flags[:, FILTERING_FLAG_COL[variable]],
                         axis=1)
    st_filtered = np.ma.masked_where(total_flags == 1, st_var.data)
    # NOTE(review): the neighbour is masked using the *station's* flags, and
    # only where exactly one flag is set - confirm both are intended.
    neigh_filtered = np.ma.masked_where(total_flags == 1, neigh_var.data)

    # match the observation times
    # (compares the underlying .data with the missing data indicator)
    match = np.where(
        np.logical_and((st_filtered.data != st_var.mdi),
                       (neigh_filtered.data != neigh_var.mdi)))

    month_ranges = utils.month_starts_in_pairs(start, end).reshape(
        -1, 12, 2)  # in year-long sets of pairs.

    if len(match[0]) >= 100:

        neighbour_count[match] += 1  # number of neighbours with data present

        differences = np.ma.zeros(len(st_filtered))
        differences.fill(st_var.mdi)
        differences.mask = True

        differences[match] = (st_filtered.data[match] -
                              neigh_filtered.data[match])
        differences.mask[match] = False

        all_iqrs = np.zeros(len(differences))

        # get monthly IQR values
        for month in range(12):

            this_month, dummy1, dummy2 = utils.concatenate_months(
                month_ranges[:, month, :], differences, hours=False)

            if len(this_month.compressed()) > 4:
                iqr = utils.IQR(this_month.compressed())
                if iqr <= 2.:
                    iqr = 2.
            else:
                iqr = 2.

            # and copy back into the array
            for year in month_ranges[:, month, :]:
                all_iqrs[year[0]:year[1]] = iqr

        if plots:
            plot_target_neigh_diffs_dist(differences, all_iqrs.min())

        dubious = np.ma.where(np.ma.abs(differences) > 5. * all_iqrs)

        if len(dubious[0]) >= 1.:

            if variable == "slp":
                # check if they are storms
                # use the per-observation IQRs, as for "dubious" above; the
                # scalar iqr left over from the month loop is only the last
                # calendar month's value
                positive = np.ma.where(differences > 5. * all_iqrs)
                negative = np.ma.where(differences < -5. * all_iqrs)

                # if majority negative (2/3) and separation > 100
                if (distance > 100.) and (
                        float(len(positive[0])) / len(dubious[0]) < 0.333):
                    if len(positive[0]) > 0:
                        flags[positive] += 1
                    if len(negative[0]) > 0:
                        # NOTE(review): this undoes the count for every
                        # matched obs, not just the negative ones - confirm
                        # that "match" rather than "negative" is intended.
                        neighbour_count[match] -= 1
                else:
                    flags[dubious] += 1
            else:
                flags[dubious] += 1

    return  # detect
def dgc_all_obs(station, variable, flags, start, end, logfile, plots=False,
                diagnostics=False, idl=False, windspeeds=False, GH=False,
                doMonth=False):
    '''
    RJHD addition working on all observations

    For each calendar month the observations are anomalised by the median of
    the filtered data and standardised by its IQR.  A Gaussian (optionally
    Gauss-Hermite) fit to the histogram gives threshold values; if a gap is
    found in the distribution beyond a threshold, observations further from
    the centre than the gap are flagged.  With windspeeds set, low values
    that coincide with high wind speeds are treated as storms and unflagged.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (updated in place and returned)
    :param datetime start: data start
    :param datetime end: data end
    :param file logfile: passed to utils.print_flagged_obs_number
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (extra printed output)
    :param bool idl: run IDL equivalent routines for median
    :param bool windspeeds: use wind speed data to check for storms
    :param bool GH: use Gauss-Hermite polynomials for the fit
    :param bool doMonth: mask the incomplete final year when fitting

    :returns: flags - updated flag array
    '''
    if plots:
        import matplotlib.pyplot as plt

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    # extract variable
    st_var = getattr(station, variable)

    # apply flags (and mask incomplete year if appropriate)
    all_filtered = utils.apply_filter_flags(st_var, doMonth=doMonth,
                                            start=start, end=end)

    st_var_complete_year = copy.deepcopy(st_var)
    if doMonth:
        # restrict the incomplete year if appropriate - keep other flagged obs.
        full_year_end = utils.get_first_hour_this_year(start, end)
        st_var_complete_year.data.mask[full_year_end:] = True

    # if requiring wind data, extract it once for all calendar months
    if windspeeds:
        st_var_wind = getattr(station, "windspeeds")
        if doMonth:
            # restrict the incomplete year if appropriate - on a copy, so
            # that the station's own wind data are not left masked
            st_var_wind = copy.deepcopy(st_var_wind)
            st_var_wind.data.mask[full_year_end:] = True

    for month in range(12):

        # if requiring wind data, find monthly averages
        if windspeeds:
            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):
                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        # pull data from each calendar month together
        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)
        this_month_complete, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var_complete_year.data, hours=False)

        # if enough clean and complete data for this calendar month find the median and IQR
        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(
                    this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(),
                                percentile=0.05)
                print("Spurious_stations file not yet sorted")

            # if have an IQR, anomalise using median and standardise using IQR
            if iqr != 0.0:

                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)
                complete_values = np.ma.array(
                    (this_month_complete.compressed() - monthly_median) / iqr)

                # use complete years only for the histogram - aiming to find outliers.
                bins, bincenters = utils.create_bins(complete_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(
                    complete_values, BIN_SIZE / 10.)
                hist, binEdges = np.histogram(complete_values, bins=bins)

                # Change to monthly updates Oct 2017
                # Thought about changing distribution to use filtered values
                # But this changes the test beyond just dealing with additional months
                # Commented out lines below would be alternative.
                # bins, bincenters = utils.create_bins(filtered_values, BIN_SIZE)
                # dummy, plot_bincenters = utils.create_bins(filtered_values, BIN_SIZE/10.)
                # hist, binEdges = np.histogram(filtered_values, bins = bins)

                # used filtered (incl. incomplete year mask) to determine the distribution.
                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    # Feb 2019 - if large amounts off centre, can affect initial values
                    # switched to median and MAD
                    initial_values = [
                        np.max(hist),
                        np.median(complete_values),
                        utils.mean_absolute_deviation(complete_values,
                                                      median=True),
                        stats.skew(complete_values),
                        stats.kurtosis(complete_values)
                    ]  # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist, np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    good_values = np.argwhere(
                        plot_gaussian > FREQUENCY_THRESHOLD)

                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(
                        plot_bincenters[good_values[-1]])

                    if diagnostics:
                        print(hist)
                        print(res)
                        print("%s %s %s" % (iqr, l_minimum_threshold,
                                            u_minimum_threshold))

                # or just a standard Gaussian
                else:
                    # NOTE(review): this was utils.mean_absolute_value, a name
                    # used nowhere else in the visible code; assumed to be a
                    # typo for the MAD used in the GH branch - confirm.
                    gaussian = utils.fit_gaussian(
                        bincenters, hist, max(hist),
                        mu=np.median(complete_values),
                        sig=utils.mean_absolute_deviation(complete_values,
                                                          median=True))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                    if diagnostics:
                        print(hist)
                        print(gaussian)
                        print("%s %s %s" % (
                            iqr, u_minimum_threshold,
                            1. + utils.invert_gaussian(FREQUENCY_THRESHOLD,
                                                       gaussian)))

                if plots:
                    dgc_set_up_plot(plot_gaussian, complete_values, variable,
                                    threshold=(u_minimum_threshold,
                                               l_minimum_threshold),
                                    sub_par="observations", GH=GH)

                    if GH:
                        plt.figtext(
                            0.15, 0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],
                             res['kurtosis']),
                            color='k', size='small')

                # now trying to find gaps in the distribution
                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics:
                    gap_plot_values = np.array([])

                # do one side of distribution and then other
                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             u_minimum_threshold)

                    if gap_start != 0:
                        # if found a gap, then go through each year for this calendar month
                        # and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            # not using filtered - checking all available data
                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.ma.where(
                                ((this_year_data - monthly_median) / iqr) >
                                gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print("Upper {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0])))
                                    print("%s %s" % (
                                        gap_cleaned_locations,
                                        this_year_data[gap_cleaned_locations]))

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             l_minimum_threshold)

                    if gap_start != 0:
                        # if found a gap, then go through each year for this calendar month
                        # and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.ma.where(
                                np.logical_and(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start,
                                    this_year_data.mask != True))
                            # add flag requirement for low pressure bit if appropriate
                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print("Lower {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0])))
                                    print("%s %s" % (
                                        gap_cleaned_locations,
                                        this_year_data[gap_cleaned_locations]))

                            # if doing SLP then do extra checks for storms
                            if windspeeds:
                                windspeeds_year = np.ma.array(
                                    st_var_wind.data[year[0]:year[1]])

                                this_year_flags[
                                    gap_cleaned_locations] = 2  # tentative flags

                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                    MEAN)
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)

                                # need to ensure that this_year_data is less than slp_average, hence order of test
                                storms, = np.ma.where(
                                    (((windspeeds_year -
                                       windspeeds_month_average) /
                                      windspeeds_month_mad) > MAD_THRESHOLD) &
                                    (((slp_average - this_year_data) /
                                      slp_mad) > MAD_THRESHOLD))

                                # using IDL terminology
                                if len(storms) >= 2:
                                    # use the first difference series to find when there are gaps in
                                    # contiguous sequences of storm observations - want to split up into
                                    # separate storm events
                                    storm_1diffs = np.diff(storms)
                                    separations, = np.where(storm_1diffs != 1)

                                    # expand around storm signal so that all low SLP values covered, and unflagged
                                    if len(separations) >= 1:
                                        print(" multiple storms in {} {}".format(
                                            y + start.year, month))

                                        # if more than one storm signal that month, then use intervals
                                        # in the first difference series to expand around the first interval alone
                                        storm_start = 0
                                        storm_finish = separations[0] + 1
                                        first_storm = dgc_expand_storms(
                                            storms[storm_start:storm_finish],
                                            len(this_year_data))
                                        final_storms = copy.deepcopy(
                                            first_storm)

                                        for j in range(len(separations)):
                                            # then do the rest in a loop
                                            if j + 1 == len(separations):
                                                # final one
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] + 1:],
                                                    len(this_year_data))
                                            else:
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] + 1:
                                                           separations[j + 1] + 1],
                                                    len(this_year_data))

                                            final_storms = np.append(
                                                final_storms, this_storm)

                                    else:
                                        # else just expand around the signal by 6 hours either way
                                        final_storms = dgc_expand_storms(
                                            storms, len(this_year_data))

                                else:
                                    final_storms = storms

                                if len(storms) >= 1:
                                    print("Tropical Storm signal in {} {}".format(
                                        y + start.year, month))
                                    this_year_flags[final_storms] = 0

                            # and write flags back into array
                            flags[year[0]:year[1]] = this_year_flags

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label='flagged',
                             where='mid')
                    import calendar
                    plt.text(0.1, 0.9, calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    plt.legend(loc='lower center', ncol=3,
                               bbox_to_anchor=(0.5, -0.2), frameon=False,
                               prop={'size': 13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    nflags, = np.where(flags != 0)
    utils.print_flagged_obs_number(logfile, "Distributional Gap All", variable,
                                   len(nflags), noWrite=diagnostics)

    return flags  # dgc_all_obs