def diurnal_cycle_check(obs_var, station, config_file, plots=False, diagnostics=False, best_fit_diurnal=None, best_fit_uncertainty=None):
    """
    Use offset to find days where cycle doesn't match

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object for the station
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param array best_fit_diurnal: best-fit diurnal offset for each day (calculated if not supplied)
    :param array best_fit_uncertainty: uncertainty of the best fit for each day (calculated if not supplied)
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    diurnal_offset = int(utils.read_qc_config(config_file, "DIURNAL-{}".format(obs_var.name), "peak"))

    # roll the hour axis so that the overall best-fit peak sits at index 11
    hours = np.arange(24)
    hours = np.roll(hours, 11 - int(diurnal_offset))

    if diurnal_offset != MISSING:

        if (best_fit_diurnal is None) and (best_fit_uncertainty is None):
            best_fit_diurnal, best_fit_uncertainty = prepare_data(station, obs_var)

        # find locations where the overall best fit does not match the daily fit
        potentially_spurious = np.ones(best_fit_diurnal.shape[0]) * MISSING
        for d, (fit, uncertainty) in enumerate(zip(best_fit_diurnal, best_fit_uncertainty)):
            if fit != MISSING:
                min_range = 11 - uncertainty
                max_range = 11 + uncertainty
                offset_loc, = np.where(hours == fit)

                # find where the best fit falls outside the range for this particular day
                if offset_loc < min_range or offset_loc > max_range:
                    potentially_spurious[d] = 1
                else:
                    potentially_spurious[d] = 0

        # now check there are sufficient issues in running 30 day periods
        """Any periods >30 days where the diurnal cycle deviates from the expected
        phase by more than this uncertainty, without three consecutive good or
        missing days or six consecutive days consisting of a mix of only good or
        missing values, are deemed dubious and the entire period of data
        (including all non-temperature elements) is flagged"""

        n_good = 0
        n_miss = 0
        n_not_bad = 0
        total_points = 0
        total_not_miss = 0
        bad_locs = np.zeros(best_fit_diurnal.shape[0])

        for d in range(best_fit_diurnal.shape[0]):

            if potentially_spurious[d] == 1:
                # if bad, just add one
                n_good = 0
                n_miss = 0
                n_not_bad = 0
                total_points += 1
                total_not_miss += 1

            else:
                # found a non-bad value - so check the previous run
                # if have reached limits on good/missing
                if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):
                    # sufficient good, missing or not-bad data
                    if total_points >= 30:
                        # if have collected enough others, then set flag
                        if float(total_not_miss) / total_points >= 0.5:
                            bad_locs[d - total_points: d] = 1

                    # reset counters
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points = 0
                    total_not_miss = 0

                # and deal with this point
                total_points += 1
                if potentially_spurious[d] == 0:
                    # if good
                    n_good += 1
                    n_not_bad += 1
                    if n_miss != 0:
                        n_miss = 0
                    total_not_miss += 1

                elif potentially_spurious[d] == MISSING:
                    # if missing data
                    n_miss += 1
                    n_not_bad += 1
                    if n_good != 0:
                        n_good = 0

        # run through all days
        # find zero point of day counter in data preparation part
        day_counter_start = dt.datetime(np.unique(station.years)[0],
                                        np.unique(station.months)[0],
                                        np.unique(station.days)[0])

        # find the bad days in the times array
        # (iterate over the indices of flagged days, not the 0/1 values themselves)
        for day in np.where(bad_locs == 1)[0]:
            this_day = day_counter_start + dt.timedelta(days=int(day))

            locs, = np.where(np.logical_and.reduce((station.years == this_day.year,
                                                    station.months == this_day.month,
                                                    station.days == this_day.day)))
            flags[locs] = "U"

        # append flags to object
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:
            print("Diurnal Check {}".format(obs_var.name))
            print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    else:
        if diagnostics:
            print("Diurnal fit not found")

    return # diurnal_cycle_check
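
# Illustration only (not called by the QC suite): the roll in diurnal_cycle_check
# centres the station's overall best-fit peak at index 11, so a daily fit agrees
# with the overall cycle when its position lands within 11 +/- uncertainty.
# The offset value below is hypothetical.
def _demo_diurnal_centring():
    import numpy as np

    diurnal_offset = 15                           # hypothetical best-fit peak hour
    hours = np.roll(np.arange(24), 11 - diurnal_offset)
    loc, = np.where(hours == diurnal_offset)
    print(loc)                                    # [11] -- the peak sits at the array's centre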
def repeating_value(obs_var, times, config_file, plots=False, diagnostics=False):
    """
    AKA straight string

    Use config file to read threshold values.  Then find strings which exceed threshold.

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # remove calm periods for wind speeds when (a) calculating thresholds and (b) identifying streaks
    this_var = copy.deepcopy(obs_var)
    if obs_var.name == "wind_speed":
        calms, = np.ma.where(this_var.data == 0)
        this_var.data[calms] = utils.MDI
        this_var.data.mask[calms] = True

    flags = np.array(["" for i in range(this_var.data.shape[0])])
    compressed_flags = np.array(["" for i in range(this_var.data.compressed().shape[0])])

    # retrieve the threshold and store in another dictionary
    threshold = {}
    try:
        th = utils.read_qc_config(config_file, "STREAK-{}".format(this_var.name), "Straight")
        threshold["Straight"] = float(th)
    except KeyError:
        # no threshold set
        print("Threshold missing in config file")
        get_repeating_string_threshold(this_var, config_file, plots=plots, diagnostics=diagnostics)
        th = utils.read_qc_config(config_file, "STREAK-{}".format(this_var.name), "Straight")
        threshold["Straight"] = float(th)

    # only process further if there is enough data
    if len(this_var.data.compressed()) > 1:

        repeated_string_lengths, grouped_diffs, strings = prepare_data_repeating_string(this_var, plots=plots, diagnostics=diagnostics)

        # above threshold
        bad, = np.where(repeated_string_lengths >= threshold["Straight"])

        # flag identified strings
        for string in bad:
            start = int(np.sum(grouped_diffs[:strings[string], 1]))
            end = start + int(grouped_diffs[strings[string], 1]) + 1
            compressed_flags[start: end] = "K"

            if plots:
                plot_streak(times, this_var, start, end)

        # undo compression and write into original object (the one with calm periods)
        flags[this_var.data.mask == False] = compressed_flags
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Repeated Strings {}".format(this_var.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # repeating_value
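
# Illustration only (not called by the QC suite): sketch of the bookkeeping the
# start/end arithmetic in repeating_value relies on.  grouped_diffs is assumed
# here to be a run-length encoding of the first differences (rows of
# (value, count)); the real structure comes from prepare_data_repeating_string.
def _demo_string_indexing():
    import itertools
    import numpy as np

    data = np.array([3.0, 5.0, 5.0, 5.0, 5.0, 6.0])
    diffs = np.diff(data)                                     # [2. 0. 0. 0. 1.]
    grouped_diffs = np.array([[k, len(list(g))] for k, g in itertools.groupby(diffs)])
    strings, = np.where(grouped_diffs[:, 0] == 0)             # runs of zero difference
    for string in strings:
        start = int(np.sum(grouped_diffs[:string, 1]))        # offset into the diff array
        end = start + int(grouped_diffs[string, 1]) + 1       # +1: diffs are one shorter than data
        print(data[start:end])                                # [5. 5. 5. 5.]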
def identify_spikes(obs_var, times, config_file, plots=False, diagnostics=False):
    """
    Use config file to read in critical values, and then assess to find spikes

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # TODO check works with missing data (compressed?)
    # TODO monthly?

    masked_times = np.ma.masked_array(times, mask=obs_var.data.mask)

    time_diffs = np.ma.diff(masked_times) / np.timedelta64(1, "m") # presuming minutes
    value_diffs = np.ma.diff(obs_var.data)

    if len(value_diffs.mask.shape) == 0:
        # single mask value, replace with array of True/Falses
        if value_diffs.mask:
            value_diffs.mask = np.ones(value_diffs.shape)
        else:
            value_diffs.mask = np.zeros(value_diffs.shape)

    # get thresholds for each unique time difference
    unique_diffs = np.unique(time_diffs.compressed())

    # retrieve the critical values
    critical_values = {}
    for t_diff in unique_diffs:
        try:
            c_value = utils.read_qc_config(config_file, "SPIKE-{}".format(obs_var.name), "{}".format(t_diff))
            critical_values[t_diff] = float(c_value)
        except KeyError:
            # no critical value for this time difference
            pass

    # if none have been read, give an option to calculate in case that was the reason for none
    if len(critical_values) == 0:
        get_critical_values(obs_var, times, config_file, plots=plots, diagnostics=diagnostics)

        # and try again
        for t_diff in unique_diffs:
            try:
                c_value = utils.read_qc_config(config_file, "SPIKE-{}".format(obs_var.name), "{}".format(t_diff))
                critical_values[t_diff] = float(c_value)
            except KeyError:
                # no critical value for this time difference
                pass

    # pre-select for each time difference that can be tested
    for t_diff in unique_diffs:

        if t_diff == 0:
            # not a spike or jump, but two values at the same time.
            # should be a zero value difference, so fitting a histogram is not going to work
            # handled in a separate test
            continue

        # new blank flag array
        flags = np.array(["" for i in range(obs_var.data.shape[0])])

        t_locs, = np.where(time_diffs == t_diff)

        try:
            c_locs, = np.where(np.abs(value_diffs[t_locs]) > critical_values[t_diff])
        except KeyError:
            # no critical value for this time difference
            continue # to next loop

        # TODO - sort spikes at very beginning or very end of sequence,
        #        when don't have a departure from/return to a normal level

        # potential spikes
        for ps, possible_in_spike in enumerate(t_locs[c_locs]):

            is_spike = False
            spike_len = 1
            while spike_len <= MAX_SPIKE_LENGTH:
                # test for each possible length to see if identified
                try:
                    out_spike_t_diff = time_diffs[possible_in_spike + spike_len]
                    possible_out_spike = value_diffs[possible_in_spike + spike_len]
                except IndexError:
                    # got to end of data run, can't test final value at the moment
                    break

                # need to test mask/unmasked using array rather than values extracted above
                # as if values unmasked, then no mask attribute to test!
                if time_diffs.mask[possible_in_spike + spike_len] == False and \
                        value_diffs.mask[possible_in_spike + spike_len] == False:
                    try:
                        # find critical value for time-difference of way out of spike
                        out_critical_value = critical_values[out_spike_t_diff]
                    except KeyError:
                        # don't have a value for this time difference, so use the maximum of all as a proxy
                        out_critical_value = max(critical_values.values())
                else:
                    # time or value difference masked
                    out_critical_value = max(critical_values.values())

                if np.abs(possible_out_spike) > out_critical_value:
                    # check that the signs are opposite
                    if np.sign(value_diffs[possible_in_spike]) != np.sign(value_diffs[possible_in_spike + spike_len]):
                        is_spike = True
                        break

                spike_len += 1

            if is_spike and spike_len >= 1:
                # test within-spike differences (choosing the correct time difference)
                within = 1
                while within < spike_len:
                    within_t_diff = time_diffs[possible_in_spike + within]
                    if time_diffs.mask[possible_in_spike + within] == False:
                        try:
                            within_critical_value = critical_values[within_t_diff]
                        except KeyError:
                            # don't have a value for this time difference, so use the maximum of all as a proxy
                            within_critical_value = max(critical_values.values())
                    else:
                        # time difference masked
                        within_critical_value = max(critical_values.values())

                    if value_diffs.mask[possible_in_spike + within] == False:
                        if value_diffs[possible_in_spike + within] > within_critical_value / 2.:
                            is_spike = False
                    else:
                        # if masked then no data, so can't say if it's not a spike
                        pass

                    within += 1

            if is_spike:
                # test either side (either before or after being too big unsets the spike)
                try:
                    before_t_diff = time_diffs[possible_in_spike - 1]
                    if time_diffs.mask[possible_in_spike - 1] == False:
                        before_critical_value = critical_values[before_t_diff]
                    else:
                        # time difference masked
                        before_critical_value = max(critical_values.values())
                except KeyError:
                    # don't have a value for this time difference, so use the maximum of all as a proxy
                    before_critical_value = max(critical_values.values())
                except IndexError:
                    # off the front of the data array
                    before_critical_value = max(critical_values.values())

                try:
                    after_t_diff = time_diffs[possible_in_spike + spike_len + 1]
                    if time_diffs.mask[possible_in_spike + spike_len + 1] == False:
                        after_critical_value = critical_values[after_t_diff]
                    else:
                        # time difference masked
                        after_critical_value = max(critical_values.values())
                except KeyError:
                    # don't have a value for this time difference, so use the maximum of all as a proxy
                    after_critical_value = max(critical_values.values())
                except IndexError:
                    # off the back of the data array
                    after_critical_value = max(critical_values.values())

                try:
                    if value_diffs.mask[possible_in_spike - 1] == False:
                        if value_diffs[possible_in_spike - 1] > before_critical_value / 2.:
                            # before-spike difference fails test
                            is_spike = False
                except IndexError:
                    # off the front of the data array
                    pass

                try:
                    if value_diffs.mask[possible_in_spike + spike_len + 1] == False:
                        if value_diffs[possible_in_spike + spike_len + 1] > after_critical_value / 2.:
                            # after-spike difference fails test
                            is_spike = False
                except IndexError:
                    # off the back of the data array
                    pass

            # if the spike is still set, set the flags
            if is_spike:
                # "+1" because of difference arrays
                flags[possible_in_spike + 1: possible_in_spike + 1 + spike_len] = "S"

                # diagnostic plots
                if plots:
                    plot_spike(times, obs_var, possible_in_spike + 1, spike_len)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:
            print("Spike {}".format(obs_var.name))
            print("   Time Difference: {} minutes".format(t_diff))
            print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # identify_spikes
def frequent_values(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use config file to read frequent values.  Check each month to see if any appear.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    all_years = np.unique(station.years)

    # work through each month, and then year
    for month in range(1, 13):

        # read in bin-width and suspect bins for this month
        try:
            width = float(utils.read_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "width"))
            suspect_bins = utils.read_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "{}".format(month), islist=True)
        except KeyError:
            print("Information missing in config file")
            identify_values(obs_var, station, config_file, plots=plots, diagnostics=diagnostics)
            width = float(utils.read_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "width"))
            suspect_bins = utils.read_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "{}".format(month), islist=True)

        # skip on if nothing to find
        if len(suspect_bins) == 0:
            continue

        # work through each year
        for year in all_years:
            locs, = np.where(np.logical_and(station.months == month, station.years == year))

            month_data = obs_var.data[locs]

            # skip if no data
            if np.ma.count(month_data) == 0:
                continue

            month_flags = np.array(["" for i in range(month_data.shape[0])])

            # adjust bin widths according to reporting accuracy
            resolution = utils.reporting_accuracy(month_data)

            if resolution <= 0.5:
                bins = utils.create_bins(month_data, 0.5, obs_var.name)
            else:
                bins = utils.create_bins(month_data, 1.0, obs_var.name)
            hist, bin_edges = np.histogram(month_data, bins)

            # scan through the histogram
            #   check if a bin is the maximum of a local area ("ROLLING")
            for b, bar in enumerate(hist):
                if (b > ROLLING // 2) and (b <= (len(hist) - ROLLING // 2)):
                    target_bins = hist[b - (ROLLING // 2): b + (ROLLING // 2) + 1]

                    # if sufficient obs, the local maximum, and contains > 50% of the window's data
                    if bar >= utils.DATA_COUNT_THRESHOLD:
                        if bar == target_bins.max():
                            if (bar / target_bins.sum()) > RATIO:
                                # this bin meets all the criteria
                                if bins[b] in suspect_bins:
                                    # find observations (month & year) to flag!
                                    flag_locs = np.where(np.logical_and(month_data >= bins[b], month_data < bins[b + 1]))
                                    month_flags[flag_locs] = "F"

            # copy flags for this year into main array
            flags[locs] = month_flags

            # diagnostic plots
            if plots:
                import matplotlib.pyplot as plt
                plt.step(bins[1:], hist, color='k', where="pre")
                plt.yscale("log")

                plt.ylabel("Number of Observations")
                plt.xlabel(obs_var.name.capitalize())
                plt.title("{} - month {}".format(station.id, month))

                bad_hist = np.copy(hist)
                for b, bar in enumerate(bad_hist):
                    if bins[b] not in suspect_bins:
                        bad_hist[b] = 0

                plt.step(bins[1:], bad_hist, color='r', where="pre")
                plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Frequent Values {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # frequent_values
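
# Illustration only (not called by the QC suite): the local-maximum window test
# used in frequent_values.  The ROLLING/RATIO values below are hypothetical
# stand-ins for the module constants, and the histogram is made up.
def _demo_frequent_window():
    import numpy as np

    ROLLING, RATIO = 7, 0.5
    hist = np.array([3, 4, 2, 2, 40, 3, 2, 4, 5])
    b = 4                                          # candidate bin (satisfies b > ROLLING // 2)
    target_bins = hist[b - ROLLING // 2: b + ROLLING // 2 + 1]
    # a local maximum holding more than half of the window's observations is suspect
    print(hist[b] == target_bins.max() and hist[b] / target_bins.sum() > RATIO)   # True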
def variance_check(obs_var, station, config_file, plots=False, diagnostics=False, winsorize=True):
    """
    Use distribution of monthly variances to identify outlying months.  Thresholds
    are read from, or calculated and stored in, the config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    # get hourly climatology for each month
    for month in range(1, 13):

        month_locs, = np.where(station.months == month)

        variances = prepare_data(obs_var, station, month, diagnostics=diagnostics, winsorize=winsorize)

        try:
            average_variance = float(utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name), "{}-average".format(month)))
            variance_spread = float(utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name), "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var, station, config_file, plots=plots, diagnostics=diagnostics)
            average_variance = float(utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name), "{}-average".format(month)))
            variance_spread = float(utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name), "{}-spread".format(month)))

        if average_variance == utils.MDI and variance_spread == utils.MDI:
            # couldn't be calculated, move on
            continue

        bad_years, = np.where(np.abs(variances - average_variance) / variance_spread > SPREAD_THRESHOLD)

        # prepare wind and pressure data in case needed to check for storms
        if obs_var.name in ["station_level_pressure", "sea_level_pressure", "wind_speed"]:
            wind_monthly_data = station.wind_speed.data[month_locs]
            if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                pressure_monthly_data = obs_var.data[month_locs]
            else:
                pressure_monthly_data = station.sea_level_pressure.data[month_locs]

            if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                    len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                # need sufficient data for the storm check to work, else can't tell
                # move on
                continue

            wind_average = utils.average(wind_monthly_data)
            wind_spread = utils.spread(wind_monthly_data)

            pressure_average = utils.average(pressure_monthly_data)
            pressure_spread = utils.spread(pressure_monthly_data)

        # go through each bad year for this month
        all_years = np.unique(station.years)
        for year in bad_years:

            # corresponding locations
            ym_locs, = np.where(np.logical_and(station.months == month, station.years == all_years[year]))

            # if pressure or wind speed, need to do some further checking before applying flags
            if obs_var.name in ["station_level_pressure", "sea_level_pressure", "wind_speed"]:

                # pull out the data
                wind_data = station.wind_speed.data[ym_locs]
                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    pressure_data = obs_var.data[ym_locs]
                else:
                    pressure_data = station.sea_level_pressure.data[ym_locs]

                # need sufficient data for the storm check to work, else can't tell
                if len(pressure_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                        len(wind_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                    # move on
                    continue

                # find locations of high wind speeds and low pressures, cross match
                high_winds, = np.ma.where((wind_data - wind_average) / wind_spread > STORM_THRESHOLD)
                low_pressures, = np.ma.where((pressure_average - pressure_data) / pressure_spread > STORM_THRESHOLD)

                match = np.in1d(high_winds, low_pressures)

                couldbe_storm = False
                if len(match) > 0:
                    # this could be a storm, either at a tropical station (relatively
                    # constant pressure) or out of season in the mid-latitudes
                    couldbe_storm = True

                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    diffs = np.ma.diff(pressure_data)
                elif obs_var.name == "wind_speed":
                    diffs = np.ma.diff(wind_data)

                # count up the largest number of sequential negative and positive differences
                negs, poss = 0, 0
                biggest_neg, biggest_pos = 0, 0

                for diff in diffs:

                    if diff > 0:
                        if negs > biggest_neg:
                            biggest_neg = negs
                        negs = 0
                        poss += 1
                    else:
                        if poss > biggest_pos:
                            biggest_pos = poss
                        poss = 0
                        negs += 1

                if (biggest_neg < 10) and (biggest_pos < 10) and not couldbe_storm:
                    # insufficient to identify as a storm (HadISD values)
                    # leave flags set
                    pass
                else:
                    # could be a storm, so better to leave this month unflagged
                    # zero-length array to flag
                    ym_locs = np.ma.array([])

            # copy over the flags, if any
            if len(ym_locs) != 0:
                # and set the flags
                flags[ym_locs] = "V"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            scaled_variances = ((variances - average_variance) / variance_spread)
            bins = utils.create_bins(scaled_variances, 0.25, obs_var.name)
            hist, bin_edges = np.histogram(scaled_variances, bins)

            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Months")
            plt.xlabel("Scaled {} Variances".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(SPREAD_THRESHOLD, c="r")
            plt.axvline(-SPREAD_THRESHOLD, c="r")

            bad_hist, dummy = np.histogram(scaled_variances[bad_years], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Variance {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # variance_check
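
# Illustration only (not called by the QC suite): the run-length counting used in
# variance_check's storm test.  A storm passage shows a long run of falling
# pressure followed by a long run of rising pressure; the toy differences below
# are made up.
def _demo_storm_runs():
    import numpy as np

    diffs = np.array([-1.0] * 12 + [1.0] * 11)    # 12 falling then 11 rising differences
    negs, poss = 0, 0
    biggest_neg, biggest_pos = 0, 0
    for diff in diffs:
        if diff > 0:
            biggest_neg = max(biggest_neg, negs)
            negs = 0
            poss += 1
        else:
            biggest_pos = max(biggest_pos, poss)
            poss = 0
            negs += 1
    # the final (rising) run is still in the counter, as in the check above
    print(biggest_neg, poss)                      # 12 11 -- both exceed the HadISD limit of 10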
def pressure_offset(sealp, stnlp, times, config_file, plots=False, diagnostics=False):
    """
    Flag locations where the difference between station- and sea-level pressure
    falls outside of bounds

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(sealp.data.shape[0])])

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        try:
            average = float(utils.read_qc_config(config_file, "PRESSURE", "average"))
            spread = float(utils.read_qc_config(config_file, "PRESSURE", "spread"))
        except KeyError:
            print("Information missing in config file")
            average = utils.average(difference)
            spread = utils.spread(difference)
            if spread < MIN_SPREAD:
                # less than MIN_SPREAD hPa
                spread = MIN_SPREAD
            elif spread > MAX_SPREAD:
                # more than MAX_SPREAD hPa
                spread = MAX_SPREAD

            utils.write_qc_config(config_file, "PRESSURE", "average", "{}".format(average), diagnostics=diagnostics)
            utils.write_qc_config(config_file, "PRESSURE", "spread", "{}".format(spread), diagnostics=diagnostics)

        if np.abs(np.ma.mean(difference) - np.ma.median(difference)) > THRESHOLD * spread:
            if diagnostics:
                print("Large difference between mean and median")
                print("Likely to have two populations of roughly equal size")
                print("Test won't work")
            pass

        else:
            high, = np.ma.where(difference > (average + (THRESHOLD * spread)))
            low, = np.ma.where(difference < (average - (THRESHOLD * spread)))

            # diagnostic plots
            if plots:
                bins = np.arange(np.round(difference.min()) - 1, np.round(difference.max()) + 1, 0.1)
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(difference.compressed(), bins=bins)
                plt.axvline(x=(average + (THRESHOLD * spread)), ls="--", c="r")
                plt.axvline(x=(average - (THRESHOLD * spread)), ls="--", c="r")
                plt.xlim([bins[0] - 1, bins[-1] + 1])
                plt.ylabel("Observations")
                plt.xlabel("Difference (hPa)")
                plt.show()

            if len(high) != 0:
                flags[high] = "p"
                if diagnostics:
                    print("Pressure {}".format(stnlp.name))
                    print("   Number of high differences {}".format(len(high)))
                if plots:
                    for bad in high:
                        plot_pressure(sealp, stnlp, times, bad)

            if len(low) != 0:
                flags[low] = "p"
                if diagnostics:
                    print("   Number of low differences {}".format(len(low)))
                if plots:
                    for bad in low:
                        plot_pressure(sealp, stnlp, times, bad)

    # only flag the station level pressure
    stnlp.flags = utils.insert_flags(stnlp.flags, flags)

    if diagnostics:
        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # pressure_offset
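
# Illustration only (not called by the QC suite): why pressure_offset skips when
# mean and median disagree strongly.  If the sea-level minus station-level
# differences come from two populations (e.g. a change in reported elevation),
# the mean sits between them while the median sits inside one, so a single
# average +/- spread envelope would be meaningless.  The values are made up.
def _demo_two_populations():
    import numpy as np

    difference = np.ma.concatenate([np.ma.ones(100) * 5.0, np.ma.ones(60) * 25.0])
    print(np.ma.mean(difference), np.ma.median(difference))   # 12.5 vs 5.0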
def monthly_clim(obs_var, station, config_file, logfile="", plots=False, diagnostics=False, winsorize=True):
    """
    Check for observations which fall well outside the climatological distribution
    for each calendar month

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: string for configuration file
    :param str logfile: string for log file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        month_locs, = np.where(station.months == month)

        # note these are for the whole record, just this month is unmasked
        normalised_anomalies = prepare_data(obs_var, station, month, diagnostics=diagnostics, winsorize=winsorize)

        if len(normalised_anomalies.compressed()) >= utils.DATA_COUNT_THRESHOLD:

            bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
            hist, bin_edges = np.histogram(normalised_anomalies.compressed(), bins)

            try:
                upper_threshold = float(utils.read_qc_config(config_file, "CLIMATOLOGICAL-{}".format(obs_var.name), "{}-uthresh".format(month)))
                lower_threshold = float(utils.read_qc_config(config_file, "CLIMATOLOGICAL-{}".format(obs_var.name), "{}-lthresh".format(month)))
            except KeyError:
                print("Information missing in config file")
                find_month_thresholds(obs_var, station, config_file, plots=plots, diagnostics=diagnostics)
                upper_threshold = float(utils.read_qc_config(config_file, "CLIMATOLOGICAL-{}".format(obs_var.name), "{}-uthresh".format(month)))
                lower_threshold = float(utils.read_qc_config(config_file, "CLIMATOLOGICAL-{}".format(obs_var.name), "{}-lthresh".format(month)))

            # now to find the gaps
            uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
            lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

            if uppercount > 0:
                gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

                if gap_start != 0:
                    bad_locs, = np.ma.where(normalised_anomalies > gap_start) # all years for one month

                    # normalised_anomalies are for the whole record, just this month is unmasked
                    flags[bad_locs] = "C"

            if lowercount > 0:
                gap_start = utils.find_gap(hist, bins, lower_threshold, GAP_SIZE, upwards=False)

                if gap_start != 0:
                    bad_locs, = np.ma.where(normalised_anomalies < gap_start) # all years for one month

                    flags[bad_locs] = "C"

            # diagnostic plots
            if plots:
                import matplotlib.pyplot as plt
                plt.clf()
                plt.step(bins[1:], hist, color='k', where="pre")
                plt.yscale("log")

                plt.ylabel("Number of Observations")
                plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
                plt.title("{} - month {}".format(station.id, month))

                plt.ylim([0.1, max(hist) * 2])
                plt.axvline(upper_threshold, c="r")
                plt.axvline(lower_threshold, c="r")

                bad_locs, = np.where(flags[month_locs] == "C")
                bad_hist, dummy = np.histogram(normalised_anomalies[month_locs][bad_locs], bins)
                plt.step(bins[1:], bad_hist, color='r', where="pre")

                plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Climatological {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # monthly_clim
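
# Illustration only (not called by the QC suite): a minimal sketch of the kind
# of gap hunt that utils.find_gap is assumed to perform for monthly_clim and
# all_obs_gap -- scan outwards from the threshold for a run of GAP_SIZE empty
# bins, beyond which any observations form a detached, suspect population.
# The actual logic lives in utils; the histogram and GAP_SIZE here are made up.
def _demo_gap_hunt():
    import numpy as np

    GAP_SIZE = 2
    hist = np.array([5, 9, 14, 7, 3, 1, 0, 0, 0, 2])
    bins = np.arange(-2.0, 3.1, 0.5)              # len(hist) + 1 bin edges
    start = 4                                     # bin holding the upper threshold
    empty = 0
    for b in range(start, len(hist)):
        empty = empty + 1 if hist[b] == 0 else 0
        if empty == GAP_SIZE:
            print("gap starts at", bins[b - GAP_SIZE + 1])   # observations beyond 1.0 are detached
            break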
def all_obs_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for month and find secondary populations in distribution.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var, station, month, config_file, full=False, diagnostics=diagnostics)

        if (len(normalised_anomalies.compressed()) == 1 and normalised_anomalies[0] == utils.MDI):
            # no data to work with for this month, move on
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies, bins)

        try:
            upper_threshold = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-uthresh".format(month)))
            lower_threshold = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-lthresh".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var, station, config_file, plots=plots, diagnostics=diagnostics)
            upper_threshold = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-uthresh".format(month)))
            lower_threshold = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-lthresh".format(month)))

        if upper_threshold == utils.MDI and lower_threshold == utils.MDI:
            # these weren't able to be calculated, move on
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            continue

        # now to find the gaps
        uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
        lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

        month_locs, = np.where(station.months == month) # append should keep year order

        if uppercount > 0:
            gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies > gap_start) # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"
                flags[month_locs] = month_flags

        if lowercount > 0:
            gap_start = utils.find_gap(hist, bins, lower_threshold, GAP_SIZE, upwards=False)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies < gap_start) # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"

                # TODO - can this bit be refactored?
                # for pressure data, see if the flagged obs correspond with high winds
                # could be a storm signal
                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    wind_monthly_data = prepare_monthly_data(station.wind_speed, station, month)
                    pressure_monthly_data = prepare_monthly_data(obs_var, station, month)

                    if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                            len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                        # need sufficient data for the storm check to work, else can't tell
                        pass
                    else:
                        wind_monthly_average = utils.average(wind_monthly_data)
                        wind_monthly_spread = utils.spread(wind_monthly_data)

                        pressure_monthly_average = utils.average(pressure_monthly_data)
                        pressure_monthly_spread = utils.spread(pressure_monthly_data)

                        # already a single calendar month, so go through each year
                        all_years = np.unique(station.years)
                        for year in all_years:

                            # what's best - extract only when necessary but repeatedly if so, or always, but once
                            this_year_locs, = np.where(station.years[month_locs] == year)

                            if "d" not in month_flags[this_year_locs]:
                                # skip if you get the chance
                                continue

                            wind_data = station.wind_speed.data[month_locs][this_year_locs]
                            pressure_data = obs_var.data[month_locs][this_year_locs]

                            storms, = np.ma.where(np.logical_and((((wind_data - wind_monthly_average) / wind_monthly_spread) > STORM_THRESHOLD),
                                                                 (((pressure_monthly_average - pressure_data) / pressure_monthly_spread) > STORM_THRESHOLD)))

                            # more than one entry - check if separate events
                            if len(storms) >= 2:
                                # find where separation more than the usual obs separation
                                storm_1diffs = np.ma.diff(storms)
                                separations, = np.where(storm_1diffs > np.ma.median(np.ma.diff(wind_data)))

                                if len(separations) != 0:
                                    # multiple storm signals
                                    storm_start = 0
                                    storm_finish = separations[0] + 1
                                    first_storm = expand_around_storms(storms[storm_start: storm_finish], len(wind_data))
                                    final_storm_locs = copy.deepcopy(first_storm)

                                    for j in range(len(separations)):
                                        # then do the rest in a loop
                                        if j + 1 == len(separations):
                                            # final one
                                            this_storm = expand_around_storms(storms[separations[j] + 1:], len(wind_data))
                                        else:
                                            this_storm = expand_around_storms(storms[separations[j] + 1: separations[j + 1] + 1], len(wind_data))

                                        final_storm_locs = np.append(final_storm_locs, this_storm)

                                else:
                                    # locations separated at same interval as data
                                    final_storm_locs = expand_around_storms(storms, len(wind_data))

                            # single entry
                            elif len(storms) != 0:
                                # expand around the storm signal (rather than just
                                # unflagging what could be the peak and leaving the
                                # entry/exit flagged)
                                final_storm_locs = expand_around_storms(storms, len(wind_data))

                            # unset the flags
                            if len(storms) > 0:
                                # chained fancy indexing returns a copy, so write
                                # back through a temporary array
                                year_flags = month_flags[this_year_locs]
                                year_flags[final_storm_locs] = ""
                                month_flags[this_year_locs] = year_flags

                # having checked for storms now store final flags
                flags[month_locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")

            bad_locs, = np.where(flags[month_locs] == "d")
            bad_hist, dummy = np.histogram(normalised_anomalies[bad_locs], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Distribution (all) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # all_obs_gap
def prepare_all_data(obs_var, station, month, config_file, full=False, diagnostics=False):
    """
    Extract data for the month, make & store or read average and spread.
    Use to calculate normalised anomalies.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: month to process
    :param str config_file: configuration file to store critical values
    :param bool full: recalculate the scaling parameters and store them
    :param bool diagnostics: turn on diagnostic output
    """

    month_locs, = np.where(station.months == month)

    all_month_data = obs_var.data[month_locs]

    if full:

        if len(all_month_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
            # have data, now to standardise
            climatology = utils.average(all_month_data) # mean
            spread = utils.spread(all_month_data) # IQR currently
        else:
            climatology = utils.MDI
            spread = utils.MDI

        # write out the scaling...
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-clim".format(month), "{}".format(climatology), diagnostics=diagnostics)
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-spread".format(month), "{}".format(spread), diagnostics=diagnostics)

    else:

        try:
            climatology = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-clim".format(month)))
            spread = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-spread".format(month)))
        except KeyError:

            if len(all_month_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
                # have data, now to standardise
                climatology = utils.average(all_month_data) # mean
                spread = utils.spread(all_month_data) # IQR currently
            else:
                climatology = utils.MDI
                spread = utils.MDI

            # write out the scaling...
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-clim".format(month), "{}".format(climatology), diagnostics=diagnostics)
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-spread".format(month), "{}".format(spread), diagnostics=diagnostics)

    if climatology == utils.MDI and spread == utils.MDI:
        # these weren't calculable, move on
        return np.ma.array([utils.MDI]) # prepare_all_data
    elif spread == 0:
        # all the same value
        return (all_month_data - climatology) # prepare_all_data
    else:
        return (all_month_data - climatology) / spread # prepare_all_data
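
# Illustration only (not called by the QC suite): the standardisation applied in
# prepare_all_data, in isolation.  Per the inline comments above, utils.average
# is taken to be the mean and utils.spread the IQR; both stand-ins below are
# assumptions, and the data are made up.
def _demo_standardise():
    import numpy as np

    data = np.ma.array([2.0, 4.0, 6.0, 8.0, 100.0])
    climatology = np.ma.mean(data)                          # mean, as utils.average
    q75, q25 = np.percentile(data.compressed(), [75, 25])
    spread = q75 - q25                                      # IQR, as utils.spread
    print((data - climatology) / spread)                    # the outlier stands out at +19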
def monthly_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use distribution of monthly averages to identify asymmetries.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    all_years = np.unique(station.years)

    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var, station, month, diagnostics=diagnostics)

        # read in the scaling
        try:
            climatology = float(utils.read_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name), "{}-clim".format(month)))
            spread = float(utils.read_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name), "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_monthly_scaling(obs_var, station, config_file, diagnostics=diagnostics)
            climatology = float(utils.read_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name), "{}-clim".format(month)))
            spread = float(utils.read_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name), "{}-spread".format(month)))

        if climatology == utils.MDI and spread == utils.MDI:
            # these weren't calculable, move on
            continue

        standardised_months = (month_averages - climatology) / spread

        bins = utils.create_bins(standardised_months, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(standardised_months, bins)

        # flag months with very large offsets
        bad, = np.where(np.abs(standardised_months) >= LARGE_LIMIT)
        # now follow flag locations back up through the process
        for bad_month_id in bad:
            # year ID for this set of calendar months
            locs, = np.where(np.logical_and(station.months == month, station.years == all_years[bad_month_id]))
            flags[locs] = "D"

        # walk distribution from centre to find asymmetry
        sort_order = standardised_months.argsort()
        mid_point = len(standardised_months) // 2
        good = True
        step = 1
        bad = []
        while good:

            if standardised_months[sort_order][mid_point - step] != standardised_months[sort_order][mid_point + step]:
                suspect_months = [np.abs(standardised_months[sort_order][mid_point - step]),
                                  np.abs(standardised_months[sort_order][mid_point + step])]

                if min(suspect_months) != 0:
                    # not all clustered at origin
                    if max(suspect_months) / min(suspect_months) >= 2. and min(suspect_months) >= 1.5:
                        # at least 1.5x spread from centre and difference of two in location (longer tail)
                        # flag everything further from this bin for that tail
                        if suspect_months[0] == max(suspect_months):
                            # LHS has issue (remember that have removed the sign)
                            bad = sort_order[: mid_point - (step - 1)] # need -1 given array indexing standards
                        elif suspect_months[1] == max(suspect_months):
                            # RHS has issue
                            bad = sort_order[mid_point + step:]
                        good = False

            step += 1
            if (mid_point - step) < 0 or (mid_point + step) == standardised_months.shape[0]:
                # reached end
                break

        # now follow flag locations back up through the process
        for bad_month_id in bad:
            # year ID for this set of calendar months
            locs, = np.where(np.logical_and(station.months == month, station.years == all_years[bad_month_id]))
            flags[locs] = "D"

        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            if len(bad) > 0:
                bad_hist, dummy = np.histogram(standardised_months[bad], bins)
                plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.ylabel("Number of Months")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Distribution (monthly) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # monthly_gap
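
# Illustration only (not called by the QC suite): the asymmetry walk used in
# monthly_gap.  Compare the k-th value either side of the sorted midpoint; a
# tail that is at least twice as far out as its mirror, with both sides at
# least 1.5 spreads from centre, marks everything beyond it.  The standardised
# monthly averages below are made up.
def _demo_asymmetry_walk():
    import numpy as np

    standardised = np.array([-4.2, -1.8, -1.6, 0.0, 1.6, 1.7, 1.9])
    sort_order = standardised.argsort()
    mid_point = len(standardised) // 2
    for step in range(1, mid_point + 1):
        left = np.abs(standardised[sort_order][mid_point - step])
        right = np.abs(standardised[sort_order][mid_point + step])
        lo, hi = min(left, right), max(left, right)
        if lo != 0 and hi / lo >= 2. and lo >= 1.5:
            side = "left" if left > right else "right"
            print("asymmetric {} tail found at step {}".format(side, step))   # left tail, step 3
            break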