# NOTE: these functions are collected from several modules of the same QC
# package.  The imports below cover what is used in this excerpt; the
# package-internal modules (utils, io, setup) and the helpers/constants that
# are referenced but defined alongside these functions in the original files
# (e.g. prepare_monthly_data, low_pass_filter, expand_around_storms,
# plot_neighbour_flags, plot_pressure, VALID_MONTHS, SPREAD_LIMIT, MIN_SPREAD,
# MAX_SPREAD, MIN_VARIANCES, MIN_VALUES, SPREAD_THRESHOLD, STORM_THRESHOLD,
# THRESHOLD, BIN_WIDTH, FREQUENCY_THRESHOLD, GAP_SIZE) are assumed available.
import copy
import os

import numpy as np
from scipy.stats import skew


def find_monthly_scaling(obs_var, station, config_file, diagnostics=False):
    """
    Find scaling parameters for monthly values and store in config file

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool diagnostics: turn on diagnostic output
    """
    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var, station, month,
                                              diagnostics=diagnostics)

        if len(month_averages.compressed()) >= VALID_MONTHS:

            # have months, now to standardise
            climatology = utils.average(month_averages)  # mean
            spread = utils.spread(month_averages)  # IQR currently
            if spread < SPREAD_LIMIT:
                spread = SPREAD_LIMIT

            # write out the scaling...
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-clim".format(month), "{}".format(climatology),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-spread".format(month), "{}".format(spread),
                                  diagnostics=diagnostics)
        else:
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-clim".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-spread".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)

    return  # find_monthly_scaling

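
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the standardisation
# used above in miniature, i.e. a mean climatology plus an IQR-based spread
# with a floor.  The 0.1 floor and the synthetic data are assumptions standing
# in for SPREAD_LIMIT and for real monthly averages.
def _demo_monthly_scaling():
    rng = np.random.default_rng(42)
    month_averages = np.ma.masked_invalid(rng.normal(10.0, 2.0, 60))

    climatology = np.ma.mean(month_averages)  # utils.average is a mean
    q25, q75 = np.percentile(month_averages.compressed(), [25, 75])
    spread = max(q75 - q25, 0.1)  # utils.spread is currently an IQR; 0.1 = assumed floor

    # standardised values, as consumed by the distribution checks downstream
    return (month_averages - climatology) / spread
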
def find_thresholds(obs_var, station, config_file, plots=False,
                    diagnostics=False, winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    # get hourly climatology for each month
    for month in range(1, 13):

        variances = prepare_data(obs_var, station, month,
                                 diagnostics=diagnostics, winsorize=winsorize)

        if len(variances.compressed()) >= MIN_VARIANCES:
            average_variance = utils.average(variances)
            variance_spread = utils.spread(variances)
        else:
            average_variance = utils.MDI
            variance_spread = utils.MDI

        utils.write_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                              "{}-average".format(month), "{}".format(average_variance),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                              "{}-spread".format(month), "{}".format(variance_spread),
                              diagnostics=diagnostics)

    return  # find_thresholds

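
# ---------------------------------------------------------------------------
# Illustrative sketch: the store-statistics-or-MDI fallback pattern used in
# find_thresholds above.  The sentinel value and the minimum count are
# assumptions standing in for utils.MDI and MIN_VARIANCES.
def _demo_stats_or_mdi(values, mdi=-1.e30, min_count=10):
    values = np.ma.asarray(values)
    if len(values.compressed()) >= min_count:
        compressed = values.compressed()
        # mean and IQR, mirroring utils.average / utils.spread
        return np.ma.mean(values), np.subtract(*np.percentile(compressed, [75, 25]))
    return mdi, mdi
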
def identify_values(sealp, stnlp, times, config_file, plots=False, diagnostics=False):
    """
    Find average and spread of differences

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:
        average = utils.average(difference)
        spread = utils.spread(difference)
        if spread < MIN_SPREAD:  # less than X hPa
            spread = MIN_SPREAD
        elif spread > MAX_SPREAD:  # more than X hPa
            spread = MAX_SPREAD

        utils.write_qc_config(config_file, "PRESSURE", "average",
                              "{}".format(average), diagnostics=diagnostics)
        utils.write_qc_config(config_file, "PRESSURE", "spread",
                              "{}".format(spread), diagnostics=diagnostics)

    return  # identify_values

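
# ---------------------------------------------------------------------------
# Illustrative note: the two-sided clamping of `spread` in identify_values is
# equivalent to a single np.clip call; the bounds of 0.5 and 3.0 hPa here are
# assumptions standing in for MIN_SPREAD / MAX_SPREAD.
#
#     spread = float(np.clip(spread, 0.5, 3.0))
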
def neighbour_outlier(target_station, initial_neighbours, variable,
                      diagnostics=False, plots=False, full=False):
    """
    Works on a single station and variable.  Reads in neighbours' data and
    finds locations where sufficient neighbours are sufficiently different.

    :param Station target_station: station to run on
    :param array initial_neighbours: input neighbours (ID, distance) pairs
    :param str variable: obs variable being run on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values.
    """
    station_list = utils.get_station_list()

    # check if sufficient neighbours (first entry is the target itself)
    n_neighbours = len(np.where(initial_neighbours[:, 0] != "-")[0]) - 1
    if n_neighbours < utils.MIN_NEIGHBOURS:
        print("{} has insufficient neighbours ({}<{})".format(
            target_station.id, n_neighbours, utils.MIN_NEIGHBOURS))
    else:
        #*************************
        # extract target observations
        obs_var = getattr(target_station, variable)
        flags = np.array(["" for i in range(obs_var.data.shape[0])]).astype("<U10")

        #*************************
        # read in the neighbour (buddy) data
        all_buddy_data = np.ma.zeros([len(initial_neighbours[:, 0]),
                                      len(target_station.times)])
        all_buddy_data.mask = np.ones(all_buddy_data.shape)

        for bid, buddy_id in enumerate(initial_neighbours[:, 0]):
            if buddy_id == target_station.id:
                # first entry is self
                continue
            if buddy_id == "-":
                # end of the list of buddies
                break

            if diagnostics:
                print("{}/{} {}".format(bid, len(initial_neighbours[:, 0]), buddy_id))

            # set up station object to hold information
            buddy_idx, = np.where(station_list.id == buddy_id)
            buddy = utils.Station(buddy_id,
                                  station_list.iloc[buddy_idx].latitude.values[0],
                                  station_list.iloc[buddy_idx].longitude.values[0],
                                  station_list.iloc[buddy_idx].elevation.values[0])

            try:
                buddy, buddy_df = io.read_station(
                    os.path.join(setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(buddy_id)),
                    buddy, read_flags=True)

                buddy_var = getattr(buddy, variable)

                # apply flags
                flag_locs, = np.where(buddy_var.flags != "")
                buddy_var.data.mask[flag_locs] = True

            except OSError:
                # file missing, move on to next in sequence
                io.write_error(target_station,
                               "File Missing (Buddy, {}) - {}".format(variable, buddy_id))
                continue
            except ValueError as e:
                # some issue in the raw file
                io.write_error(target_station,
                               "Error in input file (Buddy, {}) - {}".format(variable, buddy_id),
                               error=str(e))
                continue

            # match the timestamps of target_station and copy over
            match = np.in1d(target_station.times, buddy.times)
            match_back = np.in1d(buddy.times, target_station.times)

            if True in match and True in match_back:
                # skip if no overlapping times at all!
                all_buddy_data[bid, match] = buddy_var.data[match_back]

        if diagnostics:
            print("All buddies read in")

        #*************************
        # find differences
        differences = all_buddy_data - obs_var.data

        #*************************
        # find spread of differences on monthly basis (with minimum value)
        spreads = np.ma.zeros(differences.shape)

        for month in range(1, 13):
            month_locs = np.where(target_station.months == month)

            for bid, buddy in enumerate(differences):
                if len(differences[bid, month_locs].compressed()) > utils.DATA_COUNT_THRESHOLD:
                    this_spread = utils.spread(differences[bid, month_locs])
                    if this_spread < MIN_SPREAD:
                        spreads[bid, month_locs] = MIN_SPREAD
                    else:
                        spreads[bid, month_locs] = this_spread
                else:
                    spreads[bid, month_locs] = MIN_SPREAD

        spreads.mask = np.copy(differences.mask)

        # store which entries may be sufficient to flag
        dubious = np.ma.zeros(differences.shape)
        dubious.mask = np.copy(differences.mask)

        #*************************
        # adjust for storms
        if variable in ["sea_level_pressure", "station_level_pressure"]:
            distant, = np.where(initial_neighbours[:, 1].astype(int) > 100)
            if len(distant) > 0:
                # find positive and negative differences across neighbours
                # (note the sign: negative excursions beyond the spread)
                positive = np.ma.where(differences[distant] > spreads[distant] * SPREAD_LIMIT)
                negative = np.ma.where(differences[distant] < -spreads[distant] * SPREAD_LIMIT)

                # spin through each neighbour
                for dn, dist_neigh in enumerate(distant):
                    pos, = np.where(positive[0] == dn)
                    neg, = np.where(negative[0] == dn)

                    if len(neg) > 0:
                        ratio = len(neg) / (len(pos) + len(neg))
                        if ratio > 0.667:
                            # majority negative, only flag the positives [definitely not storms]
                            dubious[dist_neigh, positive[1][pos]] = 1
            else:
                # all stations close by so storms shouldn't affect, include all
                # note where differences exceed the spread
                dubious_locs = np.ma.where(np.ma.abs(differences) > spreads * SPREAD_LIMIT)
                dubious[dubious_locs] = 1
        else:
            #*************************
            # note where differences exceed the spread [all non-pressure variables]
            dubious_locs = np.ma.where(np.ma.abs(differences) > spreads * SPREAD_LIMIT)
            dubious[dubious_locs] = 1

        if diagnostics:
            print("cross checks complete - assessing all outcomes")

        #*************************
        # sum across neighbours
        neighbour_count = np.ma.count(differences, axis=0)
        dubious_count = np.ma.sum(dubious, axis=0)

        # flag if large enough fraction (>0.66)
        sufficient, = np.ma.where(dubious_count > 0.66 * neighbour_count)
        flags[sufficient] = "N"

        if plots:
            for flag in sufficient:
                plot_neighbour_flags(target_station.times, flag, obs_var, all_buddy_data)

        # append flags to object
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:
            print("Neighbour Outlier {}".format(obs_var.name))
            print("   Cumulative number of flags set: {}".format(
                len(np.where(flags != "")[0])))

    return  # neighbour_outlier

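
# ---------------------------------------------------------------------------
# Illustrative sketch: the two-way np.in1d timestamp matching used above when
# copying buddy data onto the target station's time axis.  The integer
# "times" and toy values are assumptions for the demonstration.
def _demo_time_match():
    target_times = np.array([1, 2, 3, 4, 5])
    buddy_times = np.array([2, 4, 5, 6])
    buddy_values = np.array([20., 40., 50., 60.])

    match = np.in1d(target_times, buddy_times)       # target slots to fill
    match_back = np.in1d(buddy_times, target_times)  # buddy values to take

    aligned = np.ma.masked_all(target_times.shape)
    aligned[match] = buddy_values[match_back]
    return aligned  # [--, 20.0, --, 40.0, 50.0]
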
def prepare_data(obs_var, station, month, diagnostics=False, winsorize=True):
    """
    Calculate the monthly variances

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: which month to run on
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    anomalies = np.ma.zeros(obs_var.data.shape[0])
    anomalies.mask = np.ones(anomalies.shape[0])
    normed_anomalies = np.ma.copy(anomalies)

    mlocs, = np.where(station.months == month)
    anomalies.mask[mlocs] = False
    normed_anomalies.mask[mlocs] = False

    hourly_clims = np.ma.zeros(24)
    hourly_clims.mask = np.ones(24)
    for hour in range(24):

        # calculate climatology
        hlocs, = np.where(np.logical_and(station.months == month, station.hours == hour))

        hour_data = obs_var.data[hlocs]

        if winsorize:
            if len(hour_data.compressed()) > 10:
                hour_data = utils.winsorize(hour_data, 5)

        if len(hour_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
            hourly_clims[hour] = np.ma.mean(hour_data)
            hourly_clims.mask[hour] = False

        # make anomalies - keeping the order
        anomalies[hlocs] = obs_var.data[hlocs] - hourly_clims[hour]

    if len(anomalies[mlocs].compressed()) >= MIN_VARIANCES:
        # for the month, normalise anomalies by spread (with a floor)
        spread = utils.spread(anomalies[mlocs])
        if spread < 1.5:
            spread = 1.5
    else:
        spread = 1.5

    normed_anomalies[mlocs] = anomalies[mlocs] / spread

    # calculate the variance for each year in this single month.
    all_years = np.unique(station.years)

    variances = np.ma.zeros(all_years.shape[0])
    variances.mask = np.ones(all_years.shape[0])
    for y, year in enumerate(all_years):

        ymlocs, = np.where(np.logical_and(station.months == month, station.years == year))
        this_year = normed_anomalies[ymlocs]

        # HadISD used M.A.D.
        if this_year.compressed().shape[0] > MIN_VALUES:
            variances[y] = utils.spread(this_year)

    return variances  # prepare_data

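
# ---------------------------------------------------------------------------
# Illustrative note: utils.winsorize(hour_data, 5) caps the tails at the
# 5th/95th percentiles before the hourly climatology is taken.  An analogous
# call for masked arrays exists in scipy (shown as an analogy only, not the
# package helper itself):
#
#     from scipy.stats.mstats import winsorize
#     capped = winsorize(hour_data, limits=(0.05, 0.05))
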
def variance_check(obs_var, station, config_file, plots=False,
                   diagnostics=False, winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    # get hourly climatology for each month
    for month in range(1, 13):

        month_locs, = np.where(station.months == month)

        variances = prepare_data(obs_var, station, month,
                                 diagnostics=diagnostics, winsorize=winsorize)

        try:
            average_variance = float(utils.read_qc_config(
                config_file, "VARIANCE-{}".format(obs_var.name), "{}-average".format(month)))
            variance_spread = float(utils.read_qc_config(
                config_file, "VARIANCE-{}".format(obs_var.name), "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var, station, config_file, plots=plots,
                            diagnostics=diagnostics)
            average_variance = float(utils.read_qc_config(
                config_file, "VARIANCE-{}".format(obs_var.name), "{}-average".format(month)))
            variance_spread = float(utils.read_qc_config(
                config_file, "VARIANCE-{}".format(obs_var.name), "{}-spread".format(month)))

        if average_variance == utils.MDI and variance_spread == utils.MDI:
            # couldn't be calculated, move on
            continue

        bad_years, = np.where(
            np.abs(variances - average_variance) / variance_spread > SPREAD_THRESHOLD)

        # prepare wind and pressure data in case needed to check for storms
        if obs_var.name in ["station_level_pressure", "sea_level_pressure", "wind_speed"]:
            wind_monthly_data = station.wind_speed.data[month_locs]
            if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                pressure_monthly_data = obs_var.data[month_locs]
            else:
                pressure_monthly_data = station.sea_level_pressure.data[month_locs]

            if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                    len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                # need sufficient data for the storm check to work, else can't tell
                # move on
                continue

            wind_average = utils.average(wind_monthly_data)
            wind_spread = utils.spread(wind_monthly_data)

            pressure_average = utils.average(pressure_monthly_data)
            pressure_spread = utils.spread(pressure_monthly_data)

        # go through each bad year for this month
        all_years = np.unique(station.years)
        for year in bad_years:  # bad_years holds indices into all_years

            # corresponding locations
            ym_locs, = np.where(np.logical_and(station.months == month,
                                               station.years == all_years[year]))

            # if pressure or wind speed, need further checking before applying flags
            if obs_var.name in ["station_level_pressure", "sea_level_pressure", "wind_speed"]:

                # pull out the data
                wind_data = station.wind_speed.data[ym_locs]
                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    pressure_data = obs_var.data[ym_locs]
                else:
                    pressure_data = station.sea_level_pressure.data[ym_locs]

                # need sufficient data for the storm check to work, else can't tell
                if len(pressure_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                        len(wind_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                    # move on
                    continue

                # find locations of high wind speeds and low pressures, cross match
                high_winds, = np.ma.where(
                    (wind_data - wind_average) / wind_spread > STORM_THRESHOLD)
                low_pressures, = np.ma.where(
                    (pressure_average - pressure_data) / pressure_spread > STORM_THRESHOLD)

                match = np.in1d(high_winds, low_pressures)

                couldbe_storm = False
                if len(match) > 0:
                    # this could be a storm, either at a tropical station (relatively
                    # constant pressure) or out of season in mid-latitudes.
                    couldbe_storm = True

                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    diffs = np.ma.diff(pressure_data)
                elif obs_var.name == "wind_speed":
                    diffs = np.ma.diff(wind_data)

                # count up the largest number of sequential negative and positive differences
                negs, poss = 0, 0
                biggest_neg, biggest_pos = 0, 0

                for diff in diffs:
                    if diff > 0:
                        if negs > biggest_neg:
                            biggest_neg = negs
                        negs = 0
                        poss += 1
                    else:
                        if poss > biggest_pos:
                            biggest_pos = poss
                        poss = 0
                        negs += 1

                if (biggest_neg < 10) and (biggest_pos < 10) and not couldbe_storm:
                    # insufficient to identify as a storm (HadISD values)
                    # leave flags set
                    pass
                else:
                    # could be a storm, so better to leave this month unflagged
                    # zero length array to flag
                    ym_locs = np.ma.array([])

            # copy over the flags, if any
            if len(ym_locs) != 0:
                # and set the flags
                flags[ym_locs] = "V"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            scaled_variances = (variances - average_variance) / variance_spread
            bins = utils.create_bins(scaled_variances, 0.25, obs_var.name)
            hist, bin_edges = np.histogram(scaled_variances, bins)

            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Months")
            plt.xlabel("Scaled {} Variances".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(SPREAD_THRESHOLD, c="r")
            plt.axvline(-SPREAD_THRESHOLD, c="r")

            bad_hist, dummy = np.histogram(scaled_variances[bad_years], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Variance {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # variance_check

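
# ---------------------------------------------------------------------------
# Illustrative sketch: the storm heuristic above tracks the longest runs of
# consecutive positive and negative first-differences.  This compact
# equivalent also counts a run that ends at the final element.
def _longest_runs(diffs):
    negs = poss = biggest_neg = biggest_pos = 0
    for diff in np.asarray(diffs):
        if diff > 0:
            biggest_neg = max(biggest_neg, negs)
            negs, poss = 0, poss + 1
        else:
            biggest_pos = max(biggest_pos, poss)
            poss, negs = 0, negs + 1
    return max(biggest_neg, negs), max(biggest_pos, poss)
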
def pressure_offset(sealp, stnlp, times, config_file, plots=False, diagnostics=False):
    """
    Flag locations where difference between station and sea-level pressure
    falls outside of bounds

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(sealp.data.shape[0])])

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        try:
            average = float(utils.read_qc_config(config_file, "PRESSURE", "average"))
            spread = float(utils.read_qc_config(config_file, "PRESSURE", "spread"))
        except KeyError:
            print("Information missing in config file")
            average = utils.average(difference)
            spread = utils.spread(difference)
            if spread < MIN_SPREAD:  # less than X hPa
                spread = MIN_SPREAD
            elif spread > MAX_SPREAD:  # more than X hPa
                spread = MAX_SPREAD

            utils.write_qc_config(config_file, "PRESSURE", "average",
                                  "{}".format(average), diagnostics=diagnostics)
            utils.write_qc_config(config_file, "PRESSURE", "spread",
                                  "{}".format(spread), diagnostics=diagnostics)

        if np.abs(np.ma.mean(difference) - np.ma.median(difference)) > THRESHOLD * spread:
            if diagnostics:
                print("Large difference between mean and median")
                print("Likely to have two populations of roughly equal size")
                print("Test won't work")
        else:
            high, = np.ma.where(difference > (average + (THRESHOLD * spread)))
            low, = np.ma.where(difference < (average - (THRESHOLD * spread)))

            # diagnostic plots
            if plots:
                bins = np.arange(np.round(difference.min()) - 1,
                                 np.round(difference.max()) + 1, 0.1)
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(difference.compressed(), bins=bins)
                plt.axvline(x=(average + (THRESHOLD * spread)), ls="--", c="r")
                plt.axvline(x=(average - (THRESHOLD * spread)), ls="--", c="r")
                plt.xlim([bins[0] - 1, bins[-1] + 1])
                plt.ylabel("Observations")
                plt.xlabel("Difference (hPa)")
                plt.show()

            if len(high) != 0:
                flags[high] = "p"
                if diagnostics:
                    print("Pressure {}".format(stnlp.name))
                    print("   Number of high differences {}".format(len(high)))
                if plots:
                    for bad in high:
                        plot_pressure(sealp, stnlp, times, bad)

            if len(low) != 0:
                flags[low] = "p"
                if diagnostics:
                    print("   Number of low differences {}".format(len(low)))
                if plots:
                    for bad in low:
                        plot_pressure(sealp, stnlp, times, bad)

    # only flag the station level pressure
    stnlp.flags = utils.insert_flags(stnlp.flags, flags)

    if diagnostics:
        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # pressure_offset

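
# ---------------------------------------------------------------------------
# Illustrative sketch: the mean/median comparison in pressure_offset is a
# cheap bimodality guard.  With two populations of roughly (but not exactly)
# equal size, the median stays inside the larger population while the mean is
# pulled towards the other, so their scaled gap grows.  A helper showing the
# scaled gap (assumes a non-zero IQR):
def _mean_median_gap(values):
    values = np.asarray(values, dtype=float)
    iqr = np.subtract(*np.percentile(values, [75, 25]))
    return abs(values.mean() - np.median(values)) / iqr
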
# NOTE: a second prepare_data variant, from the climatological-check module
# in the original package (the duplicate name is retained from the source files).
def prepare_data(obs_var, station, month, diagnostics=False, winsorize=True):
    """
    Prepare the data for the climatological check.  Makes anomalies and
    applies low-pass filter

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: which month to run on
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    anomalies = np.ma.zeros(obs_var.data.shape[0])
    anomalies.mask = np.ones(anomalies.shape[0])
    normed_anomalies = np.ma.copy(anomalies)

    mlocs, = np.where(station.months == month)
    nyears = len(np.unique(station.years[mlocs]))

    # need to have some data and in at least 5 years!
    if len(mlocs) >= utils.DATA_COUNT_THRESHOLD and nyears >= 5:

        anomalies.mask[mlocs] = False
        normed_anomalies.mask[mlocs] = False

        hourly_clims = np.ma.zeros(24)
        hourly_clims.mask = np.ones(24)
        for hour in range(24):

            # calculate climatology
            hlocs, = np.where(np.logical_and(station.months == month, station.hours == hour))

            hour_data = obs_var.data[hlocs]

            if winsorize:
                if len(hour_data.compressed()) > 10:
                    hour_data = utils.winsorize(hour_data, 5)

            # count only unmasked values when checking for sufficient data
            if len(hour_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
                hourly_clims[hour] = np.ma.mean(hour_data)
                hourly_clims.mask[hour] = False

            # make anomalies - keeping the order
            anomalies[hlocs] = obs_var.data[hlocs] - hourly_clims[hour]

        # if insufficient data at each hour, then no anomalies calculated
        if len(anomalies[mlocs].compressed()) >= utils.DATA_COUNT_THRESHOLD:

            # for the month, normalise anomalies by spread (with a floor)
            spread = utils.spread(anomalies[mlocs])
            if spread < 1.5:
                spread = 1.5

            normed_anomalies[mlocs] = anomalies[mlocs] / spread

            # apply low pass filter derived from monthly values
            all_years = np.unique(station.years)
            monthly_anoms = np.ma.zeros(all_years.shape[0])
            for y, year in enumerate(all_years):

                ylocs, = np.where(station.years == year)
                year_data = obs_var.data[ylocs]

                monthly_anoms[y] = utils.average(year_data)

            lp_filtered_anomalies = low_pass_filter(normed_anomalies, station,
                                                    monthly_anoms, month)

            return lp_filtered_anomalies  # prepare_data

        else:
            return anomalies  # prepare_data

    else:
        return anomalies  # prepare_data

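
# ---------------------------------------------------------------------------
# Illustrative sketch: removing the diurnal cycle with hour-of-day
# climatologies, as prepare_data does before normalising.  The synthetic
# series (30 days of hourly observations) is an assumption for the demo.
def _demo_hourly_anomalies():
    rng = np.random.default_rng(1)
    hours = np.tile(np.arange(24), 30)
    data = 10 + 5 * np.sin(2 * np.pi * hours / 24) + rng.normal(0, 1, hours.size)

    anomalies = np.empty_like(data)
    for hour in range(24):
        hlocs, = np.where(hours == hour)
        anomalies[hlocs] = data[hlocs] - data[hlocs].mean()
    return anomalies  # diurnal cycle removed, noise remains
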
def find_month_thresholds(obs_var, station, config_file, plots=False,
                          diagnostics=False, winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    # get hourly climatology for each month
    for month in range(1, 13):

        normalised_anomalies = prepare_data(obs_var, station, month,
                                            diagnostics=diagnostics, winsorize=winsorize)

        if len(normalised_anomalies.compressed()) >= utils.DATA_COUNT_THRESHOLD:

            bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
            hist, bin_edges = np.histogram(normalised_anomalies.compressed(), bins)

            gaussian_fit = utils.fit_gaussian(bins[1:], hist, max(hist),
                                              mu=bins[np.argmax(hist)],
                                              sig=utils.spread(normalised_anomalies))

            fitted_curve = utils.gaussian(bins[1:], gaussian_fit)

            # diagnostic plots
            if plots:
                import matplotlib.pyplot as plt
                plt.clf()
                plt.step(bins[1:], hist, color='k', where="pre")
                plt.yscale("log")

                plt.ylabel("Number of Observations")
                plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
                plt.title("{} - month {}".format(station.id, month))

                plt.plot(bins[1:], fitted_curve)
                plt.ylim([0.1, max(hist) * 2])

            # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
            # (IndexError if the selection is empty, so fall back to the outermost bin)
            try:
                lower_threshold = bins[1:][np.where(np.logical_and(
                    fitted_curve < FREQUENCY_THRESHOLD, bins[1:] < 0))[0]][-1]
            except IndexError:
                lower_threshold = bins[1]
            try:
                upper_threshold = bins[1:][np.where(np.logical_and(
                    fitted_curve < FREQUENCY_THRESHOLD, bins[1:] > 0))[0]][0]
            except IndexError:
                upper_threshold = bins[-1]

            if plots:
                plt.axvline(upper_threshold, c="r")
                plt.axvline(lower_threshold, c="r")
                plt.show()

            utils.write_qc_config(config_file, "CLIMATOLOGICAL-{}".format(obs_var.name),
                                  "{}-uthresh".format(month), "{}".format(upper_threshold),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "CLIMATOLOGICAL-{}".format(obs_var.name),
                                  "{}-lthresh".format(month), "{}".format(lower_threshold),
                                  diagnostics=diagnostics)

    return  # find_month_thresholds

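
# ---------------------------------------------------------------------------
# Illustrative sketch: the threshold search above takes, on each side of
# zero, the innermost bin where the fitted curve has dropped below the
# frequency floor, falling back to the edge bins when nothing qualifies.
# Equivalent logic on arbitrary inputs (the 0.1 floor is an assumption
# standing in for FREQUENCY_THRESHOLD).
def _demo_curve_thresholds(bin_centres, curve, floor=0.1):
    low, = np.where((curve < floor) & (bin_centres < 0))
    high, = np.where((curve < floor) & (bin_centres > 0))
    lower = bin_centres[low[-1]] if len(low) else bin_centres[0]
    upper = bin_centres[high[0]] if len(high) else bin_centres[-1]
    return lower, upper
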
def all_obs_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for month and find secondary populations in distribution.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var, station, month, config_file,
                                                full=False, diagnostics=diagnostics)

        if (len(normalised_anomalies.compressed()) == 1 and
                normalised_anomalies[0] == utils.MDI):
            # no data to work with for this month, move on.
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies, bins)

        try:
            upper_threshold = float(utils.read_qc_config(
                config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                "{}-uthresh".format(month)))
            lower_threshold = float(utils.read_qc_config(
                config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                "{}-lthresh".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var, station, config_file, plots=plots,
                            diagnostics=diagnostics)
            upper_threshold = float(utils.read_qc_config(
                config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                "{}-uthresh".format(month)))
            lower_threshold = float(utils.read_qc_config(
                config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                "{}-lthresh".format(month)))

        if upper_threshold == utils.MDI and lower_threshold == utils.MDI:
            # these weren't able to be calculated, move on
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            continue

        # now to find the gaps
        uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
        lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

        month_locs, = np.where(station.months == month)  # append should keep year order

        if uppercount > 0:
            gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies > gap_start)  # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"
                flags[month_locs] = month_flags

        if lowercount > 0:
            gap_start = utils.find_gap(hist, bins, lower_threshold, GAP_SIZE,
                                       upwards=False)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies < gap_start)  # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"

                # TODO - can this bit be refactored?
                # for pressure data, see if the flagged obs correspond with high winds
                # could be a storm signal
                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    wind_monthly_data = prepare_monthly_data(station.wind_speed,
                                                             station, month)
                    pressure_monthly_data = prepare_monthly_data(obs_var, station, month)

                    if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                            len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                        # need sufficient data for the storm check to work, else can't tell
                        pass
                    else:
                        wind_monthly_average = utils.average(wind_monthly_data)
                        wind_monthly_spread = utils.spread(wind_monthly_data)
                        pressure_monthly_average = utils.average(pressure_monthly_data)
                        pressure_monthly_spread = utils.spread(pressure_monthly_data)

                        # already a single calendar month, so go through each year
                        all_years = np.unique(station.years)
                        for year in all_years:

                            # what's best - extract only when necessary but repeatedly
                            # if so, or always, but once
                            this_year_locs = np.where(station.years[month_locs] == year)

                            if "d" not in month_flags[this_year_locs]:
                                # skip if you get the chance
                                continue

                            wind_data = station.wind_speed.data[month_locs][this_year_locs]
                            pressure_data = obs_var.data[month_locs][this_year_locs]

                            storms, = np.ma.where(np.logical_and(
                                ((wind_data - wind_monthly_average) / wind_monthly_spread) > STORM_THRESHOLD,
                                ((pressure_monthly_average - pressure_data) / pressure_monthly_spread) > STORM_THRESHOLD))

                            # more than one entry - check if separate events
                            if len(storms) >= 2:
                                # find where separation more than the usual obs separation
                                storm_1diffs = np.ma.diff(storms)
                                separations, = np.where(
                                    storm_1diffs > np.ma.median(np.ma.diff(wind_data)))

                                if len(separations) != 0:
                                    # multiple storm signals
                                    storm_start = 0
                                    storm_finish = separations[0] + 1
                                    first_storm = expand_around_storms(
                                        storms[storm_start:storm_finish], len(wind_data))
                                    final_storm_locs = copy.deepcopy(first_storm)

                                    for j in range(len(separations)):
                                        # then do the rest in a loop
                                        if j + 1 == len(separations):
                                            # final one
                                            this_storm = expand_around_storms(
                                                storms[separations[j] + 1:], len(wind_data))
                                        else:
                                            this_storm = expand_around_storms(
                                                storms[separations[j] + 1: separations[j + 1] + 1],
                                                len(wind_data))

                                        final_storm_locs = np.append(final_storm_locs,
                                                                     this_storm)
                                else:
                                    # locations separated at same interval as data
                                    final_storm_locs = expand_around_storms(
                                        storms, len(wind_data))

                            # single entry
                            elif len(storms) != 0:
                                # expand around the storm signal (rather than just
                                # unflagging what could be the peak and leaving the
                                # entry/exit flagged)
                                final_storm_locs = expand_around_storms(
                                    storms, len(wind_data))

                            # unset the flags
                            if len(storms) > 0:
                                # chained fancy indexing assigns into a copy,
                                # so write the result back explicitly
                                year_flags = month_flags[this_year_locs]
                                year_flags[final_storm_locs] = ""
                                month_flags[this_year_locs] = year_flags

                # having checked for storms now store final flags
                flags[month_locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")

            bad_locs, = np.where(flags[month_locs] == "d")
            bad_hist, dummy = np.histogram(normalised_anomalies[bad_locs], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Distribution (all) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # all_obs_gap

# NOTE: a second find_thresholds variant, from the distribution-check module
# in the original package (the duplicate name is retained from the source files).
def find_thresholds(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for month and find thresholds in distribution and store.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var, station, month, config_file,
                                                full=True, diagnostics=diagnostics)

        if (len(normalised_anomalies.compressed()) == 1 and
                normalised_anomalies[0] == utils.MDI):
            # scaling not possible for this month
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-uthresh".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-lthresh".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-uthresh".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-lthresh".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies, bins)

        gaussian_fit = utils.fit_gaussian(bins[1:], hist, max(hist),
                                          mu=bins[np.argmax(hist)],
                                          sig=utils.spread(normalised_anomalies),
                                          skew=skew(normalised_anomalies.compressed()))

        fitted_curve = utils.skew_gaussian(bins[1:], gaussian_fit)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.plot(bins[1:], fitted_curve)
            plt.ylim([0.1, max(hist) * 2])

        # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
        # (IndexError if the selection is empty, so fall back to the outermost bin)
        try:
            lower_threshold = bins[1:][np.where(np.logical_and(
                fitted_curve < FREQUENCY_THRESHOLD,
                bins[1:] < bins[np.argmax(fitted_curve)]))[0]][-1]
        except IndexError:
            lower_threshold = bins[1]
        try:
            if len(np.unique(fitted_curve)) == 1:
                # just a line of zeros perhaps
                # (found on AFA00409906 station_level_pressure 20190913)
                upper_threshold = bins[-1]
            else:
                upper_threshold = bins[1:][np.where(np.logical_and(
                    fitted_curve < FREQUENCY_THRESHOLD,
                    bins[1:] > bins[np.argmax(fitted_curve)]))[0]][0]
        except IndexError:
            upper_threshold = bins[-1]

        if plots:
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")
            plt.show()

        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-uthresh".format(month), "{}".format(upper_threshold),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-lthresh".format(month), "{}".format(lower_threshold),
                              diagnostics=diagnostics)

    return  # find_thresholds

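
# ---------------------------------------------------------------------------
# Illustrative note: utils.fit_gaussian / utils.skew_gaussian are package
# helpers.  An analogous skew-normal fit can be written with scipy (names and
# starting values here are illustrative, not the package API):
#
#     from scipy.optimize import curve_fit
#     from scipy.stats import skewnorm
#
#     def skew_curve(x, amp, loc, scale, a):
#         return amp * skewnorm.pdf(x, a, loc=loc, scale=scale)
#
#     popt, _ = curve_fit(skew_curve, bins[1:], hist,
#                         p0=[hist.max(), bins[np.argmax(hist)], 1.0, 0.0])
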
def prepare_all_data(obs_var, station, month, config_file, full=False, diagnostics=False):
    """
    Extract data for the month, make & store or read average and spread.
    Use to calculate normalised anomalies.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: month to process
    :param str config_file: configuration file to store critical values
    :param bool full: recalculate and store the scaling rather than reading it
    :param bool diagnostics: turn on diagnostic output
    """
    month_locs, = np.where(station.months == month)

    all_month_data = obs_var.data[month_locs]

    if full:
        if len(all_month_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
            # have data, now to standardise
            climatology = utils.average(all_month_data)  # mean
            spread = utils.spread(all_month_data)  # IQR currently
        else:
            climatology = utils.MDI
            spread = utils.MDI

        # write out the scaling...
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-clim".format(month), "{}".format(climatology),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-spread".format(month), "{}".format(spread),
                              diagnostics=diagnostics)
    else:
        try:
            climatology = float(utils.read_qc_config(
                config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                "{}-clim".format(month)))
            spread = float(utils.read_qc_config(
                config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                "{}-spread".format(month)))
        except KeyError:
            if len(all_month_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
                # have data, now to standardise
                climatology = utils.average(all_month_data)  # mean
                spread = utils.spread(all_month_data)  # IQR currently
            else:
                climatology = utils.MDI
                spread = utils.MDI

            # write out the scaling...
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-clim".format(month), "{}".format(climatology),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-spread".format(month), "{}".format(spread),
                                  diagnostics=diagnostics)

    if climatology == utils.MDI and spread == utils.MDI:
        # these weren't calculable, move on
        return np.ma.array([utils.MDI])
    elif spread == 0:
        # all the same value
        return all_month_data - climatology  # prepare_all_data
    else:
        return (all_month_data - climatology) / spread  # prepare_all_data

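
# ---------------------------------------------------------------------------
# Illustrative sketch: the read-or-recompute pattern prepare_all_data applies
# to the config file, in miniature, with a plain dict standing in for the
# config store.  The sentinel and minimum count are assumptions mirroring the
# MDI checks above.
def _demo_read_or_compute(cache, key, values, mdi=-1.e30, min_count=5):
    if key not in cache:
        vals = np.ma.asarray(values)
        if len(vals.compressed()) >= min_count:
            compressed = vals.compressed()
            cache[key] = (np.ma.mean(vals),
                          np.subtract(*np.percentile(compressed, [75, 25])))
        else:
            cache[key] = (mdi, mdi)
    return cache[key]
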