def spc(station, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Run pressure cross checks on SLP and STNLP

    :param object station: station object
    :param array flag_col: which columns to fill in flag_array
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose information
    :param bool doMonth: account for incomplete months

    :returns:
    '''
    slp = getattr(station, 'slp')
    stnlp = getattr(station, 'stnlp')

    month_ranges = utils.month_starts_in_pairs(start, end)

    station.qc_flags[:, flag_col[0]] = spc_diff(slp, stnlp, station.qc_flags[:, flag_col[0]], month_ranges, start, end, logfile, plots=plots, diagnostics=diagnostics, doMonth=doMonth)

    station = utils.append_history(station, "Pressure Cross Check")

    return # spc
def run_calcs(station, logfile, plots=False, diagnostics=False):
    '''
    Run the humidity calculations and add the attributes to the station file

    :param object station: station object
    :param file logfile: logfile to store outputs
    :param boolean diagnostics: output diagnostic information
    :param boolean plots: make a plot

    :returns: station - updated with humidity variables
    '''
    temperatures = utils.apply_flags_to_mask(station, "temperatures")
    dewpoints = utils.apply_flags_to_mask(station, "dewpoints")

    # adjust from sea-level to station-level
    station_pressure = get_station_level_pressure(station)

    e_v = utils.set_MetVar_attributes("vapor_pressure", "Vapor pressure calculated w.r.t water", "water_vapor_pressure", "hPa", temperatures.mdi, np.dtype('float64'))
    e_s = utils.set_MetVar_attributes("saturation_vapor_pressure", "Saturation vapor pressure calculated w.r.t. water", "water_vapor_pressure", "hPa", temperatures.mdi, np.dtype('float64'))
    Tw = utils.set_MetVar_attributes("wet_bulb_temperature", "Wet bulb temperatures nearest to reporting hour", "wet_bulb_temperature", "C", temperatures.mdi, np.dtype('float64'))
    q = utils.set_MetVar_attributes("specific_humidity", "Specific humidity", "specific_humidity", "g/kg", temperatures.mdi, np.dtype('float64'))
    rh = utils.set_MetVar_attributes("relative_humidity", "Relative humidity", "relative_humidity", "%rh", temperatures.mdi, np.dtype('float64'))

    # sort the vapour pressures and wet-bulb --> ice or water?
    e_v.data, e_s.data, Tw.data = fix_wrt_ice_or_water(temperatures.data, dewpoints.data, station_pressure)

    # get relative and specific humidity
    q.data = calculate_q(e_v.data, station_pressure)
    rh.data = calculate_rh(e_v.data, e_s.data)

    if plots or diagnostics:
        print "Humidity variables calculated, setting attributes\n"
    else:
        logfile.write("Humidity variables calculated, setting attributes\n")

    setattr(station, "vapour_pressure", e_v)
    setattr(station, "saturation_vapour_pressure", e_s)
    setattr(station, "wetbulb_temperature", Tw)
    setattr(station, "specific_humidity", q)
    setattr(station, "relative_humidity", rh)

    station = utils.append_history(station, "Humidity Calculations")

    return station # run_calcs
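# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite): the standard
# Magnus-type conversions that calculate_q() and calculate_rh() above
# presumably implement.  Constants are the common WMO values; the repo's
# helpers may differ in detail, and _humidity_conversion_sketch is our name.
# ---------------------------------------------------------------------------
def _humidity_conversion_sketch():
    import numpy as np

    def magnus_e(t):
        # saturation vapour pressure (hPa) w.r.t. water at temperature t (deg C)
        return 6.112 * np.exp(17.62 * t / (243.12 + t))

    t, td, p = 20.0, 15.0, 1013.0  # dry bulb, dewpoint (deg C), station pressure (hPa)
    e = magnus_e(td)               # actual vapour pressure
    e_s = magnus_e(t)              # saturation vapour pressure

    q = 1000. * (0.622 * e) / (p - 0.378 * e)  # specific humidity (g/kg)
    rh = 100. * e / e_s                        # relative humidity (%rh)
    return q, rh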
def dgc(station, variable_list, flag_col, start, end, logfile, plots=False, diagnostics=False, idl=False, GH=False):
    '''Controller for two individual tests'''

    if plots:
        import matplotlib.pyplot as plt

    for v, variable in enumerate(variable_list):

        station.qc_flags[:, flag_col[v]] = dgc_monthly(station, variable, station.qc_flags[:, flag_col[v]], start, end, plots=plots, diagnostics=diagnostics, idl=idl)

        if variable == "slp":
            # need to send in windspeeds too
            station.qc_flags[:, flag_col[v]] = dgc_all_obs(station, variable, station.qc_flags[:, flag_col[v]], start, end, plots=plots, diagnostics=diagnostics, idl=idl, windspeeds=True, GH=GH)
        else:
            station.qc_flags[:, flag_col[v]] = dgc_all_obs(station, variable, station.qc_flags[:, flag_col[v]], start, end, plots=plots, diagnostics=diagnostics, idl=idl, GH=GH)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Distributional Gap", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Distributional Gap", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var = getattr(station, variable)
        st_var.flags[flag_locs] = 1

    # MATCHES IDL for 030660-99999, 2 flags in T, 30-06-2014

    station = utils.append_history(station, "Distributional Gap Check")

    return # dgc
def ccc(station, flag_col, logfile, diagnostics=False, plots=False):
    '''
    Call the logical cloud checks

    :param obj station: station object
    :param list flag_col: flag columns to use
    :param file logfile: logfile to store output
    :param bool diagnostics: diagnostic output (unused)
    :param bool plots: do the plots (unused)

    :returns:
    '''
    if len(flag_col) != 8:
        print "insufficient flag columns given"
        return

    unobservable(station, flag_col[0:4], logfile, plots=plots, diagnostics=diagnostics)

    total_lt_max(station, flag_col[4], logfile, plots=plots, diagnostics=diagnostics)

    low_full(station, flag_col[5], logfile, plots=plots, diagnostics=diagnostics)

    mid_full(station, flag_col[6], logfile, plots=plots, diagnostics=diagnostics)

    fix_cloud_base(station)

    negative_cloud(station, flag_col[7], logfile, plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Cloud - Logical Cross Check")

    return # ccc
def run_calcs(station, logfile, plots=False, diagnostics=False):
    '''
    Run the heat stress calculations and add the new attributes to the station

    :param obj station: station object
    :param file logfile: logfile to store outputs
    :param boolean diagnostics: output diagnostic information
    :param boolean plots: make a plot

    :returns: updated station object with heat stress values.
    '''
    temperatures = utils.apply_flags_to_mask(station, "temperatures")
    rh = getattr(station, "relative_humidity")  # no separate flags using fdi
    e_v = getattr(station, "vapour_pressure")   # no separate flags using fdi
    windspeeds = utils.apply_flags_to_mask(station, "windspeeds")

    thi = utils.set_MetVar_attributes("temperature_humidity_index", "Temperature Humidity Index (THI)", "temperature_humidity_index", "1", temperatures.mdi, np.dtype('float64'))
    wbgt = utils.set_MetVar_attributes("wet_bulb_globe_temperature", "Wet Bulb Globe Temperature (WBGT)", "wet_bulb_globe_temperature", "C", temperatures.mdi, np.dtype('float64'))
    humidex = utils.set_MetVar_attributes("humidex", "Humidex", "humidex", "1", temperatures.mdi, np.dtype('float64'))
    apparent_t = utils.set_MetVar_attributes("apparent_temperature", "Apparent Temperature", "apparent_temperature", "C", temperatures.mdi, np.dtype('float64'))
    heat_index = utils.set_MetVar_attributes("heat_index", "Heat Index", "heat_index", "C", temperatures.mdi, np.dtype('float64'))

    thi.data = calculate_thi(temperatures.data, rh.data)
    wbgt.data = calculate_wbgt(temperatures.data, e_v.data)
    humidex.data = calculate_humidex(temperatures.data, e_v.data)
    apparent_t.data = calculate_apparent_t(temperatures.data, e_v.data, windspeeds.data)
    heat_index.data = calculate_heat_index(temperatures.data, rh.data)

    if plots or diagnostics:
        print "Heat stress variables calculated, setting attributes\n"
    else:
        logfile.write("Heat stress variables calculated, setting attributes\n")

    setattr(station, "THI", thi)
    setattr(station, "WBGT", wbgt)
    setattr(station, "humidex", humidex)
    setattr(station, "apparent_t", apparent_t)
    setattr(station, "heat_index", heat_index)

    station = utils.append_history(station, "Heat Stress Calculations")

    return station # run_calcs
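# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite): two of the standard
# heat-stress formulations.  The repo's calculate_humidex() and
# calculate_apparent_t() plausibly follow these, but may use other variants;
# _heat_stress_sketch is our name.
# ---------------------------------------------------------------------------
def _heat_stress_sketch():
    import numpy as np

    t = np.array([30.0, 35.0])   # air temperature (deg C)
    e = np.array([28.0, 35.0])   # vapour pressure (hPa)
    ws = np.array([2.0, 5.0])    # 10 m wind speed (m/s)

    humidex = t + (5. / 9.) * (e - 10.)           # Masterton & Richardson (1979)
    apparent_t = t + 0.33 * e - 0.70 * ws - 4.00  # Steadman (1994), shade version
    return humidex, apparent_t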
def hcc(station, flag_col, start, end, logfile, diagnostics=False, plots=False):
    '''
    Run humidity cross checks on temperature and dewpoint temperature obs

    :param object station: station object
    :param array flag_col: which columns to fill in flag_array
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose information

    :returns:
    '''
    temperatures = getattr(station, 'temperatures')
    dewpoints = getattr(station, 'dewpoints')

    month_ranges = utils.month_starts_in_pairs(start, end)

    # Supersaturation
    station.qc_flags[:, flag_col[0]] = hcc_sss(temperatures.data, dewpoints.data, month_ranges, start, logfile, plots=plots, diagnostics=diagnostics)

    # Dew point depression
    precip1 = getattr(station, 'precip1_depth')
    precip2 = getattr(station, 'precip2_depth')
    precip3 = getattr(station, 'precip3_depth')
    precip6 = getattr(station, 'precip6_depth')
    precip9 = getattr(station, 'precip9_depth')
    precip12 = getattr(station, 'precip12_depth')
    precip15 = getattr(station, 'precip15_depth')
    precip18 = getattr(station, 'precip18_depth')
    precip24 = getattr(station, 'precip24_depth')

    cloudbase = getattr(station, 'cloud_base')
    past_sigwx = getattr(station, 'past_sigwx1')

    times = station.time.data

    # combine all the precips together
    precips = np.array([precip1.data, precip2.data, precip3.data, precip6.data, precip9.data, precip12.data, precip15.data, precip18.data, precip24.data])

    station.qc_flags[:, flag_col[1]] = hcc_dpd(times, temperatures.data, dewpoints.data, precips, cloudbase.data, past_sigwx.data, start, logfile, plots=plots, diagnostics=diagnostics)

    # Dew point cutoffs
    station.qc_flags[:, flag_col[2]] = hcc_cutoffs(temperatures.data, dewpoints.data, month_ranges, logfile, start, plots=plots, diagnostics=diagnostics)

    for col in range(3):
        flag_locs = np.where(station.qc_flags[:, flag_col[col]] != 0)
        station.dewpoints.flags[flag_locs] = 1

    station = utils.append_history(station, "Temperature-Humidity Cross Check")

    return # hcc
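# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite): the core
# supersaturation condition that hcc_sss() presumably tests - dewpoint above
# dry bulb is unphysical.  The real routine also works month by month and
# writes into the flag array; _supersaturation_sketch is our name.
# ---------------------------------------------------------------------------
def _supersaturation_sketch():
    import numpy as np

    temps = np.ma.masked_values([10.0, 5.0, -99.9, 3.0], -99.9)
    dewps = np.ma.masked_values([8.0, 6.0, 2.0, -99.9], -99.9)

    flags = np.zeros(len(temps))
    sss = np.ma.where(dewps > temps)  # masked values never compare True
    flags[sss] = 1
    return flags  # only index 1 is flagged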
def pcc(station, flag_col, start, end, logfile, diagnostics=False, plots=False):
    '''
    Run cross checks on precipitation obs

    :param object station: station object
    :param array flag_col: which columns to fill in flag_array
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose information

    :returns:
    '''
    precip1 = getattr(station, 'precip1_depth')
    precip2 = getattr(station, 'precip2_depth')
    precip3 = getattr(station, 'precip3_depth')
    precip6 = getattr(station, 'precip6_depth')
    precip9 = getattr(station, 'precip9_depth')
    precip12 = getattr(station, 'precip12_depth')
    precip15 = getattr(station, 'precip15_depth')
    precip18 = getattr(station, 'precip18_depth')
    precip24 = getattr(station, 'precip24_depth')

    times = station.time.data

    # combine all the precips together
    precips = np.ma.array([precip1.data, precip2.data, precip3.data, precip6.data, precip9.data, precip12.data, precip15.data, precip18.data, precip24.data])

    station.qc_flags[:, flag_col[0]] = pcc_accumulations(times, precips, start, logfile, plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Precipitation Cross Check")

    return # pcc
def clu(station, var_list, flag_cols, FLAG_COL_DICT, start, end, logfile, plots=False, diagnostics=False):
    '''
    Run the clean up for each variable

    :param object station: station object
    :param list var_list: list of variables to process
    :param list flag_cols: which columns to fill in the flag array
    :param dict FLAG_COL_DICT: look-up of the flag columns feeding each variable
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose information
    '''
    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        clean_up(st_var, station.qc_flags, FLAG_COL_DICT[variable], flag_cols[v], start, end, station.time.data, plots=plots)

        flag_locs = np.where(station.qc_flags[:, flag_cols[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Clean Up Months", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Clean Up Months", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Clean Up Months")

    return # clu
def krc(station, var_list, flag_col, logfile, diagnostics=False, plots=False):
    '''
    Run the known records check for each variable in list

    :param object station: station to process
    :param list var_list: list of variables to process
    :param list flag_col: which columns to use for which variable
    :param file logfile: logfile to store output
    :param bool diagnostics: diagnostic output (unused)
    :param bool plots: do the plots (unused)
    '''
    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        st_region = krc_get_wmo_region(station.id)

        all_filtered = utils.apply_filter_flags(st_var)

        too_high = np.where(all_filtered > maxes[variable][st_region])
        krc_set_flags(too_high, station.qc_flags, flag_col[v])

        # make sure we don't flag the missing values!
        too_low = np.where(np.logical_and(all_filtered < mins[variable][st_region], all_filtered.mask == False))
        krc_set_flags(too_low, station.qc_flags, flag_col[v])

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "World Record", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "World Record", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "World Record Check")

    return # krc
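# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the bounds test in
# krc above.  The numbers are the WMO global temperature extremes, for
# illustration only - the real maxes/mins dictionaries hold per-WMO-region
# limits for every variable; _world_record_sketch is our name.
# ---------------------------------------------------------------------------
def _world_record_sketch():
    import numpy as np

    maxes = {"temperatures": {"global": 56.7}}   # Furnace Creek, 1913 (deg C)
    mins = {"temperatures": {"global": -89.2}}   # Vostok, 1983 (deg C)

    obs = np.ma.masked_values([12.0, 61.0, -95.0, -99.9], -99.9)
    too_high = np.where(obs > maxes["temperatures"]["global"])
    # the mask test stops missing values being caught by the lower bound
    too_low = np.where(np.logical_and(obs < mins["temperatures"]["global"], obs.mask == False))
    return too_high, too_low  # indices 1 and 2 respectively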
def rsc(station, var_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Wrapper for the four individual repeating streak check tests
    '''
    times = station.time.data

    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        if len(utils.apply_filter_flags(st_var).compressed()) > 0:

            wind = False
            if variable == "windspeeds":
                wind = True
            winddir = False
            if variable == "winddirs":
                winddir = True

            reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end), winddir=winddir, plots=plots)
            limits = limits_dict[variable][reporting_resolution]

            # need to apply flags to st_var.flags each time for filtering
            station.qc_flags[:, flag_col[v][0]] = rsc_straight_strings(st_var, times, limits[0], limits[1], start, end, reporting=reporting_resolution, wind=wind, diagnostics=diagnostics, plots=plots, dynamic=True, doMonth=doMonth)

            # no effect of final incomplete year ("month" option) as limits[2] and limits[3] fixed
            station.qc_flags[:, flag_col[v][1]], station.qc_flags[:, flag_col[v][2]] = rsc_hourly_repeats(st_var, times, limits[2], limits[3], diagnostics=diagnostics, plots=plots)

            for streak_type in range(3):
                flag_locs = np.where(station.qc_flags[:, flag_col[v][streak_type]] != 0)
                utils.print_flagged_obs_number(logfile, "Streak Check", variable, len(flag_locs[0]), noWrite=diagnostics)

                # copy flags into attribute
                st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Streak Check")

    return # rsc
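# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the repeated-value
# streak idea behind rsc_straight_strings(): find runs of identical
# consecutive values and flag any run longer than a limit.  The real limits
# depend on variable and reporting resolution; _streak_sketch is our name.
# ---------------------------------------------------------------------------
def _streak_sketch():
    import numpy as np

    obs = np.array([3.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 7.0])
    limit = 3  # illustrative only

    change = np.where(np.diff(obs) != 0)[0] + 1
    starts = np.concatenate([[0], change])
    lengths = np.diff(np.concatenate([starts, [len(obs)]]))

    flags = np.zeros(len(obs))
    for s, l in zip(starts, lengths):
        if l > limit:
            flags[s:s + l] = 1
    return flags  # the four consecutive 5.0s are flagged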
def dcc(station, variable_list, full_variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    The diurnal cycle check.

    :param object station: the station object to be processed
    :param list variable_list: the variables to be processed
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column in the qc_flags array to work on
    :param file logfile: logfile to store outputs
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output

    :returns:
    '''
    # list of flags for each variable
    diurnal_flags = []

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # is this needed 21/08/2014
        # reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags, but discount incomplete year - so that we can test values against these later
        all_data = utils.apply_filter_flags(st_var)
        all_data = all_data.reshape(-1, 24)  # working in fulltimes

        # apply flags - also apply to final incomplete year so that best values only use complete years
        filtered_data = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)
        filtered_data = filtered_data.reshape(-1, 24)  # working in fulltimes

        number_of_days = filtered_data.shape[0]

        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plot_data = np.ma.zeros(filtered_data.shape)
            plot_data.mask = True
            # best_estimate_counter = np.zeros(HOURS)

        diurnal_filtered_fits = np.zeros(filtered_data.shape[0], dtype=(int))
        diurnal_filtered_fits.fill(INTMDI)
        diurnal_best_fits = np.zeros(st_var.data.shape[0], dtype=(int))
        diurnal_best_fits.fill(INTMDI)
        diurnal_uncertainties = np.zeros(filtered_data.shape[0])
        diurnal_uncertainties.fill(INTMDI)

        for d, day in enumerate(all_data):

            '''enough observations and have large enough diurnal range'''
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)

                        '''Find differences for each shifted sine --> cost function'''
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

                        # and keep this for testing against the average value later
                        diurnal_best_fits[d] = np.argmin(diffs)

        for d, day in enumerate(filtered_data):

            '''enough observations and have large enough diurnal range'''
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)

                        '''Find differences for each shifted sine --> cost function'''
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

                        diurnal_filtered_fits[d] = np.argmin(diffs)

                        # default uncertainty is the average time resolution of the data
                        diurnal_uncertainties[d] = round(float(HOURS) / len(day.compressed()))

                        if DYNAMIC_DIURNAL:
                            critical_value = min(diffs) + ((max(diffs) - min(diffs)) * 0.33)

                            # centre so minimum in middle
                            diffs = np.roll(diffs, 11 - diurnal_filtered_fits[d])

                            uncertainty = 1
                            while uncertainty < 11:
                                if (diffs[11 - uncertainty] > critical_value) and \
                                        (diffs[11 + uncertainty] > critical_value):
                                    # break if both sides greater than critical difference
                                    # when counting outwards
                                    # see diurnal_example.py
                                    break
                                uncertainty += 1

                            # check if uncertainty greater than time resolution for day
                            if uncertainty > diurnal_uncertainties[d]:
                                diurnal_uncertainties[d] = uncertainty

                        if plots:
                            # best_estimate_counter[np.argmin(diffs)] += 1
                            # scale daily data to range -1 -> 1, plot with random scatter for clarity
                            plot_data[d] = ((2 * (day - min(day.compressed())) / obs_daily_range) - 1.)
                            plt.plot(np.arange(24) + np.random.randn(24) * 0.25, plot_data[d] + np.random.randn(24) * 0.05, 'k,')

        if plots:
            plt.plot(np.arange(24), np.roll(dcc_make_sine(), np.argmax(np.bincount(diurnal_filtered_fits[np.where(diurnal_filtered_fits != INTMDI)]))), 'r-')
            plt.xlim([-1, 25])
            plt.ylim([-1.2, 1.2])
            plt.show()

        # dumb copy of IDL
        '''For each uncertainty range (1-6h) find median of cycle offset'''
        filtered_fits = np.zeros(6)
        filtered_fits.fill(-9)
        for h in range(6):
            locs = np.where(diurnal_uncertainties == h + 1)

            if len(locs[0]) > 300:
                # filtered_fits[h] = int(np.median(diurnal_filtered_fits[locs]))
                # Numpy median gives average of central two values which may not be integer
                # 25/11/2014 use IDL style which gives lower value
                filtered_fits[h] = utils.idl_median(diurnal_filtered_fits[locs])

        '''Build up range of cycles incl. uncertainty to find where best of best located'''
        hours = np.arange(24)
        hour_matches = np.zeros(24)
        diurnal_peak = -9
        number_estimates = 0
        for h in range(6):
            if filtered_fits[h] != -9:

                '''Store lowest uncertainty best fit as first guess'''
                if diurnal_peak == -9:
                    diurnal_peak = filtered_fits[h]
                    hours = np.roll(hours, 11 - int(diurnal_peak))
                    hour_matches[11 - (h + 1):11 + (h + 2)] = 1
                    number_estimates += 1

                centre, = np.where(hours == filtered_fits[h])

                if (centre[0] - (h + 1)) >= 0:
                    if (centre[0] + h + 1) <= 23:
                        hour_matches[centre[0] - (h + 1):centre[0] + h + 2] += 1
                    else:
                        hour_matches[centre[0] - (h + 1):] += 1
                        hour_matches[:centre[0] + h + 2 - 24] += 1
                else:
                    hour_matches[:centre[0] + h + 2] += 1
                    hour_matches[centre[0] - (h + 1):] += 1

                number_estimates += 1

        '''If value at lowest uncertainty not found in all others, then see what value is found by all others'''
        if hour_matches[11] != number_estimates:  # central estimate at 12 o'clock
            all_match = np.where(hour_matches == number_estimates)

            # if one is, then use it
            if len(all_match[0]) > 0:
                diurnal_peak = all_match[0][0]
            else:
                diurnal_peak = -9

        '''Now have value for best fit diurnal offset'''
        potentially_spurious = np.zeros(number_of_days)
        potentially_spurious.fill(INTMDI)

        if diurnal_peak != -9:
            hours = np.arange(24)
            hours = np.roll(hours, 11 - int(diurnal_peak))
            for d in range(number_of_days):
                # and now going back to the unfiltered data
                if diurnal_best_fits[d] != INTMDI:

                    '''Checks if global falls inside daily value+/-range
                    rather than seeing if each day falls in global value+/-range'''
                    min_range = 11 - diurnal_uncertainties[d]
                    max_range = 11 + diurnal_uncertainties[d]
                    maxloc = np.where(hours == diurnal_best_fits[d])[0][0]

                    if maxloc < min_range or maxloc > max_range:
                        potentially_spurious[d] = 1
                    else:
                        potentially_spurious[d] = 0

            # count number of good, missing and not-bad days
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points = 0
            total_not_miss = 0
            to_flag = np.zeros(number_of_days)

            for d in range(number_of_days):

                if potentially_spurious[d] == 1:
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points += 1
                    total_not_miss += 1
                else:
                    if potentially_spurious[d] == 0:
                        n_good += 1
                        n_not_bad += 1
                        if n_miss != 0:
                            n_miss = 0
                        total_not_miss += 1
                    if potentially_spurious[d] == -999:
                        n_miss += 1
                        n_not_bad += 1
                        if n_good != 0:
                            n_good = 0
                    total_points += 1

                    if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):

                        if total_points >= 30:
                            if float(total_not_miss) / total_points >= 0.5:
                                to_flag[d - total_points:d] = 1

                        n_good = 0
                        n_miss = 0
                        n_not_bad = 0
                        total_points = 0
                        total_not_miss = 0

            dcc_flags = np.zeros(filtered_data.shape)

            for d in range(number_of_days):
                if to_flag[d] == 1:
                    good = np.where(filtered_data.mask[d, :] == False)
                    if len(good[0]) >= 1:
                        dcc_flags[d, good] = 1

            if diagnostics:
                print len(np.where(dcc_flags == 1)[0])
                print "currently matches IDL, but should all hours in days have flags set, not just the missing/flagged ones?"

            diurnal_flags += [dcc_flags]
        else:
            diurnal_flags += [np.zeros(filtered_data.shape)]

        station.qc_flags[:, flag_col[v]] = np.array(diurnal_flags).reshape(-1)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]), noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD

    utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Diurnal Cycle", plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Diurnal Cycle Check")

    return # dcc
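# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the shifted-sine
# cost function at the heart of dcc: scale a 24-point sine to the day's
# range, roll it through all 24 phase offsets, and keep the offset with the
# smallest summed absolute difference.  The plain np.sin below is one assumed
# form of dcc_make_sine(); _diurnal_fit_sketch is our name.
# ---------------------------------------------------------------------------
def _diurnal_fit_sketch():
    import numpy as np

    HOURS = 24
    day = 15. + 5. * np.sin(2. * np.pi * (np.arange(HOURS) - 8) / HOURS)  # synthetic day

    sine = np.sin(2. * np.pi * np.arange(HOURS) / HOURS)
    obs_range = day.max() - day.min()
    scaled_sine = ((sine + 1.) / 2. * obs_range) + day.min()

    diffs = np.zeros(HOURS)
    for h in range(HOURS):
        diffs[h] = np.sum(np.abs(day - scaled_sine))
        scaled_sine = np.roll(scaled_sine, 1)

    return np.argmin(diffs)  # best-fit phase offset: 8 for this synthetic day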
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, second=False):
    '''
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    :param bool second: run for second time

    :returns:
    '''
    print "refactor"

    for v, variable in enumerate(variable_list):

        flags = station.qc_flags[:, flag_col[v]]

        prev_flag_number = 0
        if second:
            # count currently existing flags:
            prev_flag_number = len(flags[flags != 0])

        st_var = getattr(station, variable)

        all_filtered = utils.apply_filter_flags(st_var)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1:
            reporting_resolution = 1

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        good = np.where(all_filtered.mask == False)

        full_time_diffs = np.ma.zeros(len(all_filtered))
        full_time_diffs.mask = all_filtered.mask
        full_time_diffs[good] = station.time.data[good][1:] - station.time.data[good][:-1]

        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        print "sort the differencing if values were flagged rather than missing"

        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = all_filtered.mask
        full_filtered_diffs[good] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1]

        # test all values
        good_to_uncompress = np.where(st_var.data.mask == False)
        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = st_var.data.mask
        full_value_diffs[good_to_uncompress] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        time_diffs = full_time_diffs.compressed()
        filtered_diffs = full_filtered_diffs.compressed()
        flags = flags[good_to_uncompress]

        critical_values = np.zeros([9, 12])
        critical_values.fill(st_var.mdi)

        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape)

        for month in range(12):
            for year in range(month_ranges.shape[0]):

                if year == 0:
                    this_month_time_diff = full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                    this_month_filtered_diff = full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                else:
                    this_month_time_diff = np.ma.concatenate([this_month_time_diff, full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])
                    this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff, full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])

                month_locs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] = month

            for delta in range(1, 9):

                locs = np.ma.where(this_month_time_diff == delta)

                if len(locs[0]) >= 100:

                    iqr = utils.IQR(this_month_filtered_diff[locs])

                    if iqr == 0. and delta == 1:
                        critical_values[delta - 1, month] = 6.
                    elif iqr == 0:
                        critical_values[delta - 1, month] = st_var.mdi
                    else:
                        critical_values[delta - 1, month] = 6. * iqr

                    # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD

                    if plots:
                        import calendar
                        title = "{}, {}-hr differences".format(calendar.month_name[month + 1], delta)
                        line_label = st_var.name
                        xlabel = "First Difference Magnitudes"
                    else:
                        title, line_label, xlabel = "", "", ""

                    threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin=0, binwidth=0.5, plots=plots, diagnostics=diagnostics, title=title, line_label=line_label, xlabel=xlabel, old_threshold=critical_values[delta - 1, month])

                    if threshold < critical_values[delta - 1, month]:
                        critical_values[delta - 1, month] = threshold

                    if plots or diagnostics:
                        print critical_values[delta - 1, month], iqr, 6 * iqr

        month_locs = month_locs[good_to_uncompress]
        if diagnostics:
            print critical_values[0, :]

        # not less than 5x reporting accuracy
        good_critical_values = np.where(critical_values != st_var.mdi)
        low_critical_values = np.where(critical_values[good_critical_values] <= 5. * reporting_resolution)
        temporary = critical_values[good_critical_values]
        temporary[low_critical_values] = 5. * reporting_resolution
        critical_values[good_critical_values] = temporary

        if diagnostics:
            print critical_values[0, :], 5. * reporting_resolution

        # check hourly against 2 hourly, if <2/3 then increase to avoid crazy rejection rate
        for month in range(12):
            if critical_values[0, month] != st_var.mdi and critical_values[1, month] != st_var.mdi:
                if critical_values[0, month] / critical_values[1, month] <= 0.66:
                    critical_values[0, month] = 0.66 * critical_values[1, month]

        if diagnostics:
            print critical_values[0, :]

        # get time differences for unfiltered data
        full_time_diffs = np.ma.zeros(len(st_var.data))
        full_time_diffs.mask = st_var.data.mask
        full_time_diffs[good_to_uncompress] = station.time.data[good_to_uncompress][1:] - station.time.data[good_to_uncompress][:-1]
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds

        # spikes at the beginning or ends of sections
        for t in np.arange(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3):
                # 10 days before but short gap thereafter

                next_values = st_var.data[good_to_uncompress[0][t + 1:]]
                good, = np.where(next_values.mask == False)

                next_median = np.ma.median(next_values[good[:10]])

                next_diff = np.abs(value_diffs[t])  # out of spike
                median_diff = np.abs(next_median - st_var.data[good_to_uncompress[0][t]])  # are the remaining ones

                if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi):

                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and \
                            (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t + 1], start, variable, plots=plots)

            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240):
                # 10 days after but short gap before

                prev_values = st_var.data[good_to_uncompress[0][:t - 1]]
                good, = np.where(prev_values.mask == False)

                prev_median = np.ma.median(prev_values[good[-10:]])

                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[0][t]])

                if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi):

                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and \
                            (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t + 1], start, variable, plots=plots)

        '''this isn't the nicest way, but a direct copy from IDL
        masked arrays might help remove some of the lines
        Also, this is relatively slow'''
        for t in np.arange(len(time_diffs)):
            for spk_len in [1, 2, 3]:
                if t >= spk_len and t < len(time_diffs) - spk_len:

                    # check if time differences are appropriate, for multi-point spikes
                    if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and \
                            (np.abs(time_diffs[t]) <= spk_len * 3) and \
                            (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and \
                            (time_diffs[t + 1] - 1 < spk_len * 3) and \
                            ((spk_len == 1) or
                             ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or
                             ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))):

                        # check if differences are valid
                        if (value_diffs[t - spk_len] != st_var.mdi) and \
                                (value_diffs[t - spk_len] != st_var.fdi) and \
                                (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi):

                            # if exceed critical values
                            if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]):

                                # are signs of two differences different
                                if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])):

                                    # are within-spike differences small
                                    if (spk_len == 1) or \
                                            ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.)) or \
                                            ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.) and (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] - 1, month_locs[t]] / 2.)):

                                        # check if following value is valid
                                        if (value_diffs[t] != st_var.mdi) and (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and \
                                                (value_diffs[t] != st_var.fdi):

                                            # and if at least critical value
                                            if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]):

                                                # test if surrounding differences below 1/2 critical value
                                                if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.):
                                                    if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.):

                                                        # set the flags
                                                        flags[t - spk_len + 1:t + 1] = 1

                                                        if plots or diagnostics:
                                                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t - spk_len + 1], good_to_uncompress[0][t + 1], start, variable, plots=plots)

        station.qc_flags[good_to_uncompress, flag_col[v]] = flags

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number, noWrite=True)  # additional flags
        else:
            utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number)  # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        do_interactive = False
        if plots and do_interactive == True:
            import matplotlib.pyplot as plt
            plot_times = utils.times_hours_to_datetime(station.time.data, start)

            plt.clf()
            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            flg = np.where(flags[:, flag_col[v]] == 1)
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)
            plt.show()

    station = utils.append_history(station, "Spike Check")

    return # sc
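# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the spike-threshold
# logic above: the critical value for a given month and time gap is 6 * IQR
# of the filtered first differences, floored at 5x the reporting resolution.
# utils.IQR is assumed to return q75 - q25; _spike_threshold_sketch is our name.
# ---------------------------------------------------------------------------
def _spike_threshold_sketch():
    import numpy as np

    diffs = np.random.randn(500) * 0.8  # stand-in for one month's first differences
    reporting_resolution = 0.1

    q75, q25 = np.percentile(diffs, [75, 25])
    critical = 6. * (q75 - q25)

    if critical <= 5. * reporting_resolution:  # not less than 5x reporting accuracy
        critical = 5. * reporting_resolution
    return critical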
def neighbour_checks(station_info, restart_id="", end_id="", distances=np.array([]), angles=np.array([]), second=False, masking=False, doZip=False, plots=False, diagnostics=False):
    """
    Run through neighbour checks on list of stations passed

    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations.
    """
    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncating the array
    neighbour_elevations = np.array(station_info[:, 3], dtype=float)
    neighbour_ids = np.array(station_info[:, 0])
    neighbour_info = np.array(station_info[:, :])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex:endindex + 1]
            distances = distances[startindex:endindex + 1, :]
            angles = angles[startindex:endindex + 1, :]
        else:
            station_info = station_info[startindex:]
            distances = distances[startindex:, :]
            angles = angles[startindex:, :]
    else:
        station_info = station_info[startindex:]
        distances = distances[startindex:, :]
        angles = angles[startindex:, :]

    # process each station
    for st, stat in enumerate(station_info):

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a')  # append to file if second iteration
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # select neighbours
        neighbour_distances = distances[st, :]
        neighbour_bearings = angles[st, :]

        # have to add in start index so that can use location in distance file
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st + startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour", "Distance", "Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n])
        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour", "Distance", "Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                #        but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second=second, diagnostics=diagnostics, plots=plots)

                    # now read in final set of neighbours and process
                    neigh_flags = np.zeros(len(station.time.data))  # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data))  # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data))  # dewpoint depression flags from neighbours
                    reporting_accuracies = np.zeros(len(neighbours))  # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)])  # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id=False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance=neighbour_distances[nn_loc], diagnostics=diagnostics, plots=plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh, variable).data)

                        dpd_flags += neigh.qc_flags[:, 31]
                    # gone through all neighbours

                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3), (neigh_flags[some_flags].astype("float") / neigh_count[some_flags] > 2. / 3.)))

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1
                    else:
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))

                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots=plots, diagnostics=diagnostics)

                else:
                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))

            # variable loop
        else:
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")

        # clean up months
        qc_tests.clean_up.clu(station, ["temperatures", "dewpoints", "windspeeds", "winddirs", "slp"], [44, 45, 46, 47, 48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots=plots)

        if diagnostics or plots:
            raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile)

            # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            logfile.close()

    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):
            if first:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask.nc")])
            elif second:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal2.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external2.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask2.nc")])

    print "Neighbour Checks completed\n"

    return # neighbour_checks
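# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the voting rule in
# neighbour_checks above: an observation is an outlier when at least 3
# neighbours report at that time and more than 2/3 of them flag it; times
# with fewer than 3 reporting neighbours get a tentative -1.
# _neighbour_vote_sketch is our name.
# ---------------------------------------------------------------------------
def _neighbour_vote_sketch():
    import numpy as np

    neigh_flags = np.array([0, 2, 3, 1, 4])  # neighbours flagging each time step
    neigh_count = np.array([5, 2, 4, 4, 5])  # neighbours reporting at each time step
    qc_col = np.zeros(5)

    some_flags, = np.where(neigh_flags > 0)
    outlier_locs, = np.where(np.logical_and(neigh_count[some_flags] >= 3, neigh_flags[some_flags].astype("float") / neigh_count[some_flags] > 2. / 3.))

    locs = np.where(neigh_count[some_flags] < 3)
    qc_col[some_flags[locs]] = -1
    qc_col[some_flags[outlier_locs]] = 1
    return qc_col  # [0., -1., 1., 0., 1.]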
def fvc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Check for certain values occurring more frequently than would be expected

    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots
    :param bool doMonth: ignore months after last complete year/season for distribution
    '''
    MIN_DATA_REQUIRED = 500  # to create histogram for complete record
    MIN_DATA_REQUIRED_YEAR = 100  # to create histogram

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges_years = month_ranges.reshape(-1, 12, 2)

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)

        for season in range(5):  # Year, MAM, JJA, SON, JF+D

            if season == 0:
                # all year
                season_data = np.ma.masked_values(filtered_data.compressed(), st_var.fdi)
                thresholds = [30, 20, 10]
            else:
                thresholds = [20, 15, 10]
                season_data = np.ma.array([])

                for y, year in enumerate(month_ranges_years):
                    # churn through months extracting data, accounting for fdi and concatenating together
                    if season == 1:
                        # mam
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[2][0]:year[4][-1]], st_var.fdi)])
                    elif season == 2:
                        # jja
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[5][0]:year[7][-1]], st_var.fdi)])
                    elif season == 3:
                        # son
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[8][0]:year[10][-1]], st_var.fdi)])
                    elif season == 4:
                        # d+jf
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[0][0]:year[1][-1]], st_var.fdi)])
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[-1][0]:year[-1][-1]], st_var.fdi)])

            season_data = season_data.compressed()

            if len(season_data) > MIN_DATA_REQUIRED:

                if 0 < reporting_accuracy <= 0.5:  # -1 used as missing value
                    bins, bincenters = utils.create_bins(season_data, 0.5)
                else:
                    bins, bincenters = utils.create_bins(season_data, 1.0)

                hist, binEdges = np.histogram(season_data, bins=bins)

                if plots:
                    plot_hist, bincenters = fvc_plot_setup(season_data, hist, binEdges, st_var.name, title="%s" % (SEASONS[season]))

                bad_bin = np.zeros(len(hist))

                # scan through bin values and identify bad ones
                for e, element in enumerate(hist):
                    if e > 3 and e <= (len(hist) - 3):
                        # don't bother with first three or last three bins
                        seven_bins = hist[e - 3:e + 3 + 1]
                        if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                            # is local maximum and != zero
                            if (seven_bins[3] / float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= thresholds[0]):
                                # contains >50% of data and is greater than threshold
                                bad_bin[e] = 1
                            else:
                                # for plotting remove good bins
                                if plots: plot_hist[e] = 1e-1
                        else:
                            if plots: plot_hist[e] = 1e-1
                    else:
                        if plots: plot_hist[e] = 1e-1

                if plots:
                    import matplotlib.pyplot as plt
                    plt.step(bincenters, plot_hist, 'r-', where='mid')
                    plt.show()

                # having identified possible bad bins, check each year in turn, on unfiltered data
                for y, year in enumerate(month_ranges_years):

                    if season == 0:
                        # year
                        year_data = np.ma.masked_values(st_var.data[year[0][0]:year[-1][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]]
                    elif season == 1:
                        # mam
                        year_data = np.ma.masked_values(st_var.data[year[2][0]:year[4][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[2][0]:year[4][-1], flag_col[v]]
                    elif season == 2:
                        # jja
                        year_data = np.ma.masked_values(st_var.data[year[5][0]:year[7][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[5][0]:year[7][-1], flag_col[v]]
                    elif season == 3:
                        # son
                        year_data = np.ma.masked_values(st_var.data[year[8][0]:year[10][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[8][0]:year[10][-1], flag_col[v]]
                    elif season == 4:
                        # d+jf
                        year_data = np.ma.concatenate([np.ma.masked_values(st_var.data[year[0][0]:year[1][-1]], st_var.fdi),
                                                       np.ma.masked_values(st_var.data[year[-1][0]:year[-1][-1]], st_var.fdi)])
                        year_flags = np.append(station.qc_flags[year[0][0]:year[1][-1], flag_col[v]], station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]])

                    if len(year_data.compressed()) > MIN_DATA_REQUIRED_YEAR:

                        hist, binEdges = np.histogram(year_data.compressed(), bins=bins)

                        if plots:
                            plot_hist, bincenters = fvc_plot_setup(year_data.compressed(), hist, binEdges, st_var.name, title="%s - %s" % (y + start.year, SEASONS[season]))

                        for e, element in enumerate(hist):

                            if bad_bin[e] == 1:
                                # only look at pre-identified bins

                                if e >= 3 and e <= (len(hist) - 3):
                                    # don't bother with first three or last three bins
                                    seven_bins = hist[e - 3:e + 3 + 1].astype('float')
                                    if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                                        # is local maximum and != zero
                                        if (seven_bins[3] / seven_bins.sum() >= 0.5 and seven_bins[3] >= thresholds[1]) \
                                                or (seven_bins[3] / seven_bins.sum() >= 0.9 and seven_bins[3] >= thresholds[2]):
                                            # contains >50% or >90% of data and is greater than appropriate threshold

                                            # flag these data
                                            bad_points = np.where((year_data >= binEdges[e]) & (year_data < binEdges[e + 1]))
                                            year_flags[bad_points] = 1
                                        else:
                                            # for plotting remove good bins
                                            if plots: plot_hist[e] = 1e-1
                                    else:
                                        if plots: plot_hist[e] = 1e-1
                                else:
                                    if plots: plot_hist[e] = 1e-1
                            else:
                                if plots: plot_hist[e] = 1e-1

                        if diagnostics or plots:
                            nflags = len(np.where(year_flags != 0)[0])
                            print "{} {}".format(y + start.year, nflags)

                        if plots:
                            if nflags > 0:
                                plt.step(bincenters, plot_hist, 'r-', where='mid')
                                plt.show()
                            else:
                                plt.clf()

                    # copy flags back
                    if season == 0:
                        station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]] = year_flags
                    elif season == 1:
                        station.qc_flags[year[2][0]:year[4][-1], flag_col[v]] = year_flags
                    elif season == 2:
                        station.qc_flags[year[5][0]:year[7][-1], flag_col[v]] = year_flags
                    elif season == 3:
                        station.qc_flags[year[8][0]:year[10][-1], flag_col[v]] = year_flags
                    elif season == 4:
                        split = len(station.qc_flags[year[0][0]:year[1][-1], flag_col[v]])
                        station.qc_flags[year[0][0]:year[1][-1], flag_col[v]] = year_flags[:split]
                        station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]] = year_flags[split:]

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]), noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Frequent Values Check")

    return # fvc
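# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the seven-bin test
# in fvc above: a histogram bin is suspect when it is the non-zero local
# maximum of its seven-bin window and holds at least half of the window's
# observations while exceeding the season threshold.
# _frequent_value_sketch is our name.
# ---------------------------------------------------------------------------
def _frequent_value_sketch():
    import numpy as np

    hist = np.array([2, 3, 1, 2, 4, 3, 60, 2, 3, 1, 2, 4])
    threshold = 30
    bad_bin = np.zeros(len(hist))

    for e in range(len(hist)):
        if e > 3 and e <= (len(hist) - 3):  # skip the edge bins, as in fvc
            seven_bins = hist[e - 3:e + 3 + 1]
            if seven_bins[3] == seven_bins.max() and seven_bins[3] != 0:
                if (seven_bins[3] / float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= threshold):
                    bad_bin[e] = 1
    return np.where(bad_bin == 1)  # bin 6 stands out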
def dcc(station, variable_list, full_variable_list, flag_col, logfile, plots=False, diagnostics=False):
    '''
    The diurnal cycle check.

    :param object station: the station object to be processed
    :param list variable_list: the variables to be processed
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column in the qc_flags array to work on
    :param file logfile: logfile to store outputs
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output
    :returns:
    '''

    # list of flags for each variable
    diurnal_flags = []

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # is this needed 21/08/2014
        # reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var)
        filtered_data = filtered_data.reshape(-1, 24)  # working in fulltimes.

        number_of_days = filtered_data.shape[0]

        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plot_data = np.ma.zeros(filtered_data.shape)
            plot_data.mask = True
            # best_estimate_counter = np.zeros(HOURS)

        diurnal_best_fits = np.zeros(filtered_data.shape[0], dtype=(int))
        diurnal_best_fits.fill(INTMDI)
        diurnal_uncertainties = np.zeros(filtered_data.shape[0])
        diurnal_uncertainties.fill(INTMDI)

        for d, day in enumerate(filtered_data):

            # enough observations and a large enough diurnal range
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)

                        # find differences for each shifted sine --> cost function
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

                        diurnal_best_fits[d] = np.argmin(diffs)

                        # default uncertainty is the average time resolution of the data
                        diurnal_uncertainties[d] = round(float(HOURS) / len(day.compressed()))

                        if DYNAMIC_DIURNAL:
                            critical_value = min(diffs) + ((max(diffs) - min(diffs)) * 0.33)

                            # centre so minimum in middle
                            diffs = np.roll(diffs, 11 - diurnal_best_fits[d])

                            uncertainty = 1
                            while uncertainty < 11:
                                if (diffs[11 - uncertainty] > critical_value) and \
                                        (diffs[11 + uncertainty] > critical_value):
                                    # break if both sides greater than critical difference
                                    # when counting outwards
                                    # see diurnal_example.py
                                    break
                                uncertainty += 1

                            # check if uncertainty greater than time resolution for day
                            if uncertainty > diurnal_uncertainties[d]:
                                diurnal_uncertainties[d] = uncertainty

                        if plots:
                            # best_estimate_counter[np.argmin(diffs)] += 1
                            # scale daily data to range -1 -> 1, plot with random scatter for clarity
                            plot_data[d] = ((2 * (day - min(day.compressed())) / obs_daily_range) - 1.)
                            plt.plot(np.arange(24) + np.random.randn(24) * 0.25, plot_data[d] + np.random.randn(24) * 0.05, 'k,')

        if plots:
            plt.plot(np.arange(24), np.roll(dcc_make_sine(), np.argmax(np.bincount(diurnal_best_fits[np.where(diurnal_best_fits != INTMDI)]))), 'r-')
            plt.xlim([-1, 25])
            plt.ylim([-1.2, 1.2])
            plt.show()

        # dumb copy of IDL
        # for each uncertainty range (1-6h) find median of cycle offset
        best_fits = np.zeros(6)
        best_fits.fill(-9)
        for h in range(6):
            locs = np.where(diurnal_uncertainties == h + 1)

            if len(locs[0]) > 300:
                # best_fits[h] = int(np.median(diurnal_best_fits[locs]))
                # Numpy median gives average of central two values which may not be integer
                # 25/11/2014 use IDL style which gives lower value
                best_fits[h] = utils.idl_median(diurnal_best_fits[locs])

        # build up range of cycles incl. uncertainty to find where best of best located
        hours = np.arange(24)
        hour_matches = np.zeros(24)
        diurnal_peak = -9
        number_estimates = 0
        for h in range(6):
            if best_fits[h] != -9:

                # store lowest uncertainty best fit as first guess
                if diurnal_peak == -9:
                    diurnal_peak = best_fits[h]
                    hours = np.roll(hours, 11 - int(diurnal_peak))
                    hour_matches[11 - (h + 1):11 + (h + 2)] = 1
                    number_estimates += 1

                centre = np.where(hours == best_fits[h])

                if (centre[0] - h + 1) >= 0:
                    if (centre[0] + h + 1) <= 23:
                        hour_matches[centre[0] - (h + 1): centre[0] + h + 2] += 1
                    else:
                        hour_matches[centre[0] - (h + 1):] += 1
                        hour_matches[:centre[0] + h + 2 - 24] += 1
                else:
                    hour_matches[:centre[0] + h + 2] += 1
                    hour_matches[centre[0] - (h + 1):] += 1

                number_estimates += 1

        # if value at lowest uncertainty not found in all others,
        # then see what value is found by all others
        if hour_matches[11] != number_estimates:  # central estimate at 12 o'clock
            all_match = np.where(hour_matches == number_estimates)

            # if one is, then use it
            if len(all_match[0]) > 0:
                diurnal_peak = all_match[0][0]
            else:
                diurnal_peak = -9

        # now have value for best fit diurnal offset
        potentially_spurious = np.zeros(number_of_days)
        potentially_spurious.fill(INTMDI)

        if diurnal_peak != -9:
            hours = np.arange(24)
            hours = np.roll(hours, 11 - int(diurnal_peak))
            for d in range(number_of_days):
                if diurnal_best_fits[d] != INTMDI:

                    # checks if global falls inside daily value +/- range
                    # rather than seeing if each day falls in global value +/- range
                    min_range = 11 - diurnal_uncertainties[d]
                    max_range = 11 + diurnal_uncertainties[d]
                    maxloc = np.where(hours == diurnal_best_fits[d])[0][0]

                    if maxloc < min_range or maxloc > max_range:
                        potentially_spurious[d] = 1
                    else:
                        potentially_spurious[d] = 0

            # count number of good, missing and not-bad days
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points = 0
            total_not_miss = 0
            to_flag = np.zeros(number_of_days)

            for d in range(number_of_days):

                if potentially_spurious[d] == 1:
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points += 1
                    total_not_miss += 1
                else:
                    if potentially_spurious[d] == 0:
                        n_good += 1
                        n_not_bad += 1
                        if n_miss != 0:
                            n_miss = 0
                        total_not_miss += 1

                    if potentially_spurious[d] == -999:
                        n_miss += 1
                        n_not_bad += 1
                        if n_good != 0:
                            n_good = 0

                    total_points += 1

                    if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):
                        if total_points >= 30:
                            if float(total_not_miss) / total_points >= 0.5:
                                to_flag[d - total_points:d] = 1
                        n_good = 0
                        n_miss = 0
                        n_not_bad = 0
                        total_points = 0
                        total_not_miss = 0

            dcc_flags = np.zeros(filtered_data.shape)

            for d in range(number_of_days):
                if to_flag[d] == 1:
                    good = np.where(filtered_data.mask[d, :] == False)
                    if len(good[0]) >= 1:
                        dcc_flags[d, good] = 1

            if diagnostics:
                print len(np.where(dcc_flags == 1)[0])
                print "currently matches IDL, but should all hours in days have flags set, not just the missing/flagged ones?"

            diurnal_flags += [dcc_flags]
        else:
            diurnal_flags += [np.zeros(filtered_data.shape)]

        station.qc_flags[:, flag_col[v]] = np.array(diurnal_flags).reshape(-1)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD

    utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Diurnal Cycle", plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Diurnal Cycle Check")

    return  # dcc
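# A compact sketch of the fitting step inside dcc: scale a 24-point sine to the
# day's observed range, then score every cyclic shift by total absolute
# difference and keep the best one. For simplicity this assumes a day with no
# missing hours; the check itself works on masked arrays.
import numpy as np

def best_sine_shift(day):
    '''Return (best_shift, costs) for a 24-value day against a shifted sine.'''
    sine = np.sin(2. * np.pi * np.arange(24) / 24.)
    day_range = day.max() - day.min()
    scaled_sine = ((sine + 1.) / 2. * day_range) + day.min()
    costs = np.array([np.sum(np.abs(day - np.roll(scaled_sine, h))) for h in range(24)])
    return np.argmin(costs), costs

# a synthetic day peaking 6 hours later than the template recovers that offset
example_day = 10. + 5. * np.sin(2. * np.pi * (np.arange(24) - 6) / 24.)
shift, costs = best_sine_shift(example_day)  # shift == 6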
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: extra verbose output
    :param bool plots: do plots
    :param bool doMonth: account for incomplete months
    :returns:
    '''
    print "refactor"

    for v, variable in enumerate(variable_list):

        flags = station.qc_flags[:, flag_col[v]]

        st_var = getattr(station, variable)

        # if incomplete year, mask all obs for the incomplete bit
        all_filtered = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1:
            reporting_resolution = 1

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        good, = np.where(all_filtered.mask == False)

        full_time_diffs = np.ma.zeros(len(all_filtered), dtype=int)
        full_time_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_time_diffs[good[:-1]] = station.time.data[good[1:]] - station.time.data[good[:-1]]

        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        print "sort the differencing if values were flagged rather than missing"

        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_filtered_diffs[good[:-1]] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1]

        # test all values
        good_to_uncompress, = np.where(st_var.data.mask == False)

        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_value_diffs[good_to_uncompress[:-1]] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        time_diffs = full_time_diffs.compressed()
        filtered_diffs = full_filtered_diffs.compressed()
        flags = flags[good_to_uncompress]

        critical_values = np.zeros([9, 12])
        critical_values.fill(st_var.mdi)

        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape, dtype=int)

        for month in range(12):
            for year in range(month_ranges.shape[0]):

                if year == 0:
                    this_month_time_diff = full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                    this_month_filtered_diff = full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                else:
                    this_month_time_diff = np.ma.concatenate([this_month_time_diff,
                                                              full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])
                    this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff,
                                                                  full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])

                month_locs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] = month

            for delta in range(1, 9):

                locs = np.ma.where(this_month_time_diff == delta)

                if len(locs[0]) >= 100:

                    iqr = utils.IQR(this_month_filtered_diff[locs])

                    if iqr == 0. and delta == 1:
                        critical_values[delta - 1, month] = 6.
                    elif iqr == 0:
                        critical_values[delta - 1, month] = st_var.mdi
                    else:
                        critical_values[delta - 1, month] = 6. * iqr

                    # January 2015 - changed to dynamically calculating the
                    # thresholds if less than IQR method ^RJHD
                    if plots:
                        import calendar
                        title = "{}, {}-hr differences".format(calendar.month_name[month + 1], delta)
                        line_label = st_var.name
                        xlabel = "First Difference Magnitudes"
                    else:
                        title, line_label, xlabel = "", "", ""

                    threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin=0, binwidth=0.5,
                                                          plots=plots, diagnostics=diagnostics, title=title,
                                                          line_label=line_label, xlabel=xlabel,
                                                          old_threshold=critical_values[delta - 1, month])

                    if threshold < critical_values[delta - 1, month]:
                        critical_values[delta - 1, month] = threshold

                    if plots or diagnostics:
                        print critical_values[delta - 1, month], iqr, 6 * iqr

        month_locs = month_locs[good_to_uncompress]
        if diagnostics:
            print critical_values[0, :]

        # not less than 5x reporting accuracy
        good_critical_values = np.where(critical_values != st_var.mdi)
        low_critical_values = np.where(critical_values[good_critical_values] <= 5. * reporting_resolution)
        temporary = critical_values[good_critical_values]
        temporary[low_critical_values] = 5. * reporting_resolution
        critical_values[good_critical_values] = temporary

        if diagnostics:
            print critical_values[0, :], 5. * reporting_resolution

        # check hourly against 2 hourly, if < 2/3 then increase to avoid crazy rejection rate
        for month in range(12):
            if critical_values[0, month] != st_var.mdi and critical_values[1, month] != st_var.mdi:
                if critical_values[0, month] / critical_values[1, month] <= 0.66:
                    critical_values[0, month] = 0.66 * critical_values[1, month]

        if diagnostics:
            print "critical values"
            print critical_values[0, :]

        # get time differences for unfiltered data
        full_time_diffs = np.ma.zeros(len(st_var.data), dtype=int)
        full_time_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_time_diffs[good_to_uncompress[:-1]] = station.time.data[good_to_uncompress[1:]] - station.time.data[good_to_uncompress[:-1]]
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds

        # spikes at the beginning or ends of sections
        for t in np.arange(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3):
                # 10 days before but short gap thereafter

                next_values = st_var.data[good_to_uncompress[t + 1:]]
                good, = np.where(next_values.mask == False)

                next_median = np.ma.median(next_values[good[:10]])

                next_diff = np.abs(value_diffs[t])  # out of spike
                median_diff = np.abs(next_median - st_var.data[good_to_uncompress[t]])  # are the remaining ones

                if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi):

                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and \
                            (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t], good_to_uncompress[t + 1], start, variable, plots=plots)

            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240):
                # 10 days after but short gap before

                prev_values = st_var.data[good_to_uncompress[:t - 1]]
                good, = np.where(prev_values.mask == False)

                prev_median = np.ma.median(prev_values[good[-10:]])

                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[t]])

                if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi):

                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and \
                            (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t], good_to_uncompress[t + 1], start, variable, plots=plots)

        # this isn't the nicest way, but a direct copy from IDL
        # masked arrays might help remove some of the lines
        # Also, this is relatively slow
        for t in np.arange(len(time_diffs)):
            for spk_len in [1, 2, 3]:
                if t >= spk_len and t < len(time_diffs) - spk_len:

                    # check if time differences are appropriate, for multi-point spikes
                    if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and \
                            (np.abs(time_diffs[t]) <= spk_len * 3) and \
                            (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and \
                            (time_diffs[t + 1] - 1 < spk_len * 3) and \
                            ((spk_len == 1) or
                             ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or
                             ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))):

                        # check if differences are valid
                        if (value_diffs[t - spk_len] != st_var.mdi) and \
                                (value_diffs[t - spk_len] != st_var.fdi) and \
                                (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi):

                            # if exceed critical values
                            if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]):

                                # are signs of two differences different
                                if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])):

                                    # are within-spike differences small
                                    if (spk_len == 1) or \
                                            ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.)) or \
                                            ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.) and
                                             (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] - 1, month_locs[t]] / 2.)):

                                        # check if following value is valid
                                        if (value_diffs[t] != st_var.mdi) and \
                                                (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and \
                                                (value_diffs[t] != st_var.fdi):

                                            # and if at least critical value
                                            if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]):

                                                # test if surrounding differences below 1/2 critical value
                                                if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.):
                                                    if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.):

                                                        # set the flags
                                                        flags[t - spk_len + 1:t + 1] = 1

                                                        if plots or diagnostics:
                                                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t - spk_len + 1], good_to_uncompress[t + 1], start, variable, plots=plots)

        station.qc_flags[good_to_uncompress, flag_col[v]] = flags

        flag_locs, = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs), noWrite=diagnostics)  # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        do_interactive = False
        if plots and do_interactive == True:
            import matplotlib.pyplot as plt
            plot_times = utils.times_hours_to_datetime(station.time.data, start)

            plt.clf()
            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            flg = np.where(flags[:, flag_col[v]] == 1)
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)
            plt.show()

    station = utils.append_history(station, "Spike Check")

    return  # sc
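# A minimal sketch of how sc derives its spike thresholds above: for one
# calendar month and one time separation at a time, take the interquartile
# range of the first differences and use a multiple of it (6x, as above) as the
# critical jump size. np.percentile stands in for utils.IQR to keep this
# standalone; the dynamic get_critical_values refinement is omitted.
import numpy as np

def critical_value(first_diffs, factor=6.):
    '''Critical spike size from the spread of first differences.'''
    spread = np.percentile(first_diffs, 75) - np.percentile(first_diffs, 25)
    if spread == 0.:
        return None  # constant differences cannot set a threshold
    return factor * spread

# hourly differences mostly within +/-1 give a threshold of about 6
example_diffs = np.random.uniform(-1., 1., 500)
threshold = critical_value(example_diffs)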
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, idl=False):
    '''
    Climatological outlier check - compare hourly anomalies from the monthly mean
    diurnal cycle against a fitted Gaussian, flagging values beyond the threshold
    (firm flags = 1, tentative flags = 2).

    :param object station: station object
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: extra verbose information
    :param bool plots: do plots
    :param bool idl: match IDL output
    '''

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)

        # is this needed 13th Nov 2014 RJHD
        # reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        for month in range(12):

            hourly_climatologies = np.zeros(24)
            hourly_climatologies.fill(st_var.mdi)

            # append all e.g. Januaries together
            this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:, month, :], st_var.data, hours=True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:, month, :], all_filtered, hours=True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs
            this_month = np.ma.array(this_month)
            this_month = this_month.reshape(-1, 24)

            this_month_filtered = np.ma.array(this_month_filtered)
            this_month_filtered = this_month_filtered.reshape(-1, 24)

            # get hourly climatology for each month
            for hour in range(24):

                this_hour = this_month[:, hour]

                # need to have data if this is going to work!
                if len(this_hour.compressed()) > 0:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl=idl)
                        hourly_climatologies[hour] = np.ma.sum(this_hour) / (len(this_hour) - 1)
                    else:
                        this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl=idl)
                        hourly_climatologies[hour] = np.ma.mean(this_hour)

            if len(this_month.compressed()) > 0:
                # can get stations with few obs in a particular variable.

                # anomalise each hour over month appropriately
                anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0], 1))
                anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0], 1))

                if len(anomalies.compressed()) >= 10:
                    iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2.  # to match IDL
                    if iqr < 1.5:
                        iqr = 1.5
                else:
                    iqr = st_var.mdi

                normed_anomalies = anomalies / iqr
                normed_anomalies_filtered = anomalies_filtered / iqr

                # get average anomaly for year
                year_ids = np.array(year_ids)
                monthly_vqvs = np.ma.zeros(month_ranges.shape[0])
                monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])]
                for year in range(month_ranges.shape[0]):
                    year_locs = np.where(year_ids == year)
                    this_year = normed_anomalies_filtered[year_locs, :]

                    if len(this_year.compressed()) > 0:
                        # need to have data for this to work!
                        if idl:
                            monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1))
                        else:
                            monthly_vqvs[year] = np.ma.median(this_year)
                    else:
                        monthly_vqvs.mask[year] = True

                # low pass filter
                normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0])

                # copy from distributional_gap.py - refactor!
                # get the threshold value
                bins, bincenters = utils.create_bins(normed_anomalies, 1.)

                hist, binEdges = np.histogram(normed_anomalies, bins=bins)

                gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(normed_anomalies), sig=np.std(normed_anomalies))
                minimum_threshold = round(1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

                if diagnostics:
                    print iqr, minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)
                    print gaussian
                    print hist

                if plots:
                    coc_set_up_plot(bincenters, hist, gaussian, variable, threshold=minimum_threshold, sub_par="observations")

                uppercount = len(np.where(normed_anomalies > minimum_threshold)[0])
                lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0])

                these_flags = station.qc_flags[:, flag_col[v]]
                gap_plot_values, tentative_plot_values = [], []

                # find the gaps and apply the flags
                gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size=1)  # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :], normed_anomalies, these_flags, year_ids,
                                             minimum_threshold, gap_start,
                                             upper=True, plots=plots, gpv=gap_plot_values, tpv=tentative_plot_values)

                gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size=1)  # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :], normed_anomalies, these_flags, year_ids,
                                             minimum_threshold, gap_start,
                                             upper=False, plots=plots, gpv=gap_plot_values, tpv=tentative_plot_values)

                station.qc_flags[:, flag_col[v]] = these_flags

                if uppercount + lowercount > 1000:
                    # print "not sorted spurious stations yet"
                    pass

                if plots:
                    import matplotlib.pyplot as plt
                    hist, binEdges = np.histogram(tentative_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, c='orange', ls='-', label='tentative', where='mid')

                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label='flagged', where='mid')

                    import calendar
                    plt.text(0.1, 0.9, calendar.month_name[month + 1], transform=plt.gca().transAxes)
                    leg = plt.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.2), frameon=False,
                                     prop={'size': 13}, labelspacing=0.15, columnspacing=0.5)
                    plt.setp(leg.get_title(), fontsize=14)
                    plt.show()
                    # plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite=True)
            print "where\n"
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags, noWrite=True)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags, noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]))
            logfile.write("where\n")
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags)

        # firm flags match 030220

    station = utils.append_history(station, "Climatological Check")

    return  # coc
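# A standalone sketch of the anomaly step at the heart of coc: build an hourly
# climatology for one calendar month, subtract it, and normalise by half the
# IQR (floored at 1.5, as above). The winsorizing and IDL-compatible branches
# are omitted for brevity.
import numpy as np

def normed_hourly_anomalies(month_obs):
    '''month_obs: masked array of shape (ndays, 24) for one calendar month.'''
    clim = np.ma.mean(month_obs, axis=0)         # hourly climatology
    anomalies = month_obs - clim[np.newaxis, :]  # remove the diurnal cycle
    spread = (np.percentile(anomalies.compressed(), 75) -
              np.percentile(anomalies.compressed(), 25)) / 2.
    if spread < 1.5:
        spread = 1.5
    return anomalies / spread

# e.g. 30 days of a clean diurnal cycle plus noise gives small normed anomalies
obs = np.ma.masked_invalid(20. + 5. * np.sin(2. * np.pi * np.arange(24) / 24.) + np.random.randn(30, 24))
normed = normed_hourly_anomalies(obs)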
def evc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, idl=False):
    '''
    Excess variance check - flag individual months in years where the variance of
    the hourly anomalies is much larger than in the remaining years, unless a storm
    or deep low-pressure system is the likely cause.

    :param object station: station object
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: extra verbose information
    :param bool plots: do plots
    :param bool idl: match IDL output
    '''
    if plots or diagnostics:
        import matplotlib.pyplot as plt
        import calendar

    # very similar to climatological check - ensure that not duplicating

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        reporting_freq = utils.reporting_frequency(utils.apply_filter_flags(st_var))

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        month_data_count = np.zeros(month_ranges.shape[0:2])

        # for each month
        for month in range(12):

            # set up hourly climatologies
            hourly_clims = np.zeros(24)
            hourly_clims.fill(st_var.data.fill_value)

            this_month, year_ids, month_data_count[:, month] = utils.concatenate_months(month_ranges[:, month, :], st_var.data, hours=True)

            # # extract each year and append together
            # year_ids = [] # counter to determine which year each day corresponds to
            # for year in range(month_ranges.shape[0]):
            #     this_year = st_var.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
            #     if year == 0:
            #         # store so can access each hour of day separately
            #         this_month = this_year.reshape(-1,24)
            #         year_ids = [year for x in range(this_month.shape[0])]
            #         month_data_count[year,month] = len(this_year.compressed())
            #     else:
            #         this_year = this_year.reshape(-1,24)
            #         this_month = np.ma.concatenate((this_month, this_year), axis = 0)
            #         year_ids.extend([year for x in range(this_year.shape[0])])
            #         month_data_count[year,month] = len(this_year.compressed())

            # winsorize and get hourly climatology
            for h in range(24):

                this_hour = this_month[:, h]

                if len(this_hour.compressed()) > 100:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour_winsorized = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl=idl)
                        hourly_clims[h] = np.ma.sum(this_hour_winsorized) / (len(this_hour_winsorized) - 1)
                    else:
                        this_hour_winsorized = utils.winsorize(this_hour.compressed(), 0.05, idl=idl)
                        hourly_clims[h] = np.ma.mean(this_hour_winsorized)

            hourly_clims = np.ma.masked_where(hourly_clims == st_var.data.fill_value, hourly_clims)
            anomalies = this_month - np.tile(hourly_clims, (this_month.shape[0], 1))

            # extract IQR of anomalies (using 1/2 value to match IDL)
            if len(anomalies.compressed()) >= 10:
                iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2.  # to match IDL
                if iqr < 1.5:
                    iqr = 1.5
            else:
                iqr = st_var.mdi

            normed_anomalies = anomalies / iqr

            variances = np.ma.zeros(month_ranges.shape[0])
            variances.mask = [False for i in range(month_ranges.shape[0])]
            rep_accuracies = np.zeros(month_ranges.shape[0])
            rep_freqs = np.zeros(month_ranges.shape[0])

            variances.fill(st_var.mdi)
            rep_accuracies.fill(st_var.mdi)
            rep_freqs.fill(st_var.mdi)

            year_ids = np.array(year_ids)

            # extract variance of normalised anomalies for each year
            for y, year in enumerate(range(month_ranges.shape[0])):

                year_locs = np.where(year_ids == y)

                this_year = normed_anomalies[year_locs, :]
                this_year = this_year.reshape(-1)

                # end of similarity with Climatological check

                if len(this_year.compressed()) >= 30:

                    variances[y] = utils.mean_absolute_deviation(this_year, median=True)

                    rep_accuracies[y] = utils.reporting_accuracy(this_year)
                    rep_freqs[y] = utils.reporting_frequency(this_year)
                else:
                    variances.mask[y] = True

            good = np.where(month_data_count[:, month] >= 100)

            # get median and IQR of variance for all years for this month
            if len(good[0]) >= 10:

                median_variance = np.median(variances[good])

                iqr_variance = utils.IQR(variances[good]) / 2.  # to match IDL

                if iqr_variance < 0.01:
                    iqr_variance = 0.01
            else:
                median_variance = st_var.mdi
                iqr_variance = st_var.mdi

            # if SLP, then get median and MAD of SLP and windspeed for month
            if variable in ["slp", "windspeeds"]:

                winds = getattr(station, "windspeeds")
                slp = getattr(station, "slp")

                # refactor this as similar in style to how target data extracted
                for y, year in enumerate(range(month_ranges.shape[0])):

                    if y == 0:
                        winds_year = winds.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                        winds_month = winds_year.reshape(-1, 24)

                        slp_year = slp.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                        slp_month = slp_year.reshape(-1, 24)
                    else:
                        winds_year = winds.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                        winds_year = winds_year.reshape(-1, 24)
                        winds_month = np.ma.concatenate((winds_month, winds_year), axis=0)

                        slp_year = slp.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                        slp_year = slp_year.reshape(-1, 24)
                        slp_month = np.ma.concatenate((slp_month, slp_year), axis=0)

                median_wind = np.ma.median(winds_month)
                median_slp = np.ma.median(slp_month)

                wind_MAD = utils.mean_absolute_deviation(winds_month.compressed())
                slp_MAD = utils.mean_absolute_deviation(slp_month.compressed())

                if diagnostics:
                    print "median windspeed {} m/s, MAD = {}".format(median_wind, wind_MAD)
                    print "median slp {} hPa, MAD = {}".format(median_slp, slp_MAD)

            # now test to see if variance exceeds expected range
            for y, year in enumerate(range(month_ranges.shape[0])):

                if (variances[y] != st_var.mdi) and (iqr_variance != st_var.mdi) and \
                        (median_variance != st_var.mdi) and (month_data_count[y, month] >= DATA_COUNT_THRESHOLD):

                    # if SLP, then need to test if deep low pressure ("hurricane/storm") present
                    # as this will increase the variance for this month + year
                    if variable in ["slp", "windspeeds"]:

                        iqr_threshold = 6.

                        # increase threshold if reporting frequency and resolution of this
                        # year doesn't match average
                        if (rep_accuracies[y] != reporting_resolution) and \
                                (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 8.

                        if diagnostics:
                            print np.abs(variances[y] - median_variance) / iqr_variance, variances[y], median_variance, iqr_variance, iqr_threshold, month + 1, year + start.year

                        if np.abs((variances[y] - median_variance) / iqr_variance) > iqr_threshold:

                            # check for storms
                            winds_month = winds.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                            slp_month = slp.data[month_ranges[year, month][0]:month_ranges[year, month][1]]

                            storm = False
                            if (len(winds_month.compressed()) >= 1) and (len(slp_month.compressed()) >= 1):
                                # find max wind & min SLP
                                # max_wind_loc = np.where(winds_month == np.max(winds_month))[0][0]
                                # min_slp_loc = np.where(slp_month == np.min(slp_month))[0][0]
                                # if these are above thresholds and within one day of each other,
                                # then it likely was a storm
                                # print "fix this in case of multiple max/min locations"
                                # if (np.abs(max_wind_loc - min_slp_loc) <= 24) and \
                                #     (((np.max(winds_month) - median_wind) / wind_MAD) > MAD_THRESHOLD) and \
                                #     (((median_slp - np.min(slp_month)) / slp_MAD) > MAD_THRESHOLD):

                                # locations where winds greater than threshold
                                high_winds, = np.where((winds_month - median_wind) / wind_MAD > MAD_THRESHOLD)
                                # and where SLP less than threshold
                                low_slps, = np.where((median_slp - slp_month) / slp_MAD > MAD_THRESHOLD)

                                # if any locations match, then it's a storm
                                match_loc = high_winds[np.in1d(high_winds, low_slps)]

                                if len(match_loc) > 0:
                                    storm = True
                            else:
                                print "write spurious"

                            # check the SLP first difference series
                            # to ensure a drop down and climb out of minimum SLP
                            # or climb up and down from maximum wind speed
                            if variable == "slp":
                                diffs = np.diff(slp_month.compressed())
                            elif variable == "windspeeds":
                                diffs = np.diff(winds_month.compressed())

                            negs, poss = 0, 0
                            biggest_neg, biggest_pos = 0, 0

                            for diff in diffs:

                                if diff > 0:
                                    if negs > biggest_neg: biggest_neg = negs
                                    negs = 0
                                    poss += 1
                                else:
                                    if poss > biggest_pos: biggest_pos = poss
                                    poss = 0
                                    negs += 1

                            if (biggest_neg < 10) and (biggest_pos < 10) and not storm:

                                # not a hurricane, so mask
                                station.qc_flags[month_ranges[year, month, 0]:month_ranges[year, month, 1], flag_col[v]] = 1
                                if plots or diagnostics:
                                    print "No Storm or Hurricane in %i %i - flagging\n" % (month + 1, y + start.year)
                                else:
                                    logfile.write("No Storm or Hurricane in %i %i - flagging\n" % (month + 1, y + start.year))
                            else:
                                # hurricane
                                if plots or diagnostics:
                                    print "Storm or Hurricane in %i %i - not flagging\n" % (month + 1, y + start.year)
                                else:
                                    logfile.write("Storm or Hurricane in %i %i - not flagging\n" % (month + 1, y + start.year))

                            if plots:
                                # plot showing the pressure, pressure first differences and the wind speeds
                                plot_times = utils.times_hours_to_datetime(station.time.data[month_ranges[year, month][0]:month_ranges[year, month][1]], start)

                                evc_plot_slp_wind(plot_times, slp_month, diffs, median_slp, slp_MAD, winds_month, median_wind, wind_MAD)

                    else:
                        iqr_threshold = 8.

                        if (rep_accuracies[y] != reporting_resolution) and \
                                (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 10.

                        if np.abs(variances[y] - median_variance) / iqr_variance > iqr_threshold:

                            if diagnostics:
                                print "flagging {} {}".format(year + start.year, calendar.month_name[month + 1])
                            # remove the data
                            station.qc_flags[month_ranges[year, month, 0]:month_ranges[year, month, 1], flag_col[v]] = 1

            if plots:
                plot_variances = (variances - median_variance) / iqr_variance
                plot_variances = np.ma.masked_where(month_data_count[:, month] < DATA_COUNT_THRESHOLD, plot_variances)

                evc_plot_hist(plot_variances, iqr_threshold, "Variance Check - %s - %s" % (variable, calendar.month_name[month + 1]))

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 for T, D and SLP 21/8/2014

    station = utils.append_history(station, "Excess Variance Check")

    return  # evc
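# An isolated sketch of the storm screen used by evc above: a deep low needs a
# long run of falling pressure followed by a long run of rising pressure, so
# count the longest runs of same-signed first differences. The thresholds
# mirror the "< 10" test in the code.
import numpy as np

def longest_runs(series):
    '''Longest consecutive negative and positive runs in the first differences.'''
    diffs = np.diff(series)
    longest_neg, longest_pos, negs, poss = 0, 0, 0, 0
    for diff in diffs:
        if diff > 0:
            longest_neg = max(longest_neg, negs)
            negs = 0
            poss += 1
        else:
            longest_pos = max(longest_pos, poss)
            poss = 0
            negs += 1
    return max(longest_neg, negs), max(longest_pos, poss)

# a smooth 12-step drop and recovery in SLP registers as two long runs
example_slp = np.concatenate([np.linspace(1010, 980, 13), np.linspace(980, 1010, 13)])
neg_run, pos_run = longest_runs(example_slp)  # both >= 10, so storm-like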
def neighbour_checks(station_info, restart_id="", end_id="", distances=np.array([]), angles=np.array([]), second=False, masking=False, doZip=False, plots=False, diagnostics=False):
    """
    Run through neighbour checks on list of stations passed

    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param str restart_id: station ID to restart a truncated run from
    :param str end_id: station ID to end a truncated run at
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations
    :param bool doZip: gzip the output files
    :param bool plots: do plots
    :param bool diagnostics: extra verbose output
    """
    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncate the array
    neighbour_elevations = np.array(station_info[:, 3], dtype=float)
    neighbour_ids = np.array(station_info[:, 0])
    neighbour_info = np.array(station_info[:, :])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex: endindex + 1]
            distances = distances[startindex:endindex + 1, :]
            angles = angles[startindex:endindex + 1, :]
        else:
            station_info = station_info[startindex:]
            distances = distances[startindex:, :]
            angles = angles[startindex:, :]
    else:
        station_info = station_info[startindex:]
        distances = distances[startindex:, :]
        angles = angles[startindex:, :]

    # process each station
    for st, stat in enumerate(station_info):

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a')  # append to file if second iteration
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:

            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:

            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # select neighbours
        neighbour_distances = distances[st, :]
        neighbour_bearings = angles[st, :]

        # have to add in start index so that can use location in distance file
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st + startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour", "Distance", "Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n])
        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour", "Distance", "Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                # but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second=second, diagnostics=diagnostics, plots=plots)

                    # now read in final set of neighbours and process

                    neigh_flags = np.zeros(len(station.time.data))  # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data))  # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data))  # number of neighbours with DPD set at each time stamp
                    reporting_accuracies = np.zeros(len(neighbours))  # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)])  # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id=False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance=neighbour_distances[nn_loc], diagnostics=diagnostics, plots=plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh, variable).data)

                        dpd_flags += neigh.qc_flags[:, 31]
                    # gone through all neighbours

                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3), (neigh_flags[some_flags].astype("float") / neigh_count[some_flags] > 2. / 3.)))

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1
                    else:
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))

                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots=plots, diagnostics=diagnostics)

                else:
                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))

            # variable loop
        else:
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")

        # clean up months
        qc_tests.clean_up.clu(station, ["temperatures", "dewpoints", "slp", "windspeeds", "winddirs"], [44, 45, 46, 47, 48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots=plots, diagnostics=diagnostics)

        if diagnostics or plots: raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile, FLAG_COL_DICT)

            # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            logfile.close()

    # looped through all stations

    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):
            if first:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask.nc")])
            elif second:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal2.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external2.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask2.nc")])

    print "Neighbour Checks completed\n"

    return  # neighbour_checks
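# A minimal sketch of the neighbour voting rule applied above: an observation
# is an outlier when at least three neighbours report at that time stamp and
# more than two thirds of them flag it; with fewer than three reporting
# neighbours the point is only marked as unchecked (-1).
import numpy as np

def neighbour_vote(neigh_flags, neigh_count):
    '''Return +1 outlier / -1 unchecked / 0 pass for each time stamp.'''
    votes = np.zeros(len(neigh_flags), dtype=int)
    flagged, = np.where(neigh_flags > 0)
    enough = neigh_count[flagged] >= 3
    majority = neigh_flags[flagged].astype(float) / neigh_count[flagged] > 2. / 3.
    votes[flagged[np.logical_and(enough, majority)]] = 1
    votes[flagged[~enough]] = -1
    return votes

# 3 of 4 neighbours agreeing trips the flag; 1 of 2 cannot be checked
votes = neighbour_vote(np.array([3, 1, 0]), np.array([4, 2, 4]))  # -> [1, -1, 0]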
def occ(station, variable_list, flag_col, datastart, logfile, diagnostics=False, plots=False, second=False):
    '''
    Check for odd clusters of data surrounded by missing values - clusters of up
    to 6 observations within 24hr, with at least 48hr of missing data on each side

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime datastart: dataset start time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: do extra verbose output
    :param bool plots: do plots
    :param bool second: run for second time
    :returns:
    '''

    # the four options of what to do with each observation
    # the dict values are subroutines, selected by the current state and called below
    # all subroutines have to take the same set of inputs
    options = {0: occ_normal, 1: occ_start_cluster, 2: occ_in_cluster, 3: occ_after_cluster}

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        filtered_data = utils.apply_filter_flags(st_var)

        var_flags = station.qc_flags[:, flag_col[v]]

        prev_flag_number = 0
        if second:
            # count currently existing flags:
            prev_flag_number = len(var_flags[var_flags != 0])

        # using IDL copy as method to ensure reproducibility (initially)
        oc_details = OddCluster(st_var.mdi, st_var.mdi, 0, st_var.mdi, st_var.mdi, -1)

        obs_type = 1

        for time in station.time.data:

            if filtered_data.mask[time] == False:
                # process observation point using subroutines, called from the dict above

                if plots and (obs_type == 3) and (time - oc_details.end >= 48):
                    # do plotting if matches flagging criteria
                    oc_plots(station, oc_details, time, datastart, filtered_data, variable)

                oc_details, obs_type = options[obs_type](oc_details, obs_type, time, var_flags)
            else:
                # have missing data
                if obs_type == 2:
                    obs_type = 3
                elif obs_type == 0:
                    obs_type = 1

        station.qc_flags[:, flag_col[v]] = var_flags

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Odd Cluster", variable, len(flag_locs[0]) - prev_flag_number, noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Odd Cluster", variable, len(flag_locs[0]) - prev_flag_number)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    # matches 032070 temperature 26/8/2014
    station = utils.append_history(station, "Isolated Odd Cluster Check")

    return  # occ
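# A toy walk-through of the dispatch-table pattern occ drives above: observed
# points are routed through a dict of handlers keyed on the current state, and
# missing points move state 2 (in cluster) to 3 (after cluster) and 0 (normal)
# to 1 (possible cluster start). These lambda handlers just return an assumed
# next state; the real ones (occ_normal etc.) also inspect time gaps and update
# the OddCluster details and flags, so this is a simplification.
handlers = {0: lambda t: 0,   # normal data: stay normal
            1: lambda t: 2,   # data after a long gap: start a candidate cluster
            2: lambda t: 2,   # still inside the candidate cluster
            3: lambda t: 0}   # data resumed after the cluster: back to normal

state = 1
for obs_present in [True, True, False, False, True]:  # a short synthetic record
    if obs_present:
        state = handlers[state](None)
    elif state == 2:
        state = 3
    elif state == 0:
        state = 1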
def dmc(station, variable_list, full_variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False):
    '''
    Method copied from check_duplicates.pro

    :param obj station: station object with suitable attributes (see netcdf_procs.py)
    :param list variable_list: list of netcdf variables to process
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column to set in flag array
    :param datetime start: data start
    :param datetime end: data end
    :param file logfile: logfile to store outputs
    :param bool diagnostics: extra verbosity
    :param bool plots: do plots
    '''
    MIN_DATA_REQUIRED = 20  # obs per month

    # get array of Nx2 start/end pairs
    month_ranges = utils.month_starts_in_pairs(start, end)

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # double loop structure - not ideal
        duplicated = np.zeros(len(month_ranges))

        for sm, source_month in enumerate(month_ranges):

            if diagnostics:
                print "Month %i of %i" % (sm + 1, len(month_ranges))

            source_data = st_var.data[source_month[0]:source_month[1]]

            if duplicated[sm] == 0:
                # don't repeat if already a duplicated month

                for tm, target_month in enumerate(month_ranges[sm + 1:]):

                    target_data = st_var.data[target_month[0]:target_month[1]]

                    # match the data periods
                    overlap = np.min([len(source_data), len(target_data)])
                    s_data, t_data = source_data[:overlap], target_data[:overlap]
                    s_valid, t_valid = np.where(s_data.compressed() != st_var.fdi), \
                        np.where(t_data.compressed() != st_var.fdi)

                    # if enough of an overlap
                    if (len(s_valid[0]) >= MIN_DATA_REQUIRED) and \
                            (len(t_valid[0]) >= MIN_DATA_REQUIRED):

                        if len(s_valid[0]) < len(t_valid[0]):
                            duplicated = duplication_test(source_data, target_data, s_valid, sm, tm, source_month, target_month, duplicated, diagnostics, station.qc_flags, flag_col[v])
                        else:
                            # swap the list of valid points
                            duplicated = duplication_test(source_data, target_data, t_valid, sm, tm, source_month, target_month, duplicated, diagnostics, station.qc_flags, flag_col[v])

                        if plots:
                            dmc_plot(s_data, t_data, start, source_month[0], target_month[0], st_var.name)

                # target month
        # source month

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile, "Duplicate Month", variable, len(flag_locs[0]), noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1
    # variable list

    utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Duplicate Months", plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Duplicate Months Check")

    return  # dmc
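# A self-contained sketch of the month-against-month comparison dmc performs:
# truncate the two months to a common length and test whether every valid pair
# of values matches. The real duplication_test also handles flagged values and
# writes into the qc_flags array; NaN stands in here for missing data.
import numpy as np

def months_duplicated(source, target, min_required=20):
    '''True if the overlapping, valid parts of two monthly series are identical.'''
    overlap = min(len(source), len(target))
    s, t = source[:overlap], target[:overlap]
    valid = ~(np.isnan(s) | np.isnan(t))
    if valid.sum() < min_required:  # mirrors MIN_DATA_REQUIRED above
        return False
    return bool(np.all(s[valid] == t[valid]))

january = np.array([2.5, 3.0, np.nan, 4.0] * 10)
suspect = np.array([2.5, 3.0, 9.9, 4.0] * 10)  # differs only where january is missing
# months_duplicated(january, suspect) -> True, so this pair would be flagged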
def do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, start, logfile, plots=False, diagnostics=False):
    '''
    Set up and run the unflagging process for the specified tests

    :param MetVar station: station object
    :param string variable: variable to process
    :param array all_data: array containing all neighbour obs for full time period
    :param array reporting_accuracies: reporting accuracy for each neighbour
    :param array neigh_count: number of neighbours with data at each time stamp
    :param array dpd_flags: number of neighbours that have DPD set at each time stamp
    :param dict FLAG_COL_DICT: look-up dictionary from variable to its flag-array columns
    :param datetime start: start of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    :param bool diagnostics: extra verbose output
    '''

    # unflagging using neighbours

    # np.ma.median is known to be slow, hence bn_median - see:
    #   https://github.com/astropy/ccdproc/issues/74
    #   https://github.com/astropy/ccdproc/blob/122cdbd5713140174f057eaa8fdb6f9ce03312df/docs/ccdproc/bottleneck_example.rst
    mean_of_neighbours = bn_median(all_data, axis=0)
    std_of_neighbours = median_absolute_deviation(all_data, axis=0)

    # where the spread of neighbour observations is less than 1/2
    # of the maximum reporting accuracy, use that value as a floor
    std_of_neighbours[std_of_neighbours < 0.5 * max(reporting_accuracies)] = 0.5 * max(reporting_accuracies)

    # create series of normalised differences of obs from neighbour mean
    st_var = getattr(station, variable)
    normalised_differences = np.ma.abs(st_var.data - mean_of_neighbours) / std_of_neighbours

    for qc_test in ["climatological", "gap", "odd", "dpd"]:

        if qc_test == "dpd" and variable == "dewpoints":
            flags = station.qc_flags[:, UNFLAG_COL_DICT[qc_test][variable]]
            unset_locs = unflagging_locs(normalised_differences, flags, neigh_count, dpd_count=dpd_flags)
        elif qc_test == "dpd":
            # only unflag DPD on dewpoints
            continue
        elif qc_test == "gap" and variable != "slp":
            # only unflag gap check on slp observations
            continue
        else:
            flags = station.qc_flags[:, UNFLAG_COL_DICT[qc_test][variable]]
            if qc_test == "gap" or qc_test == "climatological":
                # only tentative flags
                unset_locs = unflagging_locs(normalised_differences, flags, neigh_count, flag_value=2)
            else:
                unset_locs = unflagging_locs(normalised_differences, flags, neigh_count)

        if len(unset_locs) > 0:
            station.qc_flags[unset_locs, UNFLAG_COL_DICT[qc_test][variable]] = 0

            # need to unflag attribute if and only if no other flags are set
            subset_flags = station.qc_flags[:, FLAG_COL_DICT[variable]]
            total_flags = np.sum(subset_flags[unset_locs, :], axis=1)
            clean_locs = np.where(total_flags == 0)
            st_var.flags[unset_locs[clean_locs]] = 0

        # and print result
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Unflagging " + qc_test, variable, len(unset_locs), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Unflagging " + qc_test, variable, len(unset_locs))

        if plots:
            if len(unset_locs) > 0:
                plot_outlier(station, variable, unset_locs, all_data, start)

    station = utils.append_history(station, "Unflagging - " + variable)

    return # do_unflagging
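# bn_median() and median_absolute_deviation() are provided elsewhere in the
# codebase; bn_median is assumed to be a bottleneck-backed stand-in for the
# slow np.ma.median, along the lines of the ccdproc example linked above.
# A minimal sketch (requires the bottleneck package; not necessarily the
# exact helper used here):

import bottleneck as bn

def _sketch_bn_median(masked_array, axis=None):
    # fill masked entries with NaN so bottleneck's nanmedian skips them
    data = masked_array.filled(fill_value=np.NaN)
    med = bn.nanmedian(data, axis=axis)
    # re-mask any slices that were entirely missing
    return np.ma.array(med, mask=np.isnan(med))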
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    '''
    Climatological outlier check - flag observations far from the hourly
    climatology for their calendar month

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: do extra verbose output
    :param bool plots: do plots
    :param bool idl: match IDL behaviour

    :returns:
    '''

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)

        # is this needed 13th Nov 2014 RJHD
        #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        for month in range(12):

            hourly_climatologies = np.zeros(24)
            hourly_climatologies.fill(st_var.mdi)

            # append all e.g. Januaries together
            this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:, month, :], st_var.data, hours = True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:, month, :], all_filtered, hours = True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs
            this_month = np.ma.array(this_month)
            this_month = this_month.reshape(-1, 24)

            this_month_filtered = np.ma.array(this_month_filtered)
            this_month_filtered = this_month_filtered.reshape(-1, 24)

            # get hourly climatology for each month
            for hour in range(24):

                this_hour = this_month[:, hour]

                # need to have data if this is going to work!
                if len(this_hour.compressed()) > 0:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.sum(this_hour) / (len(this_hour) - 1)
                    else:
                        this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.mean(this_hour)

            if len(this_month.compressed()) > 0:
                # can get stations with few obs in a particular variable

                # anomalise each hour over month appropriately
                anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0], 1))
                anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0], 1))

                if len(anomalies.compressed()) >= 10:
                    iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2. # to match IDL
                    if iqr < 1.5:
                        iqr = 1.5
                else:
                    iqr = st_var.mdi

                normed_anomalies = anomalies / iqr
                normed_anomalies_filtered = anomalies_filtered / iqr

                # get average anomaly for year
                year_ids = np.array(year_ids)
                monthly_vqvs = np.ma.zeros(month_ranges.shape[0])
                monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])]
                for year in range(month_ranges.shape[0]):
                    year_locs = np.where(year_ids == year)
                    this_year = normed_anomalies_filtered[year_locs, :]

                    if len(this_year.compressed()) > 0:
                        # need to have data for this to work!
                        if idl:
                            monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1))
                        else:
                            monthly_vqvs[year] = np.ma.median(this_year)
                    else:
                        monthly_vqvs.mask[year] = True

                # low pass filter
                normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0])

                # copy from distributional_gap.py - refactor!
                # get the threshold value
                bins, bincenters = utils.create_bins(normed_anomalies, 1.)

                hist, binEdges = np.histogram(normed_anomalies, bins = bins)

                gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(normed_anomalies), sig = np.std(normed_anomalies))
                minimum_threshold = round(1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

                if diagnostics:
                    print iqr, minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)
                    print gaussian
                    print hist

                if plots:
                    coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = minimum_threshold, sub_par = "observations")

                uppercount = len(np.where(normed_anomalies > minimum_threshold)[0])
                lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0])

                these_flags = station.qc_flags[:, flag_col[v]]
                gap_plot_values, tentative_plot_values = [], []

                # find the gaps and apply the flags

                gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size = 1) # in DGC it is 2
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :], normed_anomalies, these_flags, year_ids,
                                             minimum_threshold, gap_start,
                                             upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size = 1) # in DGC it is 2
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :], normed_anomalies, these_flags, year_ids,
                                             minimum_threshold, gap_start,
                                             upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                station.qc_flags[:, flag_col[v]] = these_flags

                if uppercount + lowercount > 1000:
                    #print "not sorted spurious stations yet"
                    pass

                if plots:
                    import matplotlib.pyplot as plt
                    hist, binEdges = np.histogram(tentative_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, c = 'orange', ls = '-', label = 'tentative', where = 'mid')

                    hist, binEdges = np.histogram(gap_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where = 'mid')

                    import calendar
                    plt.text(0.1, 0.9, calendar.month_name[month + 1], transform = plt.gca().transAxes)
                    leg = plt.legend(loc = 'lower center', ncol = 4, bbox_to_anchor = (0.5, -0.2), frameon = False, prop = {'size':13}, labelspacing = 0.15, columnspacing = 0.5)
                    plt.setp(leg.get_title(), fontsize = 14)

                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite = True)
            print "where\n"
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags, noWrite = True)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags, noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]))
            logfile.write("where\n")
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags)

    # firm flags match 030220

    station = utils.append_history(station, "Climatological Check")

    return # coc
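# utils.invert_gaussian() lives in the shared utilities; as a worked
# illustration of the threshold derivation in coc() above, inverting the
# fitted curve p(x) = A * exp(-(x - mu)**2 / (2 * sig**2)) for the
# positive-side x at which it falls to a given frequency y gives
#   x = mu + sig * sqrt(-2 * ln(y / A))
# A hypothetical standalone helper, parameterised like the fit above
# (amplitude, mean, width):

def _sketch_invert_gaussian(y, amplitude, mu, sig):
    # distance beyond the mean at which the Gaussian drops to height y
    return mu + sig * np.sqrt(-2. * np.log(y / amplitude))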