def neighbour_outlier(target_station, initial_neighbours, variable,
                      diagnostics=False, plots=False, full=False):
    """
    Works on a single station and variable.  Reads in neighbours' data,
    finds locations where sufficiently many are sufficiently different.

    :param Station target_station: station to run on
    :param array initial_neighbours: input neighbours (ID, distance) pairs
    :param str variable: obs variable being run on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values.
    """
    station_list = utils.get_station_list()

    # if sufficient
    n_neighbours = len(np.where(initial_neighbours[:, 0] != "-")[0]) - 1
    if n_neighbours < utils.MIN_NEIGHBOURS:
        print("{} has insufficient neighbours ({}<{})".format(
            target_station.id, n_neighbours, utils.MIN_NEIGHBOURS))

    else:
        #*************************
        # extract target observations
        obs_var = getattr(target_station, variable)
        flags = np.array(["" for i in range(obs_var.data.shape[0])]).astype("<U10")

        #*************************
        # read in the neighbour (buddy) data
        all_buddy_data = np.ma.zeros([len(initial_neighbours[:, 0]),
                                      len(target_station.times)])
        all_buddy_data.mask = np.ones(all_buddy_data.shape)

        for bid, buddy_id in enumerate(initial_neighbours[:, 0]):

            if buddy_id == target_station.id:
                # first entry is self
                continue
            if buddy_id == "-":
                # end of the list of buddies
                break

            if diagnostics:
                print("{}/{} {}".format(bid, len(initial_neighbours[:, 0]), buddy_id))

            # set up station object to hold information
            buddy_idx, = np.where(station_list.id == buddy_id)
            buddy = utils.Station(buddy_id,
                                  station_list.iloc[buddy_idx].latitude.values[0],
                                  station_list.iloc[buddy_idx].longitude.values[0],
                                  station_list.iloc[buddy_idx].elevation.values[0])

            try:
                buddy, buddy_df = io.read_station(
                    os.path.join(setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(buddy_id)),
                    buddy, read_flags=True)

                buddy_var = getattr(buddy, variable)

                # apply flags
                flag_locs, = np.where(buddy_var.flags != "")
                buddy_var.data.mask[flag_locs] = True

            except OSError as e:
                # file missing, move on to next in sequence
                io.write_error(target_station,
                               "File Missing (Buddy, {}) - {}".format(variable, buddy_id))
                continue
            except ValueError as e:
                # some issue in the raw file
                io.write_error(target_station,
                               "Error in input file (Buddy, {}) - {}".format(variable, buddy_id),
                               error=str(e))
                continue

            # match the timestamps of target_station and copy over
            match = np.in1d(target_station.times, buddy.times)
            match_back = np.in1d(buddy.times, target_station.times)

            if True in match and True in match_back:
                # skip if no overlapping times at all!
                all_buddy_data[bid, match] = buddy_var.data[match_back]

        if diagnostics:
            print("All buddies read in")

        #*************************
        # find differences
        differences = all_buddy_data - obs_var.data

        #*************************
        # find spread of differences on monthly basis (with minimum value)
        spreads = np.ma.zeros(differences.shape)

        for month in range(1, 13):
            month_locs = np.where(target_station.months == month)

            for bid, buddy in enumerate(differences):
                if len(differences[bid, month_locs].compressed()) > utils.DATA_COUNT_THRESHOLD:
                    this_spread = utils.spread(differences[bid, month_locs])
                    if this_spread < MIN_SPREAD:
                        spreads[bid, month_locs] = MIN_SPREAD
                    else:
                        spreads[bid, month_locs] = this_spread
                else:
                    spreads[bid, month_locs] = MIN_SPREAD

        spreads.mask = np.copy(differences.mask)

        # store which entries may be sufficient to flag
        dubious = np.ma.zeros(differences.shape)
        dubious.mask = np.copy(differences.mask)

        #*************************
        # adjust for storms
        if variable in ["sea_level_pressure", "station_level_pressure"]:
            distant, = np.where(initial_neighbours[:, 1].astype(int) > 100)
            if len(distant) > 0:
                # find positive and negative differences across neighbours
                positive = np.ma.where(differences[distant] > spreads[distant] * SPREAD_LIMIT)
                negative = np.ma.where(differences[distant] < -spreads[distant] * SPREAD_LIMIT)

                # spin through each neighbour
                for dn, dist_neigh in enumerate(distant):

                    pos, = np.where(positive[0] == dn)
                    neg, = np.where(negative[0] == dn)

                    if len(neg) > 0:
                        ratio = len(neg) / (len(pos) + len(neg))
                        if ratio > 0.667:
                            # majority negative, only flag the positives [definitely not storms]
                            dubious[dist_neigh, positive[1][pos]] = 1

            else:
                # all stations close by so storms shouldn't affect, include all
                # note where differences exceed the spread
                dubious_locs = np.ma.where(np.ma.abs(differences) > spreads * SPREAD_LIMIT)
                dubious[dubious_locs] = 1

        else:
            #*************************
            # note where differences exceed the spread [all non-pressure variables]
            dubious_locs = np.ma.where(np.ma.abs(differences) > spreads * SPREAD_LIMIT)
            dubious[dubious_locs] = 1

        if diagnostics:
            print("cross checks complete - assessing all outcomes")

        #*************************
        # sum across neighbours
        neighbour_count = np.ma.count(differences, axis=0)
        dubious_count = np.ma.sum(dubious, axis=0)

        # flag if large enough fraction (>0.66)
        sufficient, = np.ma.where(dubious_count > 0.66 * neighbour_count)
        flags[sufficient] = "N"

        if plots:
            for flag in sufficient:
                plot_neighbour_flags(target_station.times, flag, obs_var, all_buddy_data)

        # append flags to object
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:
            print("Neighbour Outlier {}".format(obs_var.name))
            print("   Cumulative number of flags set: {}".format(
                len(np.where(flags != "")[0])))

    return # neighbour_outlier
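#*************************
# Illustrative sketch (hypothetical, invented values - not part of the
# original suite): the consensus rule above flags a timestamp once more than
# two-thirds of the neighbours reporting at that time disagree with the
# target.  A minimal, self-contained version of that final step:
def _demo_consensus_rule():
    import numpy as np

    # 4 neighbours x 5 timestamps; 1 = neighbour finds the obs dubious,
    # NaN = neighbour has no data at that timestamp (masked out)
    dubious = np.ma.masked_invalid([[1., 0., 1., np.nan, 0.],
                                    [1., 0., 1., np.nan, 0.],
                                    [1., 1., 0., np.nan, 0.],
                                    [0., 0., 0., 0.,     0.]])
    neighbour_count = np.ma.count(dubious, axis=0)  # neighbours reporting
    dubious_count = np.ma.sum(dubious, axis=0)      # neighbours objecting
    flagged, = np.ma.where(dubious_count > 0.66 * neighbour_count)
    return flagged  # array([0]): only the first timestamp exceeds the 2/3 fraction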
    try:
        station_info = np.genfromtxt(os.path.join(INPUT_FILE_LOCS, STATION_LIST),
                                     dtype=(str))
    except IOError:
        print "station list not found"
        sys.exit()

    all_flag_sums = np.zeros([len(station_info), len(qc_test) + 6])
    all_flag_pct = np.zeros([len(station_info), len(qc_test)])

    Lons = []
    Lats = []

    uk_stns = []

    for st, stat in enumerate(station_info):

        # set up station
        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))
        # if station.id[:2] != "03":
        #     continue
        print st, station.id

        # read attributes and qc_flags
        ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"),
                   station, process_vars, [], diagnostics=diagnostics)

        # sum qc_flags:
        # remove multi-level flagging
        qc_flags = station.qc_flags[:]
def neighbour_checks(station_info, restart_id="", end_id="", distances=np.array([]),
                     angles=np.array([]), second=False, masking=False, doZip=False,
                     plots=False, diagnostics=False):
    """
    Run through neighbour checks on list of stations passed

    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations.
    :param bool doZip: gzip the output files once written
    :param bool plots: create plots from each test
    :param bool diagnostics: print extra material to screen
    """
    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncate the array
    neighbour_elevations = np.array(station_info[:, 3], dtype=float)
    neighbour_ids = np.array(station_info[:, 0])
    neighbour_info = np.array(station_info[:, :])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex: endindex + 1]
            distances = distances[startindex: endindex + 1, :]
            angles = angles[startindex: endindex + 1, :]
        else:
            station_info = station_info[startindex:]
            distances = distances[startindex:, :]
            angles = angles[startindex:, :]
    else:
        station_info = station_info[startindex:]
        distances = distances[startindex:, :]
        angles = angles[startindex:, :]

    # process each neighbour
    for st, stat in enumerate(station_info):

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a')  # append to file if second iteration
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"),
                       station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"),
                       station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # select neighbours
        neighbour_distances = distances[st, :]
        neighbour_bearings = angles[st, :]

        # have to add in start index so that can use location in distance file
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(
            st + startindex, np.float(stat[3]), neighbour_distances,
            neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour", "Distance", "Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n])
        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour", "Distance", "Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                #        but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(
                        station, variable, neighbour_info[neighbours], neighbours,
                        neighbour_distances[neighbours], neighbour_quadrants,
                        NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile,
                        second=second, diagnostics=diagnostics, plots=plots)

                    # now read in final set of neighbours and process

                    neigh_flags = np.zeros(len(station.time.data))  # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data))  # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data))  # number of neighbours at each time stamp
                    reporting_accuracies = np.zeros(len(neighbours))  # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)])  # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]),
                                              float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"),
                                       neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"),
                                       neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [],
                                                       do_input_station_id=False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count,
                                       DATASTART, DATAEND, distance=neighbour_distances[nn_loc],
                                       diagnostics=diagnostics, plots=plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh, variable).data)

                        dpd_flags += neigh.qc_flags[:, 31]
                    # gone through all neighbours

                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)
                    outlier_locs, = np.where(np.logical_and(
                        (neigh_count[some_flags] >= 3),
                        (neigh_flags[some_flags].astype("float") / neigh_count[some_flags] > 2. / 3.)))

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1
                    else:
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))

                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies,
                                          neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART,
                                          logfile, plots=plots, diagnostics=diagnostics)

                else:
                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))

            # variable loop

        else:
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")

        # clean up months
        qc_tests.clean_up.clu(station, ["temperatures", "dewpoints", "slp", "windspeeds", "winddirs"],
                              [44, 45, 46, 47, 48], FLAG_COL_DICT, DATASTART, DATAEND, logfile,
                              plots=plots, diagnostics=diagnostics)

        if diagnostics or plots:
            raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"),
                        station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                        opt_var_list=carry_thru_vars, compressed=match_to_compress,
                        processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"),
                        station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                        opt_var_list=carry_thru_vars, compressed=match_to_compress,
                        processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile, FLAG_COL_DICT)

            # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"),
                            station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                            opt_var_list=carry_thru_vars, compressed=match_to_compress,
                            processing_date='', qc_code_version=qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"),
                            station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                            opt_var_list=carry_thru_vars, compressed=match_to_compress,
                            processing_date='', qc_code_version=qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            logfile.close()

    # looped through all stations

    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):
            if first:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask.nc")])
            elif second:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal2.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external2.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask2.nc")])

    print "Neighbour Checks completed\n"

    return # neighbour_checks
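#*************************
# Illustrative sketch (hypothetical, invented values - not part of the
# original suite): the comment above notes that neighbours are accepted
# within 500 km horizontally and 300 m vertically.  A minimal version of
# that filter; the real selection lives in n_utils.get_all_neighbours and
# additionally balances quadrants.
def _demo_neighbour_limits():
    import numpy as np

    candidate_distances = np.array([12., 480., 650., 90.])     # km from target
    candidate_elevations = np.array([105., 350., 110., 120.])  # metres a.s.l.
    target_elevation = 100.

    close_enough = candidate_distances <= 500.
    level_enough = np.abs(candidate_elevations - target_elevation) <= 300.
    neighbours, = np.where(close_enough & level_enough)
    return neighbours  # array([0, 1, 3]); station 2 is beyond 500 km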
def internal_checks(station_info, restart_id="", end_id="", second=False, all_checks=True,
                    duplicate=False, odd=False, frequent=False, diurnal=False, gap=False,
                    records=False, streaks=False, climatological=False, spike=False,
                    humidity=False, cloud=False, variance=False, winds=False,
                    diagnostics=False, plots=False):
    '''
    Run through internal checks on list of stations passed

    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool second: do the second run
    :param bool all_checks: run all the checks
    :param bool duplicate/odd/frequent/diurnal/gap/records/streaks/climatological/spike/humidity/cloud/variance/winds: run each test separately
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test [many files if all stations/all tests]
    '''
    first = not second

    if all_checks:
        duplicate = True
        odd = True
        frequent = True
        diurnal = True
        gap = True
        records = True
        streaks = True
        climatological = True
        spike = True
        humidity = True
        cloud = True
        variance = True
        winds = True
    else:
        print "single tests selected"

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex: endindex + 1]
        else:
            station_info = station_info[startindex:]
    else:
        station_info = station_info[startindex:]

    for st, stat in enumerate(station_info):

        # if st%100 != 0: continue # do every nth station

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "{:35s} {:d}/{:d}".format("Station Number : ", st + 1, len(station_info))
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if plots or diagnostics:
            logfile = ""
        else:
            if first:
                logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'w')
            elif second:
                logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a')  # append to file if second iteration

            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Internal Checks\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # latitude and longitude check
        if np.abs(station.lat) > 90.:
            if plots or diagnostics:
                print "{} {} {} {} {} {}\n".format(
                    station.id, "Latitude Check", DATASTART.year, DATAEND.year, "All",
                    "Unphysical latitude {}".format(station.lat))
            else:
                logfile.write("{} {} {} {} {} {}\n".format(
                    station.id, "Latitude Check", DATASTART.year, DATAEND.year, "All",
                    "Unphysical latitude {}".format(station.lat)))
                logfile.close()
            continue

        if np.abs(station.lon) > 180.:
            if plots or diagnostics:
                print "{} {} {} {} {} {}\n".format(
                    station.id, "Longitude Check", DATASTART.year, DATAEND.year, "All",
                    "Unphysical longitude {}".format(station.lon))
            else:
                logfile.write("{} {} {} {} {} {}\n".format(
                    station.id, "Longitude Check", DATASTART.year, DATAEND.year, "All",
                    "Unphysical longitude {}".format(station.lon)))
                logfile.close()
            continue

        # if running through the first time
        if first:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + ".nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + ".nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + ".nc"), station, process_vars,
                       opt_var_list=carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

            station.qc_flags = np.zeros([len(station.time.data), 69])  # changed to include updated wind tests

            # get reporting accuracies and frequencies.
            for var in process_vars:
                st_var = getattr(station, var)
                st_var.reporting_stats = utils.monthly_reporting_statistics(st_var, DATASTART, DATAEND)

        # or if second pass through?
        elif second:
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars,
                       opt_var_list=carry_thru_vars, diagnostics=diagnostics)

            print "{:35s} {}\n".format("Total station record size :", len(station.time.data))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # Add history text to netcdf file
        # Reporting Changes - TODO

        # Duplicate months - check on temperature ONLY
        if duplicate:
            qc_tests.duplicate_months.dmc(station, ['temperatures'], process_vars, [0],
                                          DATASTART, DATAEND, logfile,
                                          diagnostics=diagnostics, plots=plots)

        # Odd Clusters
        if odd:
            qc_tests.odd_cluster.occ(station, ['temperatures', 'dewpoints', 'windspeeds', 'slp'],
                                     [54, 55, 56, 57], DATASTART, logfile,
                                     diagnostics=diagnostics, plots=plots, second=second)
            utils.apply_windspeed_flags_to_winddir(station, diagnostics=diagnostics)

        # Frequent Values
        if frequent:
            qc_tests.frequent_values.fvc(station, ['temperatures', 'dewpoints', 'slp'], [1, 2, 3],
                                         DATASTART, DATAEND, logfile,
                                         diagnostics=diagnostics, plots=plots)

        # Diurnal Cycle
        if diurnal:
            if np.abs(station.lat) <= 60.:
                qc_tests.diurnal_cycle.dcc(station, ['temperatures'], process_vars, [4],
                                           logfile, diagnostics=diagnostics, plots=plots)
            else:
                if plots or diagnostics:
                    print "Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(station.lat)
                else:
                    logfile.write("Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(station.lat))

        # Distributional Gap
        if gap:
            qc_tests.distributional_gap.dgc(station, ['temperatures', 'dewpoints', 'slp'], [5, 6, 7],
                                            DATASTART, DATAEND, logfile,
                                            diagnostics=diagnostics, plots=plots, GH=True)

        # Records
        if records:
            qc_tests.records.krc(station, ['temperatures', 'dewpoints', 'windspeeds', 'slp'],
                                 [8, 9, 10, 11], logfile, diagnostics=diagnostics, plots=plots)
            utils.apply_windspeed_flags_to_winddir(station, diagnostics=diagnostics)

        # Streaks and Repetitions
        if streaks:
            qc_tests.streaks.rsc(station, ['temperatures', 'dewpoints', 'windspeeds', 'slp', 'winddirs'],
                                 [[12, 16, 20], [13, 17, 21], [14, 18, 22], [15, 19, 23], [66, 67, 68]],
                                 DATASTART, DATAEND, logfile, diagnostics=diagnostics, plots=plots)
            utils.apply_windspeed_flags_to_winddir(station, diagnostics=diagnostics)

        # Climatological Outlier
        if climatological:
            qc_tests.climatological.coc(station, ['temperatures', 'dewpoints'], [24, 25],
                                        DATASTART, DATAEND, logfile,
                                        diagnostics=diagnostics, plots=plots)
        # column 26 kept spare for slp

        # Spike
        if spike:
            qc_tests.spike.sc(station, ['temperatures', 'dewpoints', 'slp', 'windspeeds'],
                              [27, 28, 29, 65], DATASTART, DATAEND, logfile,
                              diagnostics=diagnostics, plots=plots, second=second)
            utils.apply_windspeed_flags_to_winddir(station, diagnostics=diagnostics)

        # Humidity cross checks
        if humidity:
            qc_tests.humidity.hcc(station, [30, 31, 32], DATASTART, DATAEND, logfile,
                                  diagnostics=diagnostics, plots=plots)

        # Cloud cross check
        if cloud:
            qc_tests.clouds.ccc(station, [33, 34, 35, 36, 37, 38, 39, 40], logfile,
                                diagnostics=diagnostics, plots=plots)

        # Variance
        if variance:
            qc_tests.variance.evc(station, ['temperatures', 'dewpoints', 'slp', 'windspeeds'],
                                  [58, 59, 60, 61], DATASTART, DATAEND, logfile,
                                  diagnostics=diagnostics, plots=plots)
            utils.apply_windspeed_flags_to_winddir(station, diagnostics=diagnostics)

        # Winds
        if winds:
            qc_tests.winds.wdc(station, [62, 63, 64], DATASTART, DATAEND, logfile,
                               diagnostics=diagnostics, plots=plots)

        # are flags actually applied?
        if diagnostics or plots:
            raw_input("stop")

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"),
                        station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                        opt_var_list=carry_thru_vars, compressed=match_to_compress,
                        processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file
            subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, station.id + ".nc")])
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"),
                        station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                        opt_var_list=carry_thru_vars, compressed=match_to_compress,
                        processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file
            subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc")])

        logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
        logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
        logfile.close()

    print "Internal Checks completed\n"

    return # internal_checks
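#*************************
# Illustrative sketch (hypothetical station IDs and coordinates - not part of
# the original suite): how internal_checks might be driven, assuming
# station_info rows of [ID, lat, lon, elev] strings as described in the
# docstring and paths already configured via the module settings.
def _demo_internal_checks_call():
    import numpy as np

    station_info = np.array([["010010-99999", "70.93", "-8.67", "9.0"],
                             ["030050-99999", "60.13", "-1.18", "82.0"]])
    # run only the spike test, starting from the second station, with diagnostics
    internal_checks(station_info, restart_id="030050-99999",
                    all_checks=False, spike=True, diagnostics=True)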
def run_checks(restart_id="", end_id="", diagnostics=False, plots=False, full=False, test="all"):
    """
    Main script.  Reads in station data, populates internal objects and passes to the tests.

    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values.
    :param str test: specify a single test to run (useful for diagnostics)
        [climatological/distribution/diurnal/frequent/humidity/odd_cluster/pressure/
         spike/streaks/timestamp/variance/winds/world_records]
    """
    # process the station list
    station_list = utils.get_station_list(restart_id=restart_id, end_id=end_id)

    station_IDs = station_list.id

    # now spin through each ID in the curtailed list
    for st, station_id in enumerate(station_IDs):
        print("{} {:11s} ({}/{})".format(dt.datetime.now(), station_id, st + 1, station_IDs.shape[0]))

        startT = dt.datetime.now()

        # set up config file to hold thresholds etc
        config_file = os.path.join(setup.SUBDAILY_CONFIG_DIR, "{:11s}.config".format(station_id))

        if full:
            try:
                # recreating, so remove completely
                os.remove(config_file)
            except IOError:
                pass

        #*************************
        # set up the stations
        station = utils.Station(station_id, station_list.latitude[st],
                                station_list.longitude[st], station_list.elevation[st])
        if diagnostics:
            print(station)

        try:
            station, station_df = io.read_station(
                os.path.join(setup.SUBDAILY_MFF_DIR, "{:11s}.mff".format(station_id)), station)
        except OSError as e:
            # file missing, move on to next in sequence
            io.write_error(station, "File Missing")
            continue
        except ValueError as e:
            # some issue in the raw file
            io.write_error(station, "Error in input file", error=str(e))
            continue

        # some may have no data (for whatever reason)
        if station.times.shape[0] == 0:
            if diagnostics:
                print("No data in station {}".format(station.id))
            # scoot onto next station
            io.write_error(station, "No data in input file")
            continue

        #*************************
        # Add the country and continent
        station.country = utils.find_country_code(station.lat, station.lon)
        station.continent = utils.find_continent(station.country)

        #*************************
        """
        HadISD tests and order

        Duplicated months
        Odd Clusters of data - need to address output with buddy checks in due course.
        Frequent Values - tick
        Diurnal Cycle
        Gaps in distributions - tick
        World Records - tick
        Repeated values (streaks or just too common short ones) - partial tick
        Climatology - tick
        Spike - tick
        Humidity Cross checks - super saturation, dewpoint depression,
            dewpoint cut off - tick (dewpoint cut off not applied)
        Cloud logical checks - clouds not in C3S 311a @Aug 2019
        Excess Variance - partial tick
        Winds (logical wind & wind rose) - logical tick.
            Not sure if wind rose is robust enough
        Logical SLP/StnLP - tick
        Precipitation logical checks - precip not in C3S 311a @Aug 2019
        """
        #*************************
        if test in ["all", "logic"]:
            # incl lat, lon and elev checks
            # print("L", dt.datetime.now() - startT)
            good_metadata = qc_tests.logic_checks.lc(
                station, ["temperature", "dew_point_temperature", "station_level_pressure",
                          "sea_level_pressure", "wind_speed", "wind_direction"],
                full=full, plots=plots, diagnostics=diagnostics)

            if good_metadata != 0:
                print("Issue with station metadata")
                # skip on to next one
                continue

        if test in ["all", "odd_cluster"]:
            print("O", dt.datetime.now() - startT)
            # TODO - use suite config file to store all settings for tests
            qc_tests.odd_cluster.occ(
                station, ["temperature", "dew_point_temperature", "station_level_pressure",
                          "sea_level_pressure", "wind_speed"],
                config_file, full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "frequent"]:
            print("F", dt.datetime.now() - startT)
            qc_tests.frequent.fvc(
                station, ["temperature", "dew_point_temperature", "station_level_pressure",
                          "sea_level_pressure"],
                config_file, full=full, plots=plots, diagnostics=diagnostics)

        # HadISD only runs on stations where latitude is lower than 60 (N/S)
        # Takes a long time, this one
        if test in ["all", "diurnal"]:
            print("U", dt.datetime.now() - startT)
            if np.abs(station.lat) < 60:
                qc_tests.diurnal.dcc(station, config_file, full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "distribution"]:
            print("D", dt.datetime.now() - startT)
            qc_tests.distribution.dgc(
                station, ["temperature", "dew_point_temperature", "station_level_pressure",
                          "sea_level_pressure"],
                config_file, full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "world_records"]:
            print("W", dt.datetime.now() - startT)
            qc_tests.world_records.wrc(
                station, ["temperature", "dew_point_temperature", "sea_level_pressure", "wind_speed"],
                full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "streaks"]:
            print("K", dt.datetime.now() - startT)
            qc_tests.streaks.rsc(
                station, ["temperature", "dew_point_temperature", "station_level_pressure",
                          "sea_level_pressure", "wind_speed", "wind_direction"],
                config_file, full=full, plots=plots, diagnostics=diagnostics)

        # not run on pressure data in HadISD.
        if test in ["all", "climatological"]:
            print("C", dt.datetime.now() - startT)
            qc_tests.climatological.coc(station, ["temperature", "dew_point_temperature"],
                                        config_file, full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "timestamp"]:
            print("T", dt.datetime.now() - startT)
            qc_tests.timestamp.tsc(
                station, ["temperature", "dew_point_temperature", "station_level_pressure",
                          "sea_level_pressure", "wind_speed"],
                config_file, full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "spike"]:
            print("S", dt.datetime.now() - startT)
            qc_tests.spike.sc(
                station, ["temperature", "dew_point_temperature", "station_level_pressure",
                          "sea_level_pressure", "wind_speed"],
                config_file, full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "humidity"]:
            print("h", dt.datetime.now() - startT)
            qc_tests.humidity.hcc(station, config_file, full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "variance"]:
            print("V", dt.datetime.now() - startT)
            qc_tests.variance.evc(
                station, ["temperature", "dew_point_temperature", "station_level_pressure",
                          "sea_level_pressure", "wind_speed"],
                config_file, full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "pressure"]:
            print("P", dt.datetime.now() - startT)
            qc_tests.pressure.pcc(station, config_file, full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "winds"]:
            print("w", dt.datetime.now() - startT)
            qc_tests.winds.wcc(station, config_file, fix=True, full=full, plots=plots, diagnostics=diagnostics)

        hfr_vars_set = 0  # default so the output stage below works if the high-flag test is skipped
        if test in ["all", "high_flag"]:
            print("H", dt.datetime.now() - startT)
            hfr_vars_set = qc_tests.high_flag.hfr(
                station, ["temperature", "dew_point_temperature", "station_level_pressure",
                          "sea_level_pressure", "wind_speed", "wind_direction"],
                full=full, plots=plots, diagnostics=diagnostics)

        print(dt.datetime.now() - startT)

        #*************************
        # Insert flags into Data Frame
        # need to insert columns in correct place
        column_names = station_df.columns.values

        #*************************
        # add QC flag columns to each variable
        #    initialise with blank
        #    need to automate the column identification
        new_column_indices = []
        for c, column in enumerate(station_df.columns):
            if column in setup.obs_var_list:
                new_column_indices += [c + 2]  # 2 offset rightwards from variable's column

        # reverse order so can insert without messing up the indices
        new_column_indices.reverse()
        for index in new_column_indices:
            station_df.insert(index, "{}_QC_flag".format(station_df.columns[index - 2]),
                              ["" for i in range(station_df.shape[0])], True)

        # # sort source_ID.x columns - purely for first release
        # for c, column in enumerate(station_df.columns):
        #     if "Source_ID" in column:
        #         # replace the NaN with empty string
        #         station_df[column] = station_df[column].fillna('')
        #         # rename the column
        #         variable = station_df.columns[c-1]
        #         station_df = station_df.rename(columns={column : "{}_Source_ID".format(variable)})

        # write in the flag information
        for var in setup.obs_var_list:
            obs_var = getattr(station, var)
            station_df["{}_QC_flag".format(var)] = obs_var.flags

        #*************************
        # Output of QFF
        # write out the dataframe to output format
        if hfr_vars_set > 1:
            # high flagging rates in more than one variable.  Withholding station completely
            print("{} withheld as too high flagging".format(station.id))
            io.write(os.path.join(setup.SUBDAILY_BAD_DIR, "{:11s}.qff".format(station_id)), station_df)
        else:
            io.write(os.path.join(setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(station_id)), station_df)

        #*************************
        # Output flagging summary file
        io.flag_write(os.path.join(setup.SUBDAILY_FLAG_DIR, "{:11s}.flg".format(station_id)),
                      station_df, diagnostics=diagnostics)

        print(dt.datetime.now() - startT)

        # if diagnostics or plots:
        #     input("end")
        #     break

    return # run_checks
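#*************************
# Illustrative sketch (hypothetical frame - not part of the original suite):
# the reversed-index trick above inserts a "<var>_QC_flag" column two places
# to the right of each observed variable without invalidating the positions
# computed beforehand.  A minimal pandas version, assuming each variable is
# followed by its Source_ID column as the +2 offset implies:
def _demo_insert_flag_columns():
    import pandas as pd

    df = pd.DataFrame({"temperature": [10.2, 10.4],
                       "Source_ID": [1, 1],
                       "wind_speed": [3.1, 2.9],
                       "Source_ID.1": [1, 1]})
    obs_vars = ["temperature", "wind_speed"]

    indices = [c + 2 for c, col in enumerate(df.columns) if col in obs_vars]
    for index in reversed(indices):  # right-to-left keeps earlier indices valid
        df.insert(index, "{}_QC_flag".format(df.columns[index - 2]),
                  ["" for _ in range(df.shape[0])], True)
    return df  # temperature, Source_ID, temperature_QC_flag, wind_speed, ...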
def run_checks(restart_id="", end_id="", diagnostics=False, plots=False, full=False, test="all"):
    """
    Main script.  Reads in station data, populates internal objects and passes to the tests.

    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values.
    :param str test: specify a single test to run (useful for diagnostics)
        [outlier/clean_up/high_flag]
    """
    # process the station list
    station_list = utils.get_station_list(restart_id=restart_id, end_id=end_id)

    station_IDs = station_list.id

    # read in all the neighbours for these stations to hold ready
    all_neighbours = read_neighbours(restart_id=restart_id, end_id=end_id)

    # now spin through each ID in the curtailed list
    for st, target_station_id in enumerate(station_IDs):
        print("{} {} ({}/{})".format(dt.datetime.now(), target_station_id, st + 1, station_IDs.shape[0]))

        startT = dt.datetime.now()

        #*************************
        # set up the stations
        target_station = utils.Station(target_station_id, station_list.latitude[st],
                                       station_list.longitude[st], station_list.elevation[st])
        if diagnostics:
            print(target_station)

        try:
            target_station, target_station_df = io.read_station(
                os.path.join(setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(target_station_id)),
                target_station, read_flags=True)
        except OSError:
            # file missing, move on to next in sequence
            continue

        # some may have no data (for whatever reason)
        if target_station.times.shape[0] == 0:
            if diagnostics:
                print("No data in station {}".format(target_station.id))
            # scoot onto next station
            continue

        # extract neighbours for this station
        nloc, = np.where(all_neighbours[:, 0, 0] == target_station_id)
        initial_neighbours = all_neighbours[nloc].squeeze()

        #*************************
        # TODO: refine neighbours [quadrants, correlation?]

        if test in ["all", "outlier"]:
            print("N", dt.datetime.now() - startT)
            qc_tests.neighbour_outlier.noc(
                target_station, initial_neighbours,
                ["temperature", "dew_point_temperature", "wind_speed",
                 "station_level_pressure", "sea_level_pressure"],
                full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "clean_up"]:
            print("U", dt.datetime.now() - startT)
            qc_tests.clean_up.mcu(
                target_station,
                ["temperature", "dew_point_temperature", "station_level_pressure",
                 "sea_level_pressure", "wind_speed", "wind_direction"],
                full=full, plots=plots, diagnostics=diagnostics)

        hfr_vars_set = 0  # default so the output stage below works if the high-flag test is skipped
        if test in ["all", "high_flag"]:
            print("H", dt.datetime.now() - startT)
            hfr_vars_set = qc_tests.high_flag.hfr(
                target_station,
                ["temperature", "dew_point_temperature", "station_level_pressure",
                 "sea_level_pressure", "wind_speed", "wind_direction"],
                full=full, plots=plots, diagnostics=diagnostics)

        print(dt.datetime.now() - startT)

        # write in the flag information
        for var in setup.obs_var_list:
            obs_var = getattr(target_station, var)
            target_station_df["{}_QC_flag".format(var)] = obs_var.flags

        #*************************
        # Output of QFF
        # write out the dataframe to output format
        if hfr_vars_set > 1:
            # high flagging rates in more than one variable.  Withholding station completely
            print("{} withheld as too high flagging".format(target_station.id))
            io.write(os.path.join(setup.SUBDAILY_BAD_DIR, "{:11s}.qff".format(target_station_id)),
                     target_station_df,
                     formatters={"Latitude": "{:7.4f}", "Longitude": "{:7.4f}", "Month": "{:02d}",
                                 "Day": "{:02d}", "Hour": "{:02d}", "Minute": "{:02d}"})
        else:
            io.write(os.path.join(setup.SUBDAILY_OUT_DIR, "{:11s}.qff".format(target_station_id)),
                     target_station_df,
                     formatters={"Latitude": "{:7.4f}", "Longitude": "{:7.4f}", "Month": "{:02d}",
                                 "Day": "{:02d}", "Hour": "{:02d}", "Minute": "{:02d}"})

        #*************************
        # Output flagging summary file
        io.flag_write(os.path.join(setup.SUBDAILY_FLAG_DIR, "{:11s}.flg".format(target_station_id)),
                      target_station_df, diagnostics=diagnostics)

        print(dt.datetime.now() - startT)

        # input("stop")

    return # run_checks
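#*************************
# Illustrative sketch (hypothetical IDs - not part of the original suite):
# the neighbour lookup above assumes all_neighbours is a 3-D array of
# (ID, distance) string pairs, one row of buddies per target, with the
# target itself in slot 0 and "-" padding the tail of the list.
def _demo_neighbour_lookup():
    import numpy as np

    all_neighbours = np.array([[["AAA", "0"], ["BBB", "35"], ["-", "-"]],
                               [["BBB", "0"], ["AAA", "35"], ["-", "-"]]])
    nloc, = np.where(all_neighbours[:, 0, 0] == "BBB")
    initial_neighbours = all_neighbours[nloc].squeeze()
    return initial_neighbours  # [["BBB", "0"], ["AAA", "35"], ["-", "-"]]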
def internal_checks(restart_id="", end_id="", all_checks=True, duplicate=False, odd=False,
                    frequent=False, diurnal=False, gap=False, records=False, streaks=False,
                    climatological=False, spike=False, humidity=False, cloud=False,
                    variance=False, winds=False, pressure=False, precipitation=False,
                    diagnostics=False, plots=False, doMonth=False):
    '''
    Run through internal checks on list of stations passed

    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool all_checks: run all the checks
    :param bool duplicate/odd/frequent/diurnal/gap/records/streaks/climatological/spike/humidity/cloud/variance/winds/pressure/precipitation: run each test separately
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test [many files if all stations/all tests]
    :param bool doMonth: a monthly append process
    '''
    if all_checks:
        duplicate = True
        odd = True
        frequent = True
        diurnal = True
        gap = True
        records = True
        streaks = True
        climatological = True
        spike = True
        humidity = True
        cloud = True
        variance = True
        winds = True
        pressure = True
        precipitation = True
    else:
        print "single tests selected"

    # qc_code_version = subprocess.check_output(['svnversion']).strip()
    qc_code_version = subprocess.check_output(['svn', 'info', 'file:///home/h05/rdunn/svn/hadisd_py_qc/branches/monthly/'])
    for line in qc_code_version.split("\n"):
        if line.split(":")[0] == "Revision":
            qc_code_version = line.split(":")[1]
            break

    # get station information
    try:
        station_info = np.genfromtxt(os.path.join(INPUT_FILE_LOCS, STATION_LIST), dtype=(str))
    except IOError:
        print "station list not found"
        sys.exit()

    # sort truncated run
    startindex = [0]
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex[0]: endindex[0] + 1]
        else:
            station_info = station_info[startindex[0]:]
    else:
        station_info = station_info[startindex[0]:]

    for st, stat in enumerate(station_info):

        # if st%100 != 0: continue # do every nth station

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "{:35s} {:d}/{:d}".format("Station Number : ", st + 1, len(station_info))
        print "{:35s} {}".format("Station Identifier :", stat[0])
        if doMonth:
            print "Running with incomplete final year"

        # set up the log file
        logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'w')
        logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
        logfile.write("Internal Checks\n")
        logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # latitude and longitude check
        if np.abs(station.lat) > 90.:
            if plots or diagnostics:
                print "{} {} {} {} {} {}\n".format(
                    station.id, "Latitude Check", DATASTART.year, DATAEND.year, "All",
                    "Unphysical latitude {}".format(station.lat))
            else:
                logfile.write("{} {} {} {} {} {}\n".format(
                    station.id, "Latitude Check", DATASTART.year, DATAEND.year, "All",
                    "Unphysical latitude {}".format(station.lat)))
                logfile.close()
            continue

        # check if station longitude outside of bounds
        if np.abs(station.lon) > 180.:
            if plots or diagnostics:
                print "{} {} {} {} {} {}\n".format(
                    station.id, "Longitude Check", DATASTART.year, DATAEND.year, "All",
                    "Unphysical longitude {}".format(station.lon))
            else:
                logfile.write("{} {} {} {} {} {}\n".format(
                    station.id, "Longitude Check", DATASTART.year, DATAEND.year, "All",
                    "Unphysical longitude {}".format(station.lon)))
                logfile.close()
            continue

        # check if file is zipped
        if os.path.exists(os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc.gz".format(LONG_VERSION, END_TIME, station.id))):
            # if gzip file, unzip here
            subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc.gz".format(LONG_VERSION, END_TIME, station.id))])
            time.sleep(5)  # make sure it is unzipped before proceeding

        # read in the data
        ncdfp.read(os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc".format(LONG_VERSION, END_TIME, station.id)),
                   station, process_vars, opt_var_list=carry_thru_vars, diagnostics=diagnostics)

        if plots or diagnostics:
            print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
        else:
            logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

        match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        station.qc_flags = np.zeros([len(station.time.data), 71])  # changed to include updated wind tests, station level pressure & precipitation

        # get reporting accuracies and frequencies.
        for var in process_vars:
            st_var = getattr(station, var)
            st_var.reporting_stats = utils.monthly_reporting_statistics(st_var, DATASTART, DATAEND)

        # Add history text to netcdf file
        # Reporting Changes - TODO

        # Duplicate months - check on temperature ONLY
        if duplicate:
            # no change as result of incomplete year
            qc_tests.duplicate_months.dmc(station, ['temperatures'], process_vars, [0],
                                          DATASTART, DATAEND, logfile,
                                          diagnostics=diagnostics, plots=plots)

        # Odd Clusters
        if odd:
            # no change as result of incomplete year
            qc_tests.odd_cluster.occ(station, ['temperatures', 'dewpoints', 'windspeeds', 'slp'],
                                     [54, 55, 56, 57], DATASTART, logfile,
                                     diagnostics=diagnostics, plots=plots)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics=diagnostics)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics=diagnostics)

        # Frequent Values
        if frequent:
            qc_tests.frequent_values.fvc(station, ['temperatures', 'dewpoints', 'slp'], [1, 2, 3],
                                         DATASTART, DATAEND, logfile,
                                         diagnostics=diagnostics, plots=plots, doMonth=doMonth)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics=diagnostics)

        # Diurnal Cycle
        if diurnal:
            if np.abs(station.lat) <= 60.:
                qc_tests.diurnal_cycle.dcc(station, ['temperatures'], process_vars, [4],
                                           DATASTART, DATAEND, logfile,
                                           diagnostics=diagnostics, plots=plots, doMonth=doMonth)
            else:
                if plots or diagnostics:
                    print "Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(station.lat)
                else:
                    logfile.write("Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(station.lat))

        # Distributional Gap
        if gap:
            qc_tests.distributional_gap.dgc(station, ['temperatures', 'dewpoints', 'slp'], [5, 6, 7],
                                            DATASTART, DATAEND, logfile,
                                            diagnostics=diagnostics, plots=plots, GH=True, doMonth=doMonth)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics=diagnostics)

        # Records
        if records:
            qc_tests.records.krc(station, ['temperatures', 'dewpoints', 'windspeeds', 'slp'],
                                 [8, 9, 10, 11], logfile, diagnostics=diagnostics, plots=plots)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics=diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics=diagnostics)

        # Streaks and Repetitions
        if streaks:
            qc_tests.streaks.rsc(station, ['temperatures', 'dewpoints', 'windspeeds', 'slp', 'winddirs'],
                                 [[12, 16, 20], [13, 17, 21], [14, 18, 22], [15, 19, 23], [66, 67, 68]],
                                 DATASTART, DATAEND, logfile,
                                 diagnostics=diagnostics, plots=plots, doMonth=doMonth)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics=diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics=diagnostics)

        # Climatological Outlier
        if climatological:
            qc_tests.climatological.coc(station, ['temperatures', 'dewpoints'], [24, 25],
                                        DATASTART, DATAEND, logfile,
                                        diagnostics=diagnostics, plots=plots, doMonth=doMonth)
        # column 26 kept spare for slp

        # Spike
        if spike:
            qc_tests.spike.sc(station, ['temperatures', 'dewpoints', 'slp', 'windspeeds'],
                              [27, 28, 29, 65], DATASTART, DATAEND, logfile,
                              diagnostics=diagnostics, plots=plots, doMonth=doMonth)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics=diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics=diagnostics)

        # Humidity cross checks
        if humidity:
            qc_tests.humidity.hcc(station, [30, 31, 32], DATASTART, DATAEND, logfile,
                                  diagnostics=diagnostics, plots=plots)

        # Cloud cross check
        if cloud:
            qc_tests.clouds.ccc(station, [33, 34, 35, 36, 37, 38, 39, 40], logfile,
                                diagnostics=diagnostics, plots=plots)

        # Variance
        if variance:
            qc_tests.variance.evc(station, ['temperatures', 'dewpoints', 'slp', 'windspeeds'],
                                  [58, 59, 60, 61], DATASTART, DATAEND, logfile,
                                  diagnostics=diagnostics, plots=plots, doMonth=doMonth)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics=diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics=diagnostics)

        # Winds
        if winds:
            qc_tests.winds.wdc(station, [62, 63, 64], DATASTART, DATAEND, logfile,
                               diagnostics=diagnostics, plots=plots, doMonth=doMonth)

        # Pressure
        if pressure:
            qc_tests.pressure.spc(station, [69], DATASTART, DATAEND, logfile,
                                  diagnostics=diagnostics, plots=plots, doMonth=doMonth)

        # Precipitation
        if precipitation:
            qc_tests.precipitation.pcc(station, [70], DATASTART, DATAEND, logfile,
                                       diagnostics=diagnostics, plots=plots)

        # are flags actually applied?
        sys.stdout.flush()

        if diagnostics or plots:
            raw_input("stop")

        # write to file
        ncdfp.write(os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_internal.nc".format(LONG_VERSION, END_TIME, station.id)),
                    station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                    opt_var_list=carry_thru_vars, compressed=match_to_compress,
                    processing_date=dt.datetime.strftime(dt.datetime.now(), "%d-%b-%Y"),
                    qc_code_version=qc_code_version)

        # gzip the raw file
        subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc".format(LONG_VERSION, END_TIME, station.id))])

        logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
        logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
        logfile.close()

        # clean up
        gc.collect()

    print "Internal Checks completed\n"

    return # internal_checks
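#*************************
# Illustrative sketch (hypothetical, invented values - not part of the
# original suite): several tests above propagate flags between physically
# linked variables via utils.apply_flags_from_A_to_B (e.g. slp -> stnlp,
# windspeeds -> winddirs).  A minimal masked-array version of that idea:
def _demo_propagate_flags():
    import numpy as np

    windspeeds = np.ma.array([3.0, 99.9, 4.1], mask=[False, True, False])
    winddirs = np.ma.array([90., 180., 270.], mask=[False, False, False])

    # a direction is only as trustworthy as its speed: copy the mask across
    winddirs.mask = winddirs.mask | windspeeds.mask
    return winddirs  # the 180. entry is now masked as well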
def main():
    """
    Main plot function - no inputs.  Runs from settings in set_paths_and_vars.
    """
    qc_test_names = make_test_dictionary()

    try:
        station_info = np.genfromtxt(os.path.join(INPUT_FILE_LOCS, STATION_LIST), dtype=(str))
    except IOError:
        print "station list not found"
        sys.exit()

    all_flag_sums = np.zeros([len(station_info), len(qc_test)])
    all_flag_pct = np.zeros([len(station_info), len(qc_test)])

    Lons = []
    Lats = []

    uk_stns = []

    for st, stat in enumerate(station_info):

        # set up station
        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))
        # if station.id[:2] != "03":
        #     continue
        print st, station.id

        # read attributes and qc_flags
        try:
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS,
                                    "hadisd.{}_19310101-{}_{}.nc".format(LONG_VERSION, END_TIME, station.id)),
                       station, process_vars, diagnostics=diagnostics)

            # sum qc_flags:
            # remove multi-level flagging
            qc_flags = station.qc_flags[:]
            qc_flags[qc_flags[:] > 1] = 1

            # remove multi-level flagging - neighbour flags
            no_neighbours = qc_flags[qc_flags[:] == -1].size
            qc_flags[qc_flags[:] < 0] = 0

            total_flags = qc_flags[qc_flags[:] != 0].size

            sum_flags = np.sum(qc_flags[:], axis=0)  # 71 column array

            # to prevent double counting of flags on individual time stamps, re-sum
            for cols in [strT_QC, strD_QC, strWS_QC, strWD_QC, strS_QC,
                         T_QC, D_QC, S_QC, WS_QC, WD_QC, C_QC]:
                combined_flags = np.sum(np.max(qc_flags[:, cols], axis=1))
                sum_flags = np.append(sum_flags, combined_flags)

            all_flag_sums[st] = sum_flags

            # now do percentage flagged of total obs
            pct_flag = np.zeros(len(qc_test), dtype=float)
            for t, test in enumerate(qc_test):
                if t in T_QC:
                    if station.temperatures.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[t] / station.temperatures.data.compressed().size
                elif t in D_QC:
                    if station.dewpoints.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[t] / station.dewpoints.data.compressed().size
                elif t in S_QC:
                    if station.slp.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[t] / station.slp.data.compressed().size
                elif t in WS_QC:
                    if station.windspeeds.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[t] / station.windspeeds.data.compressed().size
                elif t in WD_QC:
                    if station.winddirs.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[t] / station.winddirs.data.compressed().size
                elif t in C_QC:
                    if station.total_cloud_cover.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[t] / station.total_cloud_cover.data.size
                else:
                    if station.temperatures.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[t] / station.temperatures.data.size

            all_flag_pct[st] = 100. * pct_flag

            # get occasions when more locations are flagged than have data
            over_100, = np.where(all_flag_pct[st] > 100.)
            all_flag_pct[st][over_100] = 100.

            Lons += [station.lon]
            Lats += [station.lat]

            uk_stns += [st]

        except RuntimeError:
            # file doesn't exist
            pass

    Lats = np.array(Lats)
    Lons = np.array(Lons)

    outfile = file(INPUT_FILE_LOCS + "all_fails_summary_{}.dat".format(start_time_string), 'w')

    for t, test in enumerate(qc_test):

        plt.figure(figsize=(8, 6))
        plt.clf()
        ax = plt.axes([0, 0, 1, 1], projection=ccrs.Robinson())
        ax.set_global()
        ax.coastlines('50m')
        try:
            ax.gridlines(draw_labels=True)
        except TypeError:
            ax.gridlines()

        # colors are the exact same RGB codes as in IDL
        colors = [(150, 150, 150), (41, 10, 216), (63, 160, 255), (170, 247, 255),
                  (255, 224, 153), (247, 109, 94), (165, 0, 33), (0, 0, 0)]
        limits = [0.0, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 100.]

        all_locs = []
        for u, upper in enumerate(limits):
            if u == 0:
                locs, = np.where(all_flag_pct[uk_stns, t] == 0)
                label = "{}%: {}".format(upper, len(locs))
            else:
                locs, = np.where(np.logical_and(all_flag_pct[uk_stns, t] <= upper,
                                                all_flag_pct[uk_stns, t] > limits[u - 1]))
                label = ">{} to {}%: {}".format(limits[u - 1], upper, len(locs))
                if upper == limits[-1]:
                    label = ">{}%: {}".format(limits[u - 1], len(locs))

            if len(locs) > 0:
                ax.scatter(Lons[locs], Lats[locs], transform=ccrs.Geodetic(), s=15,
                           c=tuple([float(c) / 255 for c in colors[u]]),
                           edgecolors="none", label=label)
            else:
                ax.scatter([0], [-90], transform=ccrs.Geodetic(), s=15,
                           c=tuple([float(c) / 255 for c in colors[u]]),
                           edgecolors="none", label=label)

            all_locs += [len(locs)]

        plt.title(qc_test_names[test])
        watermarkstring = "/".join(os.getcwd().split('/')[4:]) + '/' + os.path.basename(__file__) + \
            " " + dt.datetime.strftime(dt.datetime.now(), "%d-%b-%Y %H:%M")
        plt.figtext(0.01, 0.01, watermarkstring, size=5)
        leg = plt.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.2), frameon=False,
                         title='', prop={'size': 11}, labelspacing=0.15, columnspacing=0.5, numpoints=1)

        plt.savefig(IMAGE_LOCS + "All_fails_{}_{}.png".format(test, start_time_string))
        plt.close()

        outfile.write("{:10s}".format(test) +
                      ''.join(['%7i' % n for n in all_locs]) +
                      ''.join(["%7.1f" % n for n in [100. * n / len(Lats) for n in all_locs]]) +
                      "\n")

    outfile.close()

    return # main
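#*************************
# Illustrative sketch (hypothetical, invented flags - not part of the
# original suite): the re-summing step above, np.sum(np.max(qc_flags[:, cols],
# axis=1)), counts a timestamp once even if several tests in the same group
# flagged it.  A minimal version showing the difference:
def _demo_combined_flag_count():
    import numpy as np

    # 4 timestamps x 3 tests in one group; 1 = flagged
    qc_flags = np.array([[1, 1, 0],
                         [0, 0, 0],
                         [1, 0, 1],
                         [0, 1, 0]])
    naive = np.sum(qc_flags)                     # 5: double-counts rows 0 and 2
    combined = np.sum(np.max(qc_flags, axis=1))  # 3: one count per timestamp
    return naive, combined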
def get_summary(stage="N", restart_id="", end_id="", diagnostics=False):
    """
    Main script.  Reads in station data, populates internal objects,
    extracts counts per year.

    :param str stage: which stage's output to summarise - Internal ("I") or Neighbour ("N")
    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool diagnostics: print extra material to screen
    """
    # process the station list
    station_list = utils.get_station_list(restart_id=restart_id, end_id=end_id)

    station_IDs = station_list.id

    yearly_counts = {}

    # now spin through each ID in the curtailed list
    for st, station_id in enumerate(station_IDs):
        print("{} {:11s} ({}/{})".format(dt.datetime.now(), station_id, st + 1, station_IDs.shape[0]))

        #*************************
        # set up the stations
        station = utils.Station(station_id, station_list.latitude[st],
                                station_list.longitude[st], station_list.elevation[st])
        if diagnostics:
            print(station)

        try:
            if stage == "I":
                station, station_df = io.read_station(
                    os.path.join(setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(station_id)), station)
            elif stage == "N":
                station, station_df = io.read_station(
                    os.path.join(setup.SUBDAILY_OUT_DIR, "{:11s}.qff".format(station_id)), station)
        except OSError as e:
            # file missing, move on to next in sequence
            # io.write_error(station, "File Missing")
            continue
        except ValueError as e:
            # some issue in the raw file
            # io.write_error(station, "Error in input file", error=str(e))
            continue

        # some may have no data (for whatever reason)
        if station.times.shape[0] == 0:
            if diagnostics:
                print("No data in station {}".format(station.id))
            # scoot onto next station
            # io.write_error(station, "No data in input file")
            continue

        unique_years = np.unique(station.years)
        year_counts = np.zeros(unique_years.shape).astype(int)

        # spin through each variable (might be heaviest lift)
        for var in setup.obs_var_list:
            obs_var = getattr(station, var)

            # spin through each year
            for y, year in enumerate(unique_years):
                locs, = np.where(station.years == year)

                # where obs and years intersect
                year_obs = obs_var.data[locs]

                # and just keep unflagged set
                year_counts[y] += len(year_obs.compressed())

        # store in dictionary
        for year, count in zip(unique_years, year_counts):
            yearly_counts[year] = yearly_counts.get(year, 0) + count

    # now print
    with open("summary_counts.txt", "w") as outfile:
        for key, value in sorted(yearly_counts.items(), key=lambda x: x[0]):
            outfile.write("{} : {}\n".format(key, value))

    return # get_summary
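#*************************
# Illustrative sketch (hypothetical, invented values - not part of the
# original suite): the per-year counting above keeps only unmasked
# (i.e. unflagged) observations via .compressed().  A minimal version:
def _demo_yearly_counts():
    import numpy as np

    years = np.array([2000, 2000, 2001, 2001, 2001])
    data = np.ma.array([5.0, 6.0, 7.0, 8.0, 9.0],
                       mask=[False, True, False, False, True])

    counts = {}
    for year in np.unique(years):
        locs, = np.where(years == year)
        counts[year] = counts.get(year, 0) + len(data[locs].compressed())
    return counts  # {2000: 1, 2001: 2}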
def make_hum_heat_vars(station_info, restart_id="", end_id="", diagnostics=False, plots=False):
    """
    Make the humidity and heat-stress variable netCDF files

    Make two sets of output files containing the humidity and heat-stress
    parameters calculated on an hourly basis from the QC'd HadISD data

    :param list station_info: station information list
    :param str restart_id: first station to process
    :param str end_id: last station to process
    :param bool diagnostics: verbose output to screen
    :param bool plots: make plots (placeholder)
    """

    # sort a truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex:endindex + 1]
        else:
            station_info = station_info[startindex:]
    else:
        station_info = station_info[startindex:]

    for st, stat in enumerate(station_info):

        print(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S"))
        print("{:35s} {:d}/{:d}".format("Station Number : ", st + 1, len(station_info)))
        print("{:35s} {}".format("Station Identifier :", stat[0]))

        if plots or diagnostics:
            logfile = ""
        else:
            logfile = open(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a')
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Calculating Humidity and Heat Stress variables\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc.gz")):
            # if gzip file, unzip here
            subprocess.call(["gunzip",
                             os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc.gz")])
            time.sleep(5)  # make sure it is unzipped before proceeding

        # read in the data
        ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station,
                   process_vars, diagnostics=diagnostics,
                   read_qc_flags=False, read_flagged_obs=False)

        match_to_compress = utils.create_fulltimes(station, process_vars,
                                                   DATASTART, DATAEND,
                                                   do_qc_flags=False, do_flagged_obs=False)

        # run through the calculations; each one should add a new variable to the object:
        #   1) Use T and P to get e [to get es, use Td]
        #   2) Use e, P, Td and T to get Tw
        #   3) If Tw < 0C, recalculate e w.r.t. ice, and re-obtain Tw - keep both!
        #   4) Use e and P to calculate q
        #   5) Use e and es to get rh (use the appropriate es too) - or q and qs
        # Which P to use if there is no measurement?  Using a monthly mean
        # probably isn't appropriate in this instance.
        station = humidity.run_calcs(station, logfile)

        # run through the heat stress calculations
        station = heat_stress.run_calcs(station, logfile)

        if diagnostics or plots:
            input("stop")

        # adjust this to work with the desired output file - will need a separate
        # write function - output humidity in one set, heat indices in another?
        humidity_vars = ["temperatures", "dewpoints", "slp", "vapour_pressure",
                         "saturation_vapour_pressure", "wetbulb_temperature",
                         "specific_humidity", "relative_humidity"]

        ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_humidity.nc"),
                    station, humidity_vars,
                    os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                    compressed=match_to_compress, processing_date='',
                    qc_code_version='', write_QC_flags=False,
                    write_flagged_obs=False, least_significant_digit=5)

        heat_stress_vars = ["temperatures", "dewpoints", "windspeeds",
                            "THI", "WBGT", "humidex", "apparent_t", "heat_index"]

        ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_heat_stress.nc"),
                    station, heat_stress_vars,
                    os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                    compressed=match_to_compress, processing_date='',
                    qc_code_version='', write_QC_flags=False,
                    write_flagged_obs=False, least_significant_digit=5)

        # gzip the raw file
        # subprocess.call(["gzip", "-f", os.path.join(NETCDF_DATA_LOCS, station.id + "_humidity.nc")])
        # subprocess.call(["gzip", "-f", os.path.join(NETCDF_DATA_LOCS, station.id + "_heat_stress.nc")])
        # subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc")])

        # only write to and close the log when one was opened
        # (plots/diagnostics runs set logfile to "")
        if logfile != "":
            logfile.write(dt.datetime.strftime(dt.datetime.now(),
                                               "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(
                time.time() - process_start_time))
            logfile.close()

    print("Humidity and Heat Stress Indices calculated")

    return  # make_hum_heat_vars
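
# The calculation chain sketched in the numbered comments above can be
# illustrated with one standard formulation (a Magnus-type fit, after
# Bolton 1980).  This sketch covers steps 1, 4 and 5 only; the actual
# humidity.run_calcs() routine may differ in detail (e.g. the wet-bulb and
# ice-bulb handling in steps 2 and 3):
def _example_humidity_chain(t, td, p):
    """t, td in degC; p in hPa.  Returns (e, q, rh)."""
    import numpy as np

    def sat_vap_pressure(temp):
        # saturation vapour pressure w.r.t. water, in hPa (Magnus/Bolton form)
        return 6.112 * np.exp(17.67 * temp / (temp + 243.5))

    e = sat_vap_pressure(td)                  # 1) vapour pressure from the dewpoint
    es = sat_vap_pressure(t)                  #    saturation value from the dry bulb
    q = 1000. * 0.622 * e / (p - 0.378 * e)   # 4) specific humidity, g/kg
    rh = 100. * e / es                        # 5) relative humidity, %
    return e, q, rh

# e.g. _example_humidity_chain(20., 15., 1013.) -> (~17.0 hPa, ~10.5 g/kg, ~72.9 %)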
def process_canadian_stations(all_stations, DATA_START):
    '''
    if in Canada_single         - single occurrence                          - USE
    if in Canada_onoff          - single occurrence with time limits         - USE
    if in Canada_homogenisation - multiple occurrences in a single location  - USE
    if in Canada_goodmove       - well documented station move               - USE,
                                  but only the section which matches the dates
    else                        - DON'T USE (_overlap, _questionablemove, _rem, _dates)
    '''

    station_ids = np.array([stn.id for stn in all_stations])

    # can only work on 71???0-99999 station numbers
    canadian_ids = np.array([s for s, stn in enumerate(station_ids)
                             if stn[:2] == "71" and stn[-7:] == "0-99999"])

    use = np.array([0 for i in range(len(canadian_ids))])  # 1 - use, 0 - not tested, -1 - don't use

    # only reject those we are sure about, and keep those that we can't test as we don't know

    # single, onoff & homogenisation
    for category_files in ["Canada_single.dat", "Canada_onoff.dat",
                           "Canada_homogenisation.dat"]:

        test_ids, test_names, test_lats, test_lons, test_active, test_date = \
            read_canada_info(category_files, DATA_START)

        for c, cid in enumerate(canadian_ids):
            id_to_compare = int(station_ids[cid][:5])

            if id_to_compare in test_ids:
                # ID present
                loc = np.where(test_ids == id_to_compare)[0][0]

                test_station = utils.Station(test_ids[loc], test_lats[loc],
                                             test_lons[loc],
                                             all_stations[cid].elev)  # fake the elev
                test_station.name = test_names[loc].strip()
                test_station.call = ""

                probs = sel_utils.do_match(test_station, all_stations[cid],
                                           sel_utils.LATITUDE_THRESHOLD,
                                           sel_utils.ELEVATION_THRESHOLD,
                                           sel_utils.DISTANCE_THRESHOLD)

                if np.prod(probs) > sel_utils.PROB_THRESHOLD:
                    use[c] = 1
                else:
                    print(all_stations[cid].name, all_stations[cid])
                    print(test_station.name.strip(), test_station)
                    print(probs, "\n")

    # overlap, questionablemove, rem, dates
    for category_files in ["Canada_rem.dat", "Canada_dates.dat",
                           "Canada_questionablemove.dat", "Canada_overlap.dat"]:

        test_ids, test_names, test_lats, test_lons, test_active, test_date = \
            read_canada_info(category_files, DATA_START)

        for c, cid in enumerate(canadian_ids):
            id_to_compare = int(station_ids[cid][:5])

            if id_to_compare in test_ids:
                # ID present - don't use this station
                use[c] = -1
                # restrict start and end times so it won't be selected
                all_stations[cid].start = dt.datetime.today()
                all_stations[cid].end = dt.datetime.today()

    # goodmove
    for category_files in ["Canada_goodmove.dat"]:

        outfilename = os.path.join(INPUT_FILE_LOCS, "Canada_time_ranges.dat")
        try:
            os.remove(outfilename)
        except OSError:
            print("file does not exist ", outfilename)
        outfile = open(outfilename, "w")

        test_ids, test_names, test_lats, test_lons, test_active, test_date = \
            read_canada_info(category_files, DATA_START)

        for c, cid in enumerate(canadian_ids):
            id_to_compare = int(station_ids[cid][:5])

            if id_to_compare in test_ids:
                # ID present
                locs, = np.where(test_ids == id_to_compare)

                # test which locations match
                all_probs = []
                for loc in locs:
                    test_station = utils.Station(test_ids[loc], test_lats[loc],
                                                 test_lons[loc],
                                                 all_stations[cid].elev)  # fake the elev
                    test_station.name = test_names[loc]
                    test_station.call = ""

                    probs = sel_utils.do_match(test_station, all_stations[cid],
                                               sel_utils.LATITUDE_THRESHOLD,
                                               sel_utils.ELEVATION_THRESHOLD,
                                               sel_utils.DISTANCE_THRESHOLD)

                    all_probs += [np.prod(probs)]

                good_locs, = np.where(np.array(all_probs) > sel_utils.PROB_THRESHOLD)

                # need to test which range of dates can be taken
                start = dt.datetime(DATA_START, 1, 1, 0, 0)
                end = dt.datetime(dt.datetime.now().year + 1, 1, 1, 0, 0)

                # single "Active" and last entry - then use as start date
                if (len(good_locs) == 1) and (good_locs[0] + 1 == len(all_probs)) and \
                        (test_active[locs][good_locs[0]].strip() == "Active"):
                    use[c] = 1
                    start = test_date[locs][good_locs[0]]

                    # adjust start date and write out the useful range
                    all_stations[cid].start = start
                    outfile.write("{} {} {}\n".format(
                        station_ids[cid],
                        dt.datetime.strftime(start, "%Y-%m-%d %H:%M:%S"),
                        dt.datetime.strftime(end, "%Y-%m-%d %H:%M:%S")))

                elif len(good_locs) == 2:
                    # "Active" followed by "Inactive" - use as a range, e.g. 71100
                    if (test_active[locs][good_locs[0]].strip() == "Active") and \
                            (test_active[locs][good_locs[1]].strip() == "Inactive"):
                        use[c] = 1
                        start = test_date[locs][good_locs[0]]
                        end = test_date[locs][good_locs[1]]

                        # adjust start and end dates and write out the useful range
                        all_stations[cid].start = start
                        all_stations[cid].end = end
                        outfile.write("{} {} {}\n".format(
                            station_ids[cid],
                            dt.datetime.strftime(start, "%Y-%m-%d %H:%M:%S"),
                            dt.datetime.strftime(end, "%Y-%m-%d %H:%M:%S")))

                    # two "Active"s - if in the final two places, use the first
                    # as the start date, e.g. 71038
                    elif (test_active[locs][good_locs[0]].strip() == "Active") and \
                            (test_active[locs][good_locs[1]].strip() == "Active") and \
                            (good_locs[0] + 1 == len(all_probs) - 1) and \
                            (good_locs[1] + 1 == len(all_probs)):
                        use[c] = 1
                        start = test_date[locs][good_locs[0]]

                        # adjust start date and write out the useful range
                        all_stations[cid].start = start
                        outfile.write("{} {} {}\n".format(
                            station_ids[cid],
                            dt.datetime.strftime(start, "%Y-%m-%d %H:%M:%S"),
                            dt.datetime.strftime(end, "%Y-%m-%d %H:%M:%S")))

                # three "Active"s or a combination of "Active" and "Inactive" -
                # if no other IDs are present, then use all
                elif len(locs) == len(good_locs):
                    use[c] = 1
                    # no change to start/end times

        outfile.close()

    print("{} Canadian stations processed - {} kept, {} not tested, {} rejected".format(
        len(use), len(use[use == 1]), len(use[use == 0]), len(use[use == -1])))

    return all_stations  # process_canadian_stations
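
# All of the category tests above reduce to the same decision rule: multiply
# the per-criterion match probabilities (latitude, elevation, distance) and
# compare the product against a single threshold.  A minimal sketch of that
# rule with made-up probabilities and a hypothetical threshold (the real
# values come from sel_utils):
def _example_match_decision():
    import numpy as np
    probs = np.array([0.9, 0.8, 0.95])   # hypothetical per-criterion probabilities
    PROB_THRESHOLD = 0.5                 # hypothetical; see sel_utils.PROB_THRESHOLD
    return np.prod(probs) > PROB_THRESHOLD  # True -> treat as the same station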
def main(restart_id="", end_id="", diagnostics=False):
    """
    Main plot function.

    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool diagnostics: print extra material to screen
    """
    obs_var_list = setup.obs_var_list

    # process the station list
    station_list = utils.get_station_list(restart_id=restart_id, end_id=end_id)
    station_IDs = station_list.id

    all_stations = {}

    # now spin through each ID in the curtailed list
    for st, station_id in enumerate(station_IDs):
        # if st > 10:
        #     break
        print("{} {}".format(dt.datetime.now(), station_id))

        station = utils.Station(station_id, station_list.iloc[st].latitude,
                                station_list.iloc[st].longitude,
                                station_list.iloc[st].elevation)
        if diagnostics:
            print(station)

        try:
            flag_summary = flag_read(os.path.join(setup.SUBDAILY_FLAG_DIR,
                                                  "{}.flg".format(station_id)))
        except IOError:
            print("flag file missing for {}".format(station_id))
            # no flag summary to plot, so skip to the next station
            continue

        #*************************
        # read QFF
        # try:
        #     station_df = io.read(os.path.join(setup.SUBDAILY_OUT_DIR, "{}.qff".format(station_id)))
        # except IOError:
        #     print("Missing station {}".format(station_id))
        #     continue

        for var in obs_var_list:
            setattr(station, var,
                    utils.Meteorological_Variable("{}".format(var), utils.MDI, "", ""))
            obs_var = getattr(station, var)

            # flags = station_df["{}_QC_flag".format(var)].fillna("")
            for test in utils.QC_TESTS.keys():
                # locs = flags[flags.str.contains(test)]
                # setattr(obs_var, test, locs.shape[0]/flags.shape[0])
                # setattr(obs_var, "{}_counts".format(test), locs.shape[0])
                try:
                    setattr(obs_var, test, flag_summary[var][test])
                    setattr(obs_var, "{}_counts".format(test),
                            flag_summary[var]["{}_counts".format(test)])
                except KeyError:
                    setattr(obs_var, test, 0)
                    setattr(obs_var, "{}_counts".format(test), 0)

            # # for the total, get the number of clean obs and subtract
            # flagged, = np.where(flags != "")
            # setattr(obs_var, "All", flagged.shape[0]/flags.shape[0])
            # setattr(obs_var, "All_counts", flagged.shape[0])
            try:
                setattr(obs_var, "All", flag_summary[var]["All"])
                setattr(obs_var, "All_counts", flag_summary[var]["All_counts"])
            except KeyError:
                setattr(obs_var, "All", 0)
                setattr(obs_var, "All_counts", 0)

            if diagnostics:
                print("{} - {}".format(var, getattr(obs_var, "All_counts")))

        all_stations[station_id] = station

    # now spin through each var/test combination and make a plot
    for var in obs_var_list:
        for test in TESTS_FOR_VARS[var]:
            for suffix in ["", "_counts"]:

                lats = np.zeros(station_IDs.shape[0])
                lons = np.zeros(station_IDs.shape[0])
                flag_fraction = np.zeros(station_IDs.shape[0])

                for st, (ID, station) in enumerate(all_stations.items()):
                    lats[st] = station.lat
                    lons[st] = station.lon

                    obs_var = getattr(station, var)
                    flag_fraction[st] = getattr(obs_var, "{}{}".format(test, suffix))

                if suffix == "":
                    flag_fraction *= 100.  # convert to percent

                # do the plot
                plt.figure(figsize=(8, 5))
                plt.clf()
                ax = plt.axes([0.02, 0.02, 0.96, 0.96], projection=ccrs.Robinson())
                ax.set_global()
                ax.coastlines('50m')
                try:
                    ax.gridlines(draw_labels=True)
                except TypeError:
                    # older cartopy versions do not accept draw_labels
                    ax.gridlines()

                # colours are the exact same RGB codes as in IDL
                colors = [(150, 150, 150), (41, 10, 216), (63, 160, 255), (170, 247, 255),
                          (255, 224, 153), (247, 109, 94), (165, 0, 33), (0, 0, 0)]

                if suffix == "":
                    limits = [0.0, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 100.]
                elif suffix == "_counts":
                    limits = [0.0, 5., 10., 50., 100., 500., 1000., 5000.]
                for u, upper in enumerate(limits):
                    # sort the labels
                    if u == 0:
                        locs, = np.where(flag_fraction == 0)
                        label = "{}{}: {}".format(upper, UNITS[suffix], len(locs))
                    else:
                        locs, = np.where(np.logical_and(flag_fraction <= upper,
                                                        flag_fraction > limits[u - 1]))
                        label = ">{} to {}{}: {}".format(limits[u - 1], upper,
                                                         UNITS[suffix], len(locs))
                        if upper == limits[-1]:
                            label = ">{}{}: {}".format(limits[u - 1],
                                                       UNITS[suffix], len(locs))

                    # and plot
                    if len(locs) > 0:
                        ax.scatter(lons[locs], lats[locs],
                                   transform=ccrs.PlateCarree(), s=15,
                                   color=tuple([float(c) / 255 for c in colors[u]]),
                                   edgecolors="none", label=label)
                    else:
                        # plot a dummy point off the map so the bin still appears in the legend
                        ax.scatter([0], [-90], transform=ccrs.PlateCarree(), s=15,
                                   color=tuple([float(c) / 255 for c in colors[u]]),
                                   edgecolors="none", label=label)

                if test == "All":
                    plt.title("{} - {}".format(
                        " ".join([v.capitalize() for v in var.split("_")]), "All"))
                else:
                    plt.title("{} - {}".format(
                        " ".join([v.capitalize() for v in var.split("_")]),
                        utils.QC_TESTS[test]))

                watermarkstring = "/".join(os.getcwd().split('/')[4:]) + '/' + \
                    os.path.basename(__file__) + " " + \
                    dt.datetime.strftime(dt.datetime.now(), "%d-%b-%Y %H:%M")
                plt.figtext(0.01, 0.01, watermarkstring, size=5)

                leg = plt.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.12),
                                 frameon=False, title='', prop={'size': 9},
                                 labelspacing=0.15, columnspacing=0.5, numpoints=1)

                plt.savefig(os.path.join(IMAGE_LOCS, "All_fails_{}-{}{}_{}.png".format(
                    var, test, suffix, start_time_string)))
                plt.close()

    return  # main
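
# The map plots above follow the standard cartopy pattern: the axes carry
# the display projection (Robinson) while the data are supplied in lon/lat
# via transform=ccrs.PlateCarree().  A minimal standalone sketch of that
# pattern with synthetic points (requires cartopy and matplotlib; not part
# of the processing chain):
def _example_map_scatter():
    import matplotlib.pyplot as plt
    import cartopy.crs as ccrs

    plt.figure(figsize=(8, 5))
    ax = plt.axes(projection=ccrs.Robinson())
    ax.set_global()
    ax.coastlines('50m')
    # lon/lat points, reprojected onto the Robinson axes by the transform
    ax.scatter([0., 120., -60.], [51., -30., 10.],
               transform=ccrs.PlateCarree(), s=15, label="stations")
    ax.legend(loc='lower center')
    plt.savefig("example_map.png")
    plt.close()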
def select_neighbours(station, variable, neighbour_info, neighbours,
                      neighbour_distances, neighbour_quadrants, data_locs,
                      datastart, dataend, logfile, diagnostics=False, plots=False):
    '''
    From the list of nearby stations, select the ones which will be good
    neighbours for the test.  Select on the basis of correlation, overlap
    of data points and bearing (quadrants).

    :param object station: station object
    :param str variable: which variable to process
    :param array neighbour_info: array of ID, lat, lon and elev
    :param array neighbours: which station sequence numbers are the nearby stations
    :param array neighbour_distances: distances to nearby stations
    :param array neighbour_quadrants: bearings to nearby stations (in 90deg bins)
    :param array data_locs: path to data files
    :param datetime datastart: start of data set
    :param datetime dataend: end of data set
    :param file logfile: logfile to store outputs
    :param boolean diagnostics: output diagnostic information
    :param boolean plots: make a plot

    :returns: final_locs - array of station sequence numbers to use
    '''
    # set up storage arrays
    n_correlations = np.zeros(len(neighbours))
    n_distances = np.zeros(len(neighbours))
    n_quadrants = np.zeros(len(neighbours))
    n_overlaps = np.zeros(len(neighbours))
    combined_score = np.zeros(len(neighbours))

    # get station data
    st_var = getattr(station, variable)
    st_anomalies = hourly_daily_anomalies(st_var.data[:])

    # go through the initial list and extract correlations and overlaps
    for nn, nn_loc in enumerate(neighbours):

        n_details = neighbour_info[nn]
        neigh = utils.Station(n_details[0], float(n_details[1]),
                              float(n_details[2]), float(n_details[3]))

        # read in the neighbour's data (file named for the neighbour, not the target)
        ncdfp.read(os.path.join(NETCDF_DATA_LOCS,
                                "hadisd.{}_19310101-{}_{}_internal.nc".format(
                                    LONG_VERSION, END_TIME, neigh.id)),
                   neigh, [variable], diagnostics=diagnostics,
                   read_input_station_id=False)

        dummy = utils.create_fulltimes(neigh, [variable], datastart, dataend, [],
                                       do_input_station_id=False)

        # get the correlation of the data to this neighbour
        neigh_var = getattr(neigh, variable)
        neigh_anomalies = hourly_daily_anomalies(neigh_var.data[:])

        # correlation = np.ma.corrcoef(neigh_var.data, st_var.data)[1, 0]
        correlation = np.ma.corrcoef(neigh_anomalies, st_anomalies)[1, 0]
        overlap = len(np.where(np.logical_or(neigh_var.data.mask,
                                             st_var.data.mask) == False)[0]) / \
            float(len(st_var.data.compressed()))

        if not math.isnan(correlation):
            n_correlations[nn] = correlation
            n_overlaps[nn] = overlap
            combined_score[nn] = correlation + overlap
            n_distances[nn] = neighbour_distances[nn]
            n_quadrants[nn] = neighbour_quadrants[nn]

        # clear up to save memory
        del dummy
        del neigh_var
        del neigh_anomalies
        gc.collect()

    # sort in order of the combination of correlation and overlap
    sort_order = np.argsort(combined_score)[::-1]

    # and select the best 10
    # final_selection = neighbours[sort_order][:10]

    # sort out the quadrants - take the best two in each first
    locs1 = neighbours[sort_order][n_quadrants[sort_order] == 1]
    locs2 = neighbours[sort_order][n_quadrants[sort_order] == 2]
    locs3 = neighbours[sort_order][n_quadrants[sort_order] == 3]
    locs4 = neighbours[sort_order][n_quadrants[sort_order] == 4]

    final_locs = np.concatenate((locs1[:2], locs2[:2], locs3[:2], locs4[:2]),
                                axis=0).reshape(-1)

    # and add the rest in order of combined score
    for index in neighbours[sort_order]:
        if index not in final_locs:
            final_locs = np.append(final_locs, index)
        if len(final_locs) == N_NEIGHBOURS:
            break

    # output a table showing distances, correlations, overlaps, the combined
    # score and which stations were selected
    if plots or diagnostics:
        print("{:14s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s}".format(
            "Neighbour", "Distance", "Elevation", "Correl'n", "Overlap",
            "Combined", "Quadrant", "Selected"))
    else:
        logfile.write("{:14s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s}\n".format(
            "Neighbour", "Distance", "Elevation", "Correl'n", "Overlap",
            "Combined", "Quadrant", "Selected"))

    selected_correlations = []
    selected_overlaps = []
    for nn, nn_loc in enumerate(neighbours[sort_order]):

        selected = ""
        if nn_loc in final_locs:
            selected = "Y"
            if plots:
                selected_correlations += [n_correlations[sort_order[nn]]]
                selected_overlaps += [n_overlaps[sort_order[nn]]]

        neigh_details = neighbour_info[sort_order][nn]

        if plots or diagnostics:
            print("{:14s} {:10.1f} {:10.1f} {:10.5f} {:10.3f} {:10.3f} {:10.0f} {:10s}".format(
                neigh_details[0], n_distances[sort_order][nn], float(neigh_details[3]),
                n_correlations[sort_order][nn], n_overlaps[sort_order][nn],
                combined_score[sort_order][nn], n_quadrants[sort_order][nn], selected))
        else:
            logfile.write("{:14s} {:10.1f} {:10.1f} {:10.5f} {:10.3f} {:10.3f} {:10.0f} {:10s}\n".format(
                neigh_details[0], n_distances[sort_order][nn], float(neigh_details[3]),
                n_correlations[sort_order][nn], n_overlaps[sort_order][nn],
                combined_score[sort_order][nn], n_quadrants[sort_order][nn], selected))

    # plot of correlations and overlaps, with the selected stations highlighted
    if plots:
        import matplotlib.pyplot as plt
        plt.clf()
        plt.plot(n_correlations, n_overlaps, 'bo')
        plt.plot(selected_correlations, selected_overlaps, 'ro')
        plt.xlabel("correlations")
        plt.ylabel("data overlap")
        plt.title("{} - {}".format(station.id, variable))
        plt.show()

    return final_locs  # select_neighbours
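
# The selection logic above first guarantees geographic spread (the best two
# candidates in each 90-degree bearing quadrant) and then tops the list up in
# order of combined score.  A standalone sketch of that two-stage selection
# with synthetic scores and quadrants (numpy only; N_NEIGHBOURS is set
# hypothetically here):
def _example_quadrant_selection():
    import numpy as np
    N_NEIGHBOURS = 8                              # hypothetical cap on list length
    neighbours = np.arange(8)                     # station sequence numbers
    score = np.array([0.9, 0.2, 0.8, 0.7, 0.6, 0.95, 0.3, 0.5])
    quadrant = np.array([1, 1, 1, 2, 2, 3, 3, 4])

    order = np.argsort(score)[::-1]               # best combined score first
    # stage 1: up to two best-scoring candidates per quadrant
    final = np.concatenate([neighbours[order][quadrant[order] == q][:2]
                            for q in (1, 2, 3, 4)])
    # stage 2: top up with the remaining candidates in score order
    for idx in neighbours[order]:
        if idx not in final:
            final = np.append(final, idx)
        if len(final) == N_NEIGHBOURS:
            break
    return final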