Example #1
def neighbour_outlier(target_station,
                      initial_neighbours,
                      variable,
                      diagnostics=False,
                      plots=False,
                      full=False):
    """
    Works on a single station and variable.  Reads in the neighbours' data and finds locations where a sufficient number of neighbours are sufficiently different.

    :param Station target_station: station to run on 
    :param array initial_neighbours: input neighbours (ID, distance) pairs
    :param str variable: obs variable being run on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values.
    """
    station_list = utils.get_station_list()

    # check there are sufficient neighbours (first entry is the target station itself)
    n_neighbours = len(np.where(initial_neighbours[:, 0] != "-")[0]) - 1
    if n_neighbours < utils.MIN_NEIGHBOURS:
        print("{} has insufficient neighbours ({}<{})".format(
            target_station.id, n_neighbours, utils.MIN_NEIGHBOURS))

    else:
        #*************************
        # extract target observations
        obs_var = getattr(target_station, variable)
        flags = np.array(["" for i in range(obs_var.data.shape[0])
                          ]).astype("<U10")

        #*************************
        # read in the neighbour (buddy) data
        all_buddy_data = np.ma.zeros(
            [len(initial_neighbours[:, 0]),
             len(target_station.times)])
        all_buddy_data.mask = np.ones(all_buddy_data.shape)

        for bid, buddy_id in enumerate(initial_neighbours[:, 0]):
            if buddy_id == target_station.id:
                # first entry is self
                continue
            if buddy_id == "-":
                # end of the list of buddies
                break

            if diagnostics:
                print("{}/{} {}".format(bid, len(initial_neighbours[:, 0]),
                                        buddy_id))

            # set up station object to hold information
            buddy_idx, = np.where(station_list.id == buddy_id)
            buddy = utils.Station(buddy_id, station_list.iloc[buddy_idx].latitude.values[0], \
                                      station_list.iloc[buddy_idx].longitude.values[0], station_list.iloc[buddy_idx].elevation.values[0])

            try:
                buddy, buddy_df = io.read_station(os.path.join(
                    setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(buddy_id)),
                                                  buddy,
                                                  read_flags=True)

                buddy_var = getattr(buddy, variable)

                # apply flags
                flag_locs, = np.where(buddy_var.flags != "")
                buddy_var.data.mask[flag_locs] = True

            except OSError as e:
                # file missing, move on to next in sequence
                io.write_error(
                    target_station,
                    "File Missing (Buddy, {}) - {}".format(variable, buddy_id))
                continue
            except ValueError as e:
                # some issue in the raw file
                io.write_error(target_station,
                               "Error in input file (Buddy, {}) - {}".format(
                                   variable, buddy_id),
                               error=str(e))
                continue

            # match the timestamps of target_station and copy over
            match = np.in1d(target_station.times, buddy.times)
            match_back = np.in1d(buddy.times, target_station.times)

            if True in match and True in match_back:
                # only copy where timestamps overlap (skip entirely if none)
                all_buddy_data[bid, match] = buddy_var.data[match_back]

        if diagnostics:
            print("All buddies read in")

        #*************************
        # find differences
        differences = all_buddy_data - obs_var.data

        #*************************
        # find spread of differences on monthly basis (with minimum value)
        spreads = np.ma.zeros(differences.shape)

        for month in range(1, 13):

            month_locs = np.where(target_station.months == month)

            for bid, buddy in enumerate(differences):

                if len(differences[bid, month_locs].compressed()
                       ) > utils.DATA_COUNT_THRESHOLD:

                    this_spread = utils.spread(differences[bid, month_locs])
                    if this_spread < MIN_SPREAD:
                        spreads[bid, month_locs] = MIN_SPREAD
                    else:
                        spreads[bid, month_locs] = this_spread

                else:
                    spreads[bid, month_locs] = MIN_SPREAD

        spreads.mask = np.copy(differences.mask)

        # store which entries may be sufficient to flag
        dubious = np.ma.zeros(differences.shape)
        dubious.mask = np.copy(differences.mask)

        #*************************
        # adjust for storms
        if variable in ["sea_level_pressure", "station_level_pressure"]:
            distant, = np.where(initial_neighbours[:, 1].astype(int) > 100)
            if len(distant) > 0:
                # find large positive and negative differences across the distant neighbours
                positive = np.ma.where(
                    differences[distant] > spreads[distant] * SPREAD_LIMIT)
                # note the sign: negatives must exceed the spread in the negative direction
                negative = np.ma.where(
                    differences[distant] < -spreads[distant] * SPREAD_LIMIT)

                # spin through each neighbour
                for dn, dist_neigh in enumerate(distant):

                    pos, = np.where(positive[0] == dn)
                    neg, = np.where(negative[0] == dn)

                    if len(neg) > 0:
                        ratio = len(neg) / (len(pos) + len(neg))
                        if ratio > 0.667:
                            # majority negative, only flag the positives [definitely not storms]
                            dubious[dist_neigh, positive[1][pos]] = 1

            else:
                # all stations close by so storms shouldn't affect, include all
                # note where differences exceed the spread
                dubious_locs = np.ma.where(
                    np.ma.abs(differences) > spreads * SPREAD_LIMIT)
                dubious[dubious_locs] = 1

        else:
            #*************************
            # note where differences exceed the spread [all non pressure variables]
            dubious_locs = np.ma.where(
                np.ma.abs(differences) > spreads * SPREAD_LIMIT)
            dubious[dubious_locs] = 1

        if diagnostics:
            print("cross checks complete - assessing all outcomes")
        #*************************
        # sum across neighbours
        neighbour_count = np.ma.count(differences, axis=0)
        dubious_count = np.ma.sum(dubious, axis=0)

        # flag if large enough fraction (>0.66)
        sufficient, = np.ma.where(dubious_count > 0.66 * neighbour_count)
        flags[sufficient] = "N"

        if plots:
            for flag in sufficient:
                plot_neighbour_flags(target_station.times, flag, obs_var,
                                     all_buddy_data)

        # append flags to object
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:

            print("Neighbour Outlier {}".format(obs_var.name))
            print("   Cumulative number of flags set: {}".format(
                len(np.where(flags != "")[0])))

    return  # neighbour_outlier
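The core of this check is a consensus rule: a timestamp is flagged only when more than two-thirds of the neighbours reporting at that time consider the difference dubious. A minimal, self-contained numpy sketch of that rule, using hypothetical data rather than the HadISD station objects:

import numpy as np

# neighbours x timestamps; NaN marks "this neighbour has no data here"
dubious = np.ma.masked_invalid([[1., 0., np.nan],
                                [1., 1., np.nan],
                                [1., 0., 1.]])

neighbour_count = np.ma.count(dubious, axis=0)  # neighbours reporting
dubious_count = np.ma.sum(dubious, axis=0)      # neighbours objecting

flags = np.array(["" for _ in range(dubious.shape[1])]).astype("<U10")
sufficient, = np.ma.where(dubious_count > 0.66 * neighbour_count)
flags[sufficient] = "N"
print(flags)   # ['N' '' 'N'] -- both flagged timestamps exceed the 2/3 rule

Note that the last timestamp is flagged on the word of a single reporting neighbour; the full routine guards against this upstream by requiring utils.MIN_NEIGHBOURS before running at all.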
Example #2
except IOError:
    print "station list not found"
    sys.exit()

all_flag_sums = np.zeros([len(station_info), len(qc_test) + 6])
all_flag_pct = np.zeros([len(station_info), len(qc_test)])

Lons = []
Lats = []

uk_stns = []

for st, stat in enumerate(station_info):

    # set up station
    station = utils.Station(stat[0], float(stat[1]), float(stat[2]),
                            float(stat[3]))

    #    if station.id[:2] != "03":
    #        continue
    print st, station.id

    # read attributes and qc_flags
    ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"),
               station,
               process_vars, [],
               diagnostics=diagnostics)

    # sum qc_flags:
    # remove multi-level flagging
    qc_flags = station.qc_flags[:]
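The "remove multi-level flagging" step is not shown in this excerpt; a plausible reading, sketched below under that assumption, is that flag values above 1 are collapsed to 1 so each test contributes at most one count per observation before summing:

import numpy as np

qc_flags = np.array([[0, 2, 1],
                     [5, 0, 0],
                     [0, 0, 1]])        # observations x tests (hypothetical)

qc_flags[qc_flags > 1] = 1              # each test now contributes 0 or 1
flag_sums = qc_flags.sum(axis=0)        # total flags per test
flag_pct = 100. * flag_sums / qc_flags.shape[0]
print(flag_sums)                        # [1 1 2]
print(flag_pct)                         # [33.33... 33.33... 66.66...]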
Example #3
def neighbour_checks(station_info, restart_id = "", end_id = "", distances=np.array([]), angles=np.array([]), second = False, masking = False, doZip=False, plots = False, diagnostics = False):
    """
    Run through neighbour checks on list of stations passed
    
    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations
    :param bool doZip: gzip the raw files once processing is complete
    :param bool plots: create plots from each test
    :param bool diagnostics: print extra material to screen

    """
    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncate the array
    neighbour_elevations = np.array(station_info[:,3], dtype=float) 
    neighbour_ids        = np.array(station_info[:,0])
    neighbour_info       = np.array(station_info[:,:])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:,0] == restart_id)


    if end_id != "":
        endindex, = np.where(station_info[:,0] == end_id)
        if endindex != len(station_info) -1:
            station_info = station_info[startindex: endindex+1]
            distances = distances[startindex:endindex+1,:]
            angles = angles[startindex:endindex+1,:]
        else:
            station_info = station_info[startindex:]
            distances = distances[startindex:,:]
            angles = angles[startindex:,:]
    else:
        station_info = station_info[startindex:]
        distances = distances[startindex:,:]
        angles = angles[startindex:,:]
        

    # process each neighbour
    for st, stat in enumerate(station_info):       

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS+stat[0]+'.log','a') # append to file if second iteration.
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:

            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics)

            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
            else:
                logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics)
            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
            else:
                logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)


        # select neighbours
        neighbour_distances  = distances[st,:]
        neighbour_bearings   = angles[st,:]

        # have to add in start index so that can use location in distance file.
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour","Distance","Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n])

        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour","Distance","Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                #      but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                
                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second = second, diagnostics = diagnostics, plots = plots)


                    # now read in final set of neighbours and process

                    neigh_flags = np.zeros(len(station.time.data)) # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data)) # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data)) # accumulated dewpoint depression flags from neighbours
                    reporting_accuracies = np.zeros(len(neighbours)) # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)]) # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id = False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance = neighbour_distances[nn_loc], diagnostics = diagnostics, plots = plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh,variable).data)

                        dpd_flags += neigh.qc_flags[:,31]
                    # gone through all neighbours


                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)            
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3),(neigh_flags[some_flags].astype("float")/neigh_count[some_flags] > 2./3.)))

                    # mark with -1 where fewer than 3 neighbours report
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1

                    else:
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))


                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots = plots, diagnostics = diagnostics)

                else:
                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))
                    

            # variable loop
        else:
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")
        
        # clean up months 

        qc_tests.clean_up.clu(station, ["temperatures","dewpoints","slp","windspeeds","winddirs"], [44,45,46,47,48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots = plots, diagnostics = diagnostics)


        if diagnostics or plots: raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            # gzip the raw file
 

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile, FLAG_COL_DICT)

            # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            logfile.close()
            
    # looped through all stations

    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):       
            if first:
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal.nc")])
                if masking:
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external.nc")])
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask.nc")])

            elif second:
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal2.nc")])
                if masking:
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external2.nc")])
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask2.nc")])

    print "Neighbour Checks completed\n"

    return # neighbour_checks 
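The restart_id/end_id bookkeeping above reduces to slicing the station array between two matching identifiers. A minimal sketch with hypothetical station rows:

import numpy as np

station_info = np.array([["A1", "50.0", "-1.0", "10"],
                         ["B2", "51.0", "-2.0", "20"],
                         ["C3", "52.0", "-3.0", "30"],
                         ["D4", "53.0", "-4.0", "40"]])

restart_id, end_id = "B2", "C3"
startindex = int(np.where(station_info[:, 0] == restart_id)[0][0])
endindex = int(np.where(station_info[:, 0] == end_id)[0][0])

# inclusive of the end station, as in the code above
station_info = station_info[startindex:endindex + 1]
print(station_info[:, 0])   # ['B2' 'C3']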
Example #4
def internal_checks(station_info,
                    restart_id="",
                    end_id="",
                    second=False,
                    all_checks=True,
                    duplicate=False,
                    odd=False,
                    frequent=False,
                    diurnal=False,
                    gap=False,
                    records=False,
                    streaks=False,
                    climatological=False,
                    spike=False,
                    humidity=False,
                    cloud=False,
                    variance=False,
                    winds=False,
                    diagnostics=False,
                    plots=False):
    '''
    Run through internal checks on list of stations passed
    
    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool second: do the second run 

    :param bool all_checks: run all the checks

    :param bool duplicate/odd/frequent/diurnal/gap/records/streaks/
                climatological/spike/humidity/cloud/variance/winds: run each test separately
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test [many files if all stations/all tests]

    '''
    first = not second

    if all_checks:
        duplicate = True
        odd = True
        frequent = True
        diurnal = True
        gap = True
        records = True
        streaks = True
        climatological = True
        spike = True
        humidity = True
        cloud = True
        variance = True
        winds = True
    else:
        print "single tests selected"

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex:endindex + 1]
        else:
            station_info = station_info[startindex:]
    else:
        station_info = station_info[startindex:]

    for st, stat in enumerate(station_info):

        # if st%100 != 0: continue # do every nth station

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "{:35s} {:d}/{:d}".format("Station Number : ", st + 1,
                                        len(station_info))
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if plots or diagnostics:
            logfile = ""
        else:
            if first:
                logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'w')
            elif second:
                logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log',
                               'a')  # append to file if second iteration.
            logfile.write(
                dt.datetime.strftime(dt.datetime.now(),
                                     "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Internal Checks\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :",
                                               stat[0]))

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]),
                                float(stat[3]))

        # latitude and longitude check

        if np.abs(station.lat) > 90.:
            if plots or diagnostics:
                print "{} {} {} {} {} {} {}\n".format(\
                        station.id,"Latitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical latitude {}".format(station.lat))
            else:
                logfile.write("{} {} {} {} {} {} {}\n".format(\
                        station.id,"Latitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical latitude {}".format(station.lat)))
                logfile.close()

            continue

        if np.abs(station.lon) > 180.:
            if plots or diagnostics:
                print "{} {} {} {} {} {} {}\n".format(\
                    station.id,"Longitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical longitude {}".format(station.lon))
            else:
                logfile.write("{} {} {} {} {} {} {}\n".format(\
                        station.id,"Longitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical longitude {}".format(station.lon)))
                logfile.close()
            continue

        # if running through the first time
        if first:

            if os.path.exists(
                    os.path.join(NETCDF_DATA_LOCS, station.id + ".nc.gz")):
                # if gzip file, unzip here
                subprocess.call([
                    "gunzip",
                    os.path.join(NETCDF_DATA_LOCS, station.id + ".nc.gz")
                ])
                time.sleep(5)  # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + ".nc"),
                       station,
                       process_vars,
                       opt_var_list=carry_thru_vars,
                       diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",
                                            len(station.time.data))
            else:
                logfile.write("{:35s}  {}\n".format(
                    "Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars,
                                                       DATASTART, DATAEND,
                                                       carry_thru_vars)

            station.qc_flags = np.zeros(
                [len(station.time.data),
                 69])  # changed to include updated wind tests

            # get reporting accuracies and frequencies.

            for var in process_vars:

                st_var = getattr(station, var)
                st_var.reporting_stats = utils.monthly_reporting_statistics(
                    st_var, DATASTART, DATAEND)

        # or if second pass through?
        elif second:
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"),
                       station,
                       process_vars,
                       opt_var_list=carry_thru_vars,
                       diagnostics=diagnostics)
            print "{:35s}  {}\n".format("Total station record size :",
                                        len(station.time.data))

            match_to_compress = utils.create_fulltimes(station, process_vars,
                                                       DATASTART, DATAEND,
                                                       carry_thru_vars)

        # Add history text to netcdf file
        # Reporting Changes - TODO

        # Duplicate months - check on temperature ONLY
        if duplicate:
            qc_tests.duplicate_months.dmc(station, ['temperatures'],
                                          process_vars, [0],
                                          DATASTART,
                                          DATAEND,
                                          logfile,
                                          diagnostics=diagnostics,
                                          plots=plots)

        # Odd Clusters
        if odd:
            qc_tests.odd_cluster.occ(
                station, ['temperatures', 'dewpoints', 'windspeeds', 'slp'],
                [54, 55, 56, 57],
                DATASTART,
                logfile,
                diagnostics=diagnostics,
                plots=plots,
                second=second)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Frequent Values
        if frequent:
            qc_tests.frequent_values.fvc(station,
                                         ['temperatures', 'dewpoints', 'slp'],
                                         [1, 2, 3],
                                         DATASTART,
                                         DATAEND,
                                         logfile,
                                         diagnostics=diagnostics,
                                         plots=plots)

        # Diurnal Cycle
        if diurnal:
            if np.abs(station.lat) <= 60.:
                qc_tests.diurnal_cycle.dcc(station, ['temperatures'],
                                           process_vars, [4],
                                           logfile,
                                           diagnostics=diagnostics,
                                           plots=plots)

            else:
                if plots or diagnostics:
                    print "Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(
                        station.lat)
                else:
                    logfile.write(
                        "Diurnal Cycle Check not run as station latitude ({}) > 60\n"
                        .format(station.lat))

        # Distributional Gap
        if gap:
            qc_tests.distributional_gap.dgc(
                station, ['temperatures', 'dewpoints', 'slp'], [5, 6, 7],
                DATASTART,
                DATAEND,
                logfile,
                diagnostics=diagnostics,
                plots=plots,
                GH=True)

        # Records
        if records:
            qc_tests.records.krc(
                station, ['temperatures', 'dewpoints', 'windspeeds', 'slp'],
                [8, 9, 10, 11],
                logfile,
                diagnostics=diagnostics,
                plots=plots)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Streaks and Repetitions
        if streaks:
            qc_tests.streaks.rsc(
                station,
                ['temperatures', 'dewpoints', 'windspeeds', 'slp', 'winddirs'],
                [[12, 16, 20], [13, 17, 21], [14, 18, 22], [15, 19, 23],
                 [66, 67, 68]],
                DATASTART,
                DATAEND,
                logfile,
                diagnostics=diagnostics,
                plots=plots)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Climatological Outlier
        if climatological:
            qc_tests.climatological.coc(station, ['temperatures', 'dewpoints'],
                                        [24, 25],
                                        DATASTART,
                                        DATAEND,
                                        logfile,
                                        diagnostics=diagnostics,
                                        plots=plots)
            # column 26 kept spare for slp

        # Spike
        if spike:
            qc_tests.spike.sc(
                station, ['temperatures', 'dewpoints', 'slp', 'windspeeds'],
                [27, 28, 29, 65],
                DATASTART,
                DATAEND,
                logfile,
                diagnostics=diagnostics,
                plots=plots,
                second=second)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Humidity cross checks
        if humidity:
            qc_tests.humidity.hcc(station, [30, 31, 32],
                                  DATASTART,
                                  DATAEND,
                                  logfile,
                                  diagnostics=diagnostics,
                                  plots=plots)

        # Cloud cross check
        if cloud:
            qc_tests.clouds.ccc(station, [33, 34, 35, 36, 37, 38, 39, 40],
                                logfile,
                                diagnostics=diagnostics,
                                plots=plots)

        # Variance
        if variance:
            qc_tests.variance.evc(
                station, ['temperatures', 'dewpoints', 'slp', 'windspeeds'],
                [58, 59, 60, 61],
                DATASTART,
                DATAEND,
                logfile,
                diagnostics=diagnostics,
                plots=plots)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Winds
        if winds:
            qc_tests.winds.wdc(station, [62, 63, 64],
                               DATASTART,
                               DATAEND,
                               logfile,
                               diagnostics=diagnostics,
                               plots=plots)

        # are flags actually applied?

        if diagnostics or plots: raw_input("stop")

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS,
                                     station.id + "_internal.nc"),
                        station,
                        process_vars,
                        os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                        opt_var_list=carry_thru_vars,
                        compressed=match_to_compress,
                        processing_date='',
                        qc_code_version=qc_code_version)
            # gzip the raw file
            subprocess.call(
                ["gzip",
                 os.path.join(NETCDF_DATA_LOCS, station.id + ".nc")])

        elif second:

            ncdfp.write(os.path.join(NETCDF_DATA_LOCS,
                                     station.id + "_internal2.nc"),
                        station,
                        process_vars,
                        os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                        opt_var_list=carry_thru_vars,
                        compressed=match_to_compress,
                        processing_date='',
                        qc_code_version=qc_code_version)
            # gzip the raw file
            subprocess.call([
                "gzip",
                os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc")
            ])

        if not (plots or diagnostics):
            # logfile is an empty string when running interactively
            logfile.write(
                dt.datetime.strftime(dt.datetime.now(),
                                     "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write(
                "processing took {:4.0f}s\n\n".format(time.time() -
                                                      process_start_time))
            logfile.close()

    print "Internal Checks completed\n"

    return  # internal_checks
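The latitude/longitude sanity test at the top of the station loop can be factored into a small standalone helper; a sketch (the function name is illustrative, not part of the codebase):

def unphysical_coords(lat, lon):
    """Return a reason string if the coordinates are unphysical, else None."""
    if abs(lat) > 90.:
        return "Unphysical latitude {}".format(lat)
    if abs(lon) > 180.:
        return "Unphysical longitude {}".format(lon)
    return None

print(unphysical_coords(91.2, 10.))   # Unphysical latitude 91.2
print(unphysical_coords(55., 10.))    # None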
Example #5
def run_checks(restart_id="",
               end_id="",
               diagnostics=False,
               plots=False,
               full=False,
               test="all"):
    """
    Main script.  Reads in station data, populates internal objects and passes to the tests.

    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values.
    :param str test: specify a single test to run (useful for diagnostics) [logic/odd_cluster/frequent/diurnal/
                     distribution/world_records/streaks/climatological/timestamp/spike/humidity/variance/pressure/winds/high_flag]
    """

    # process the station list
    station_list = utils.get_station_list(restart_id=restart_id, end_id=end_id)

    station_IDs = station_list.id

    # now spin through each ID in the curtailed list
    for st, station_id in enumerate(station_IDs):
        print("{} {:11s} ({}/{})".format(dt.datetime.now(), station_id, st + 1,
                                         station_IDs.shape[0]))

        startT = dt.datetime.now()
        # set up config file to hold thresholds etc
        config_file = os.path.join(setup.SUBDAILY_CONFIG_DIR,
                                   "{:11s}.config".format(station_id))
        if full:
            try:
                # recreating, so remove completely
                os.remove(config_file)
            except IOError:
                pass

        #*************************
        # set up the stations
        station = utils.Station(station_id, station_list.latitude[st],
                                station_list.longitude[st],
                                station_list.elevation[st])
        if diagnostics:
            print(station)

        try:
            station, station_df = io.read_station(
                os.path.join(setup.SUBDAILY_MFF_DIR,
                             "{:11s}.mff".format(station_id)), station)
        except OSError as e:
            # file missing, move on to next in sequence
            io.write_error(station, "File Missing")
            continue
        except ValueError as e:
            # some issue in the raw file
            io.write_error(station, "Error in input file", error=str(e))
            continue

        # some may have no data (for whatever reason)
        if station.times.shape[0] == 0:
            if diagnostics:
                print("No data in station {}".format(station.id))
            # scoot onto next station
            io.write_error(station, "No data in input file")
            continue

        #*************************
        # Add the country and continent
        station.country = utils.find_country_code(station.lat, station.lon)
        station.continent = utils.find_continent(station.country)

        #*************************
        """
        HadISD tests and order

        Duplicated months
        Odd Clusters of data - need to address output with buddy checks in due course.
        Frequent Values - tick
        Diurnal Cycle
        Gaps in distributions - tick
        World Records - tick
        Repeated values (streaks or just too common short ones) - partial tick
        Climatology - tick
        Spike - tick
        Humidity Cross checks - super saturation, dewpoint depression, dewpoint cut off - tick (dewpoint cut off not applied)
        Cloud logical checks - clouds not in C3S 311a @Aug 2019
        Excess Variance - partial tick
        Winds (logical wind & wind rose) - logical tick.  Not sure if wind rose is robust enough
        Logical SLP/StnLP - tick
        Precipitation logical checks - precip not in C3S 311a @Aug 2019
        """
        #*************************
        if test in ["all", "logic"]:
            # incl lat, lon and elev checks
            #
            print("L", dt.datetime.now() - startT)
            good_metadata = qc_tests.logic_checks.lc(station, [
                "temperature", "dew_point_temperature",
                "station_level_pressure", "sea_level_pressure", "wind_speed",
                "wind_direction"
            ],
                                                     full=full,
                                                     plots=plots,
                                                     diagnostics=diagnostics)

            if good_metadata != 0:
                print("Issue with station metadata")
                # skip on to next one
                continue

        if test in ["all", "odd_cluster"]:
            print("O", dt.datetime.now() - startT)
            # TODO - use suite config file to store all settings for tests
            qc_tests.odd_cluster.occ(station, [
                "temperature", "dew_point_temperature",
                "station_level_pressure", "sea_level_pressure", "wind_speed"
            ],
                                     config_file,
                                     full=full,
                                     plots=plots,
                                     diagnostics=diagnostics)

        if test in ["all", "frequent"]:
            print("F", dt.datetime.now() - startT)
            qc_tests.frequent.fvc(station, [
                "temperature", "dew_point_temperature",
                "station_level_pressure", "sea_level_pressure"
            ],
                                  config_file,
                                  full=full,
                                  plots=plots,
                                  diagnostics=diagnostics)

        # HadISD only runs on stations where latitude lower than 60(N/S)
        # Takes a long time, this one
        if test in ["all", "diurnal"]:
            print("U", dt.datetime.now() - startT)
            if np.abs(station.lat) < 60:
                qc_tests.diurnal.dcc(station,
                                     config_file,
                                     full=full,
                                     plots=plots,
                                     diagnostics=diagnostics)

        if test in ["all", "distribution"]:
            print("D", dt.datetime.now() - startT)
            qc_tests.distribution.dgc(station, [
                "temperature", "dew_point_temperature",
                "station_level_pressure", "sea_level_pressure"
            ],
                                      config_file,
                                      full=full,
                                      plots=plots,
                                      diagnostics=diagnostics)

        if test in ["all", "world_records"]:
            print("W", dt.datetime.now() - startT)
            qc_tests.world_records.wrc(station, [
                "temperature", "dew_point_temperature", "sea_level_pressure",
                "wind_speed"
            ],
                                       full=full,
                                       plots=plots,
                                       diagnostics=diagnostics)

        if test in ["all", "streaks"]:
            print("K", dt.datetime.now() - startT)
            qc_tests.streaks.rsc(station, [
                "temperature", "dew_point_temperature",
                "station_level_pressure", "sea_level_pressure", "wind_speed",
                "wind_direction"
            ],
                                 config_file,
                                 full=full,
                                 plots=plots,
                                 diagnostics=diagnostics)

        # not run on pressure data in HadISD.
        if test in ["all", "climatological"]:
            print("C", dt.datetime.now() - startT)
            qc_tests.climatological.coc(
                station, ["temperature", "dew_point_temperature"],
                config_file,
                full=full,
                plots=plots,
                diagnostics=diagnostics)

        if test in ["all", "timestamp"]:
            print("T", dt.datetime.now() - startT)
            qc_tests.timestamp.tsc(station, [
                "temperature", "dew_point_temperature",
                "station_level_pressure", "sea_level_pressure", "wind_speed"
            ],
                                   config_file,
                                   full=full,
                                   plots=plots,
                                   diagnostics=diagnostics)

        if test in ["all", "spike"]:
            print("S", dt.datetime.now() - startT)
            qc_tests.spike.sc(station, [
                "temperature", "dew_point_temperature",
                "station_level_pressure", "sea_level_pressure", "wind_speed"
            ],
                              config_file,
                              full=full,
                              plots=plots,
                              diagnostics=diagnostics)

        if test in ["all", "humidity"]:
            print("h", dt.datetime.now() - startT)
            qc_tests.humidity.hcc(station,
                                  config_file,
                                  full=full,
                                  plots=plots,
                                  diagnostics=diagnostics)

        if test in ["all", "variance"]:
            print("V", dt.datetime.now() - startT)
            qc_tests.variance.evc(station, [
                "temperature", "dew_point_temperature",
                "station_level_pressure", "sea_level_pressure", "wind_speed"
            ],
                                  config_file,
                                  full=full,
                                  plots=plots,
                                  diagnostics=diagnostics)

        if test in ["all", "pressure"]:
            print("P", dt.datetime.now() - startT)
            qc_tests.pressure.pcc(station,
                                  config_file,
                                  full=full,
                                  plots=plots,
                                  diagnostics=diagnostics)

        if test in ["all", "winds"]:
            print("w", dt.datetime.now() - startT)
            qc_tests.winds.wcc(station,
                               config_file,
                               fix=True,
                               full=full,
                               plots=plots,
                               diagnostics=diagnostics)

        if test in ["all", "high_flag"]:
            print("H", dt.datetime.now() - startT)
            hfr_vars_set = qc_tests.high_flag.hfr(station, [
                "temperature", "dew_point_temperature",
                "station_level_pressure", "sea_level_pressure", "wind_speed",
                "wind_direction"
            ],
                                                  full=full,
                                                  plots=plots,
                                                  diagnostics=diagnostics)

        print(dt.datetime.now() - startT)

        #*************************
        # Insert flags into Data Frame

        # need to insert columns in correct place
        column_names = station_df.columns.values

        #*************************
        # add QC flag columns to each variable
        #    initialise with blank
        #    need to automate the column identification
        new_column_indices = []
        for c, column in enumerate(station_df.columns):
            if column in setup.obs_var_list:
                new_column_indices += [
                    c + 2
                ]  # 2 offset rightwards from variable's column

        # reverse order so can insert without messing up the indices
        new_column_indices.reverse()
        for index in new_column_indices:
            station_df.insert(
                index, "{}_QC_flag".format(station_df.columns[index - 2]),
                ["" for i in range(station_df.shape[0])], True)

        # # sort source_ID.x columns - purely for first release
        # for c, column in enumerate(station_df.columns):
        #     if "Source_ID" in column:
        #         # replace the NaN with empty string
        #         station_df[column] = station_df[column].fillna('')
        #         # rename the column
        #         variable = station_df.columns[c-1]
        #         station_df = station_df.rename(columns={column : "{}_Source_ID".format(variable)})

        # write in the flag information
        for var in setup.obs_var_list:
            obs_var = getattr(station, var)
            station_df["{}_QC_flag".format(var)] = obs_var.flags

        #*************************
        # Output of QFF
        # write out the dataframe to output format
        if hfr_vars_set > 1:
            # high flagging rates in more than one variable.  Withholding station completely
            print("{} withheld as too high flagging".format(station.id))
            io.write(
                os.path.join(setup.SUBDAILY_BAD_DIR,
                             "{:11s}.qff".format(station_id)), station_df)
        else:
            io.write(
                os.path.join(setup.SUBDAILY_PROC_DIR,
                             "{:11s}.qff".format(station_id)), station_df)

        #*************************
        # Output flagging summary file
        io.flag_write(os.path.join(setup.SUBDAILY_FLAG_DIR,
                                   "{:11s}.flg".format(station_id)),
                      station_df,
                      diagnostics=diagnostics)

        print(dt.datetime.now() - startT)

#        if diagnostics or plots:
#            input("end")
#            break

    return  # run_checks
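The QC-flag column insertion above (a blank "{var}_QC_flag" column two places to the right of each observed variable, inserted right-to-left so the indices stay valid) can be tried in isolation with plain pandas. Here obs_var_list is a hypothetical stand-in for setup.obs_var_list:

import pandas as pd

# hypothetical two-variable frame standing in for the real station data
station_df = pd.DataFrame({"temperature": [12.1],
                           "temperature_Source_ID": ["x"],
                           "wind_speed": [3.0],
                           "wind_speed_Source_ID": ["y"]})
obs_var_list = ["temperature", "wind_speed"]

# QC-flag column goes two places right of each variable's column
new_column_indices = [c + 2 for c, column in enumerate(station_df.columns)
                      if column in obs_var_list]

# insert right-to-left so earlier indices stay valid
for index in reversed(new_column_indices):
    variable = station_df.columns[index - 2]
    station_df.insert(index, "{}_QC_flag".format(variable),
                      ["" for _ in range(station_df.shape[0])],
                      allow_duplicates=True)

print(station_df.columns.tolist())
# ['temperature', 'temperature_Source_ID', 'temperature_QC_flag',
#  'wind_speed', 'wind_speed_Source_ID', 'wind_speed_QC_flag']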
Example #6
def run_checks(restart_id="", end_id="", diagnostics=False, plots=False, full=False, test="all"):
    """
    Main script.  Reads in station data, populates internal objects and passes to the tests.

    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values.
    :param str test: specify a single test to run (useful for diagnostics) [outlier/clean_up/high_flag]
    """

    # process the station list
    station_list = utils.get_station_list(restart_id=restart_id, end_id=end_id)
    station_IDs = station_list.id

    # read in all the neighbours for these stations to hold ready
    all_neighbours = read_neighbours(restart_id=restart_id, end_id=end_id)

    # now spin through each ID in the curtailed list
    for st, target_station_id in enumerate(station_IDs):
        print("{} {} ({}/{})".format(dt.datetime.now(), target_station_id, st+1, station_IDs.shape[0]))

        startT = dt.datetime.now()
        #*************************
        # set up the stations
        target_station = utils.Station(target_station_id, station_list.latitude[st], station_list.longitude[st], station_list.elevation[st])
        if diagnostics:
            print(target_station)

        try:
            target_station, target_station_df = io.read_station(os.path.join(setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(target_station_id)), target_station, read_flags=True)
        except OSError:
            # file missing, move on to next in sequence
            continue

        # some may have no data (for whatever reason)
        if target_station.times.shape[0] == 0:
            if diagnostics:
                print("No data in station {}".format(target_station.id))
            # scoot onto next station
            continue

        # extract neighbours for this station
        nloc, = np.where(all_neighbours[:, 0, 0] == target_station_id)
        initial_neighbours = all_neighbours[nloc].squeeze()

        #*************************
        # TODO: refine neighbours [quadrants, correlation?]
        
        if test in ["all", "outlier"]:
            print("N", dt.datetime.now()-startT)
            qc_tests.neighbour_outlier.noc(target_station, initial_neighbours, \
                                               ["temperature", "dew_point_temperature", "wind_speed", "station_level_pressure", "sea_level_pressure"], full=full, plots=plots, diagnostics=diagnostics)

        if test in ["all", "clean_up"]:
            print("U", dt.datetime.now()-startT)
            qc_tests.clean_up.mcu(target_station, ["temperature", "dew_point_temperature", "station_level_pressure", "sea_level_pressure", "wind_speed", "wind_direction"], full=full, plots=plots, diagnostics=diagnostics)


        if test in ["all", "high_flag"]:
            print("H", dt.datetime.now()-startT)
            hfr_vars_set = qc_tests.high_flag.hfr(target_station, ["temperature", "dew_point_temperature", "station_level_pressure", "sea_level_pressure", "wind_speed", "wind_direction"], full=full, plots=plots, diagnostics=diagnostics)

        print(dt.datetime.now()-startT)

        # write in the flag information
        for var in setup.obs_var_list:
            obs_var = getattr(target_station, var)
            target_station_df["{}_QC_flag".format(var)] = obs_var.flags

        #*************************
        # Output of QFF
        # write out the dataframe to output format
        if hfr_vars_set > 1:
            # high flagging rates in more than one variable.  Withholding station completely
            print("{} withheld as too high flagging".format(target_station.id))
            io.write(os.path.join(setup.SUBDAILY_BAD_DIR, "{:11s}.qff".format(target_station_id)), target_station_df, formatters={"Latitude" : "{:7.4f}", "Longitude" : "{:7.4f}", "Month": "{:02d}", "Day": "{:02d}", "Hour" : "{:02d}", "Minute" : "{:02d}"})
                                                            
        else:
            io.write(os.path.join(setup.SUBDAILY_OUT_DIR, "{:11s}.qff".format(target_station_id)), target_station_df, formatters={"Latitude" : "{:7.4f}", "Longitude" : "{:7.4f}", "Month": "{:02d}", "Day": "{:02d}", "Hour" : "{:02d}", "Minute" : "{:02d}"})

            
        #*************************
        # Output flagging summary file
        io.flag_write(os.path.join(setup.SUBDAILY_FLAG_DIR, "{:11s}.flg".format(target_station_id)), target_station_df, diagnostics=diagnostics)


        print(dt.datetime.now()-startT)

#        input("stop")

    return # run_checks
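io.write is internal to this codebase, but the formatters argument suggests per-column format strings applied before output. A sketch of that idea with plain pandas, as an assumption about io.write's behaviour rather than its actual implementation:

import pandas as pd

station_df = pd.DataFrame({"Latitude": [51.47712], "Month": [3]})
formatters = {"Latitude": "{:7.4f}", "Month": "{:02d}"}

out = station_df.copy()
for col, fmt in formatters.items():
    # .tolist() yields native Python scalars, which all format codes accept
    out[col] = [fmt.format(v) for v in out[col].tolist()]   # 51.47712 -> '51.4771', 3 -> '03'

print(out.to_string(index=False))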
Example #7
def internal_checks(restart_id = "", end_id = "",
                    all_checks = True,
                    duplicate = False,
                    odd = False,
                    frequent = False,
                    diurnal = False,
                    gap = False,
                    records = False,
                    streaks = False,
                    climatological = False,
                    spike = False,
                    humidity = False,
                    cloud = False,
                    variance = False, 
                    winds = False, 
                    pressure = False,
                    precipitation = False,
                    diagnostics = False,
                    plots = False,
                    doMonth = False):
    '''
    Run through internal checks on list of stations passed
    
    :param str restart_id: which station to start on
    :param str end_id: which station to end on

    :param bool all_checks: run all the checks

    :param bool duplicate/odd/frequent/diurnal/gap/records/streaks/climatological/spike/humidity/cloud/variance/winds/pressure/precipitation: run each test separately
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test [many files if all stations/all tests]
    :param bool doMonth: a monthly append process

    '''

    if all_checks:
        duplicate = True
        odd = True
        frequent = True
        diurnal = True
        gap = True
        records = True
        streaks = True
        climatological = True
        spike = True
        humidity = True
        cloud = True
        variance = True
        winds = True
        pressure = True
        precipitation = True
    else:
        print "single tests selected"
        
#    qc_code_version = subprocess.check_output(['svnversion']).strip()
    qc_code_version = subprocess.check_output(['svn', 'info', 'file:///home/h05/rdunn/svn/hadisd_py_qc/branches/monthly/']).decode()
    for line in qc_code_version.split("\n"):
        if line.split(":")[0] == "Revision":
            qc_code_version = line.split(":")[1]
            break

        
    # get station information
    try:
        station_info = np.genfromtxt(os.path.join(INPUT_FILE_LOCS, STATION_LIST), dtype=(str))
    except IOError:
        print "station list not found"
        sys.exit()

    # sort truncated run
    startindex = [0]
    if restart_id != "":
        startindex, = np.where(station_info[:,0] == restart_id)


    if end_id != "":
        endindex, = np.where(station_info[:,0] == end_id)
        if endindex[0] != len(station_info) - 1:
            station_info = station_info[startindex[0]: endindex[0]+1]
        else:
            station_info = station_info[startindex[0]:]
    else:
        station_info = station_info[startindex[0]:]
        

    for st,stat in enumerate(station_info):     

        # if st%100 != 0: continue # do every nth station
  
        print(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S"))
        print("{:35s} {:d}/{:d}".format("Station Number : ", st + 1, len(station_info)))
        print("{:35s} {}".format("Station Identifier :", stat[0]))
        if doMonth: print("Running with incomplete final year")

        # set up the log file
        logfile = open(LOG_OUTFILE_LOCS+stat[0]+'.log','w')
        logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
        logfile.write("Internal Checks\n")
        logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # latitude and longitude check
        if np.abs(station.lat) > 90.:
            if plots or diagnostics:
                print "{} {} {} {} {} {} {}\n".format(\
                        station.id,"Latitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical latitude {}".format(station.lat))
            else:
                logfile.write("{} {} {} {} {} {} {}\n".format(\
                        station.id,"Latitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical latitude {}".format(station.lat)))
                logfile.close()

            continue

        # check if station longitude outside of bounds
        if np.abs(station.lon) > 180.:       
            if plots or diagnostics:
                print "{} {} {} {} {} {} {}\n".format(\
                    station.id,"Longitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical longitude {}".format(station.lon))
            else:
                logfile.write("{} {} {} {} {} {} {}\n".format(\
                        station.id,"Longitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical longitude {}".format(station.lon)))
                logfile.close()
            continue

        # check if file is zipped
        if os.path.exists(os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc.gz".format(LONG_VERSION, END_TIME, station.id))):
            # if gzip file, unzip here
            subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc.gz".format(LONG_VERSION, END_TIME, station.id))])
            time.sleep(5) # make sure it is unzipped before proceeding

        # read in the data
        ncdfp.read(os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc".format(LONG_VERSION, END_TIME, station.id)), station, process_vars, opt_var_list = carry_thru_vars, diagnostics = diagnostics)

        if plots or diagnostics:
            print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
        else:
            logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

        match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        station.qc_flags = np.zeros([len(station.time.data),71]) # changed to include updated wind tests, station level pressure & precipitation

        # get reporting accuracies and frequencies.

        for var in process_vars:

            st_var = getattr(station, var)
            st_var.reporting_stats = utils.monthly_reporting_statistics(st_var, DATASTART, DATAEND)


        # Add history text to netcdf file
        # Reporting Changes - TODO

        # Duplicate months - check on temperature ONLY
        if duplicate:
            # no change as result of incomplete year
            qc_tests.duplicate_months.dmc(station, ['temperatures'], process_vars, [0], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots)

        # Odd Clusters
        if odd:
            # no change as result of incomplete year
            qc_tests.odd_cluster.occ(station,['temperatures','dewpoints','windspeeds','slp'], [54,55,56,57], DATASTART, logfile, diagnostics = diagnostics, plots = plots)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)

        # Frequent Values
        if frequent:
            qc_tests.frequent_values.fvc(station, ['temperatures', 'dewpoints','slp'], [1,2,3], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Diurnal Cycle 
        if diurnal:
            if np.abs(station.lat) <= 60.:
                qc_tests.diurnal_cycle.dcc(station, ['temperatures'], process_vars, [4], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
                
            else:
                if plots or diagnostics:
                    print "Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(station.lat)
                else:
                    logfile.write("Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(station.lat))

        # Distributional Gap
        if gap:
            qc_tests.distributional_gap.dgc(station, ['temperatures','dewpoints','slp'], [5,6,7], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, GH = True, doMonth = doMonth)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Records 
        if records:
            qc_tests.records.krc(station, ['temperatures','dewpoints','windspeeds','slp'], [8,9,10,11], logfile, diagnostics = diagnostics, plots = plots)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Streaks and Repetitions 
        if streaks:
            qc_tests.streaks.rsc(station, ['temperatures','dewpoints','windspeeds','slp','winddirs'], [[12,16,20],[13,17,21],[14,18,22],[15,19,23],[66,67,68]], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Climatological Outlier
        if climatological:
            qc_tests.climatological.coc(station, ['temperatures','dewpoints'], [24,25], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
            # column 26 kept spare for slp

        # Spike
        if spike:
            qc_tests.spike.sc(station, ['temperatures','dewpoints','slp','windspeeds'], [27,28,29,65], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Humidity cross checks
        if humidity:
            qc_tests.humidity.hcc(station, [30,31,32], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots)

        # Cloud cross check
        if cloud:
            qc_tests.clouds.ccc(station, [33,34,35,36,37,38,39,40], logfile, diagnostics = diagnostics, plots = plots)

        # Variance
        if variance:
            qc_tests.variance.evc(station, ['temperatures','dewpoints','slp','windspeeds'], [58,59,60,61], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth) 
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Winds
        if winds:
            qc_tests.winds.wdc(station, [62,63,64], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)

        # Pressure
        if pressure:
            qc_tests.pressure.spc(station, [69], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)

        # Precipitation
        if precipitation:
            qc_tests.precipitation.pcc(station, [70], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots)


        # are flags actually applied?
        sys.stdout.flush()
        if diagnostics or plots: input("stop")

        # write to file
        ncdfp.write(os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_internal.nc".format(LONG_VERSION, END_TIME, station.id)), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = dt.datetime.strftime(dt.datetime.now(), "%d-%b-%Y"), qc_code_version = qc_code_version)
        # gzip the raw file
        subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc".format(LONG_VERSION, END_TIME, station.id))])


        logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
        logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
        logfile.close()

        # clean up
        gc.collect()

    print "Internal Checks completed\n"

    return # internal_checks
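The restart_id/end_id handling at the top of internal_checks can be read as a single slicing operation; a self-contained sketch, assuming the station list is a 2-D array whose first column holds station IDs (truncate_station_list is a hypothetical name):

import numpy as np

def truncate_station_list(station_info, restart_id="", end_id=""):
    """Return the rows of station_info from restart_id to end_id inclusive."""
    start = 0
    if restart_id != "":
        locs, = np.where(station_info[:, 0] == restart_id)
        start = locs[0]
    end = len(station_info)
    if end_id != "":
        locs, = np.where(station_info[:, 0] == end_id)
        end = locs[0] + 1
    return station_info[start:end]

stations = np.array([["000001", "50.0"], ["000002", "51.0"], ["000003", "52.0"]])
print(truncate_station_list(stations, restart_id="000002"))  # last two rows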
Example No. 8
def main():
    """
    Main plot function - no inputs.  Runs from settings in set_paths_and_vars.
    """

    qc_test_names = make_test_dictionary()

    try:
        station_info = np.genfromtxt(os.path.join(INPUT_FILE_LOCS,
                                                  STATION_LIST),
                                     dtype=(str))
    except IOError:
        print "station list not found"
        sys.exit()

    all_flag_sums = np.zeros([len(station_info), len(qc_test)])
    all_flag_pct = np.zeros([len(station_info), len(qc_test)])

    Lons = []
    Lats = []

    uk_stns = []

    for st, stat in enumerate(station_info):

        # set up station
        station = utils.Station(stat[0], float(stat[1]), float(stat[2]),
                                float(stat[3]))

        #        if station.id[:2] != "03":
        #            continue
        print(st, station.id)

        # read attributes and qc_flags
        try:
            ncdfp.read(os.path.join(
                NETCDF_DATA_LOCS,
                "hadisd.{}_19310101-{}_{}.nc".format(LONG_VERSION, END_TIME,
                                                     station.id)),
                       station,
                       process_vars,
                       diagnostics=diagnostics)

            # sum qc_flags:
            # remove multi-level flagging
            qc_flags = station.qc_flags[:]

            qc_flags[qc_flags[:] > 1] = 1

            # remove multi-level flagging - neighbour flags
            no_neighbours = qc_flags[qc_flags[:] == -1].size
            qc_flags[qc_flags[:] < 0] = 0

            total_flags = qc_flags[qc_flags[:] != 0].size

            sum_flags = np.sum(qc_flags[:], axis=0)  # 71 column array

            for cols in [
                    strT_QC, strD_QC, strWS_QC, strWD_QC, strS_QC, T_QC, D_QC,
                    S_QC, WS_QC, WD_QC, C_QC
            ]:

                # to prevent double counting of flags on individual time stamps, re-sum
                combined_flags = np.sum(np.max(qc_flags[:, cols], axis=1))

                sum_flags = np.append(sum_flags, combined_flags)

            all_flag_sums[st] = sum_flags

            # now do percentage flagged of total obs

            pct_flag = np.zeros(len(qc_test), dtype=float)

            for t, test in enumerate(qc_test):

                if t in T_QC:
                    if station.temperatures.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[
                            t] / station.temperatures.data.compressed().size
                elif t in D_QC:
                    if station.dewpoints.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[
                            t] / station.dewpoints.data.compressed().size
                elif t in S_QC:
                    if station.slp.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[
                            t] / station.slp.data.compressed().size
                elif t in WS_QC:
                    if station.windspeeds.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[
                            t] / station.windspeeds.data.compressed().size
                elif t in WD_QC:
                    if station.winddirs.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[
                            t] / station.winddirs.data.compressed().size
                elif t in C_QC:
                    if station.total_cloud_cover.data.compressed().size > 0:
                        pct_flag[t] = sum_flags[
                            t] / station.total_cloud_cover.data.size
                else:
                    if station.temperatures.data.compressed().size > 0:
                        pct_flag[
                            t] = sum_flags[t] / station.temperatures.data.size

            all_flag_pct[st] = 100. * pct_flag

            # get occasions when more locations are flagged than have data.

            over_100, = np.where(all_flag_pct[st] > 100.)
            all_flag_pct[st][over_100] = 100.

            Lons += [station.lon]
            Lats += [station.lat]
            uk_stns += [st]
        except RuntimeError:
            # file doesn't exist
            pass

    Lats = np.array(Lats)
    Lons = np.array(Lons)

    outfile = open(
        INPUT_FILE_LOCS + "all_fails_summary_{}.dat".format(start_time_string),
        'w')

    for t, test in enumerate(qc_test):

        plt.figure(figsize=(8, 6))
        plt.clf()
        ax = plt.axes([0, 0, 1, 1], projection=ccrs.Robinson())
        ax.set_global()
        ax.coastlines('50m')
        try:
            ax.gridlines(draw_labels=True)
        except TypeError:
            ax.gridlines()

        # colors are the exact same RGB codes as in IDL
        colors = [(150, 150, 150), (41, 10, 216), (63, 160, 255),
                  (170, 247, 255), (255, 224, 153), (247, 109, 94),
                  (165, 0, 33), (0, 0, 0)]
        limits = [0.0, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 100.]

        all_locs = []

        for u, upper in enumerate(limits):

            if u == 0:
                locs, = np.where(all_flag_pct[uk_stns, t] == 0)
                label = "{}%: {}".format(upper, len(locs))
            else:
                locs, = np.where(
                    np.logical_and(all_flag_pct[uk_stns, t] <= upper,
                                   all_flag_pct[uk_stns, t] > limits[u - 1]))
                label = ">{} to {}%: {}".format(limits[u - 1], upper,
                                                len(locs))
                if upper == limits[-1]:
                    label = ">{}%: {}".format(limits[u - 1], len(locs))

            if len(locs) > 0:
                ax.scatter(Lons[locs],
                           Lats[locs],
                           transform=ccrs.Geodetic(),
                           s=15,
                           c=tuple([float(c) / 255 for c in colors[u]]),
                           edgecolors="none",
                           label=label)

            else:
                ax.scatter([0], [-90],
                           transform=ccrs.Geodetic(),
                           s=15,
                           c=tuple([float(c) / 255 for c in colors[u]]),
                           edgecolors="none",
                           label=label)

            all_locs += [len(locs)]

        plt.title(qc_test_names[test])
        watermarkstring = "/".join(
            os.getcwd().split('/')[4:]) + '/' + os.path.basename(
                __file__) + "   " + dt.datetime.strftime(
                    dt.datetime.now(), "%d-%b-%Y %H:%M")
        plt.figtext(0.01, 0.01, watermarkstring, size=5)

        leg = plt.legend(loc='lower center',
                         ncol=4,
                         bbox_to_anchor=(0.5, -0.2),
                         frameon=False,
                         title='',
                         prop={'size': 11},
                         labelspacing=0.15,
                         columnspacing=0.5,
                         numpoints=1)

        plt.savefig(IMAGE_LOCS +
                    "All_fails_{}_{}.png".format(test, start_time_string))
        plt.close()

        outfile.write("{:10s}".format(test) +
                      ''.join(['%7i' % n for n in all_locs]) + ''.join([
                          "%7.1f" % n
                          for n in [100. * n / len(Lats) for n in all_locs]
                      ]) + "\n")

    outfile.close()

    return  # main
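The legend construction above places each station into one of eight fixed percentage classes (exactly 0, then (0, 0.1], (0.1, 0.2], and so on up to 100); the binning on its own, as a sketch with hypothetical names:

import numpy as np

def bin_percentages(pct, limits=(0.0, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 100.0)):
    """Yield (label, indices) pairs matching the class boundaries used for the maps."""
    pct = np.asarray(pct)
    for u, upper in enumerate(limits):
        if u == 0:
            locs, = np.where(pct == 0)
            label = "{}%: {}".format(upper, len(locs))
        elif upper == limits[-1]:
            locs, = np.where(np.logical_and(pct <= upper, pct > limits[u - 1]))
            label = ">{}%: {}".format(limits[u - 1], len(locs))
        else:
            locs, = np.where(np.logical_and(pct <= upper, pct > limits[u - 1]))
            label = ">{} to {}%: {}".format(limits[u - 1], upper, len(locs))
        yield label, locs

for label, locs in bin_percentages([0.0, 0.05, 1.5, 42.0]):
    print(label)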
Example No. 9
def get_summary(stage="N", restart_id="", end_id="", diagnostics=False):
    """
    Main script.  Reads in station data, populates internal objects, and extracts counts per year.

    :param str stage: after which stage to run ("I" = Internal, "N" = Neighbour)
    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool diagnostics: print extra material to screen
    """

    # process the station list
    station_list = utils.get_station_list(restart_id=restart_id, end_id=end_id)

    station_IDs = station_list.id

    yearly_counts = {}

    # now spin through each ID in the curtailed list
    for st, station_id in enumerate(station_IDs):
        print("{} {:11s} ({}/{})".format(dt.datetime.now(), station_id, st+1, station_IDs.shape[0]))


        #*************************
        # set up the stations
        station = utils.Station(station_id, station_list.latitude[st], station_list.longitude[st], station_list.elevation[st])
        if diagnostics:
            print(station)

        try:
            if stage == "I":
                station, station_df = io.read_station(os.path.join(setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(station_id)), station)
            elif stage == "N":
                station, station_df = io.read_station(os.path.join(setup.SUBDAILY_OUT_DIR, "{:11s}.qff".format(station_id)), station)

        except OSError as e:
            # file missing, move on to next in sequence
            # io.write_error(station, "File Missing")
            continue
        except ValueError as e:
            # some issue in the raw file
            # io.write_error(station, "Error in input file", error=str(e))
            continue

        # some may have no data (for whatever reason)
        if station.times.shape[0] == 0:
            if diagnostics:
                print("No data in station {}".format(station.id))
            # scoot onto next station
            # io.write_error(station, "No data in input file")
            continue

        unique_years = np.unique(station.years)
        year_counts = np.zeros(unique_years.shape).astype(int)

        # spin through each variable (might be heaviest lift)
        for var in setup.obs_var_list:
            obs_var = getattr(station, var)

            # spin through each year
            for y, year in enumerate(unique_years):
                locs, = np.where(station.years == year)

                # where obs and years intersect
                year_obs = obs_var.data[locs]
                # and just keep unflagged set
                year_counts[y] += len(year_obs.compressed())

        # store in dictionary
        for year, count in zip(unique_years, year_counts):
            yearly_counts[year] = yearly_counts.get(year, 0) + count


    # now print
    with open("summary_counts.txt", "w") as outfile:
        for key, value in sorted(yearly_counts.items(), key=lambda x: x[0]): 
            outfile.write("{} : {}\n".format(key, value))

    return # get_summary
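The per-year counting relies on compressed() dropping masked values, so flagged or missing observations never enter the totals; the same pattern on toy data:

import numpy as np

years = np.array([2000, 2000, 2001, 2001, 2001])
data = np.ma.array([1.0, 2.0, 3.0, 4.0, 5.0],
                   mask=[False, True, False, False, True])  # True = flagged/missing

yearly_counts = {}
for year in np.unique(years):
    locs, = np.where(years == year)
    # compressed() returns only the unmasked values
    yearly_counts[int(year)] = yearly_counts.get(int(year), 0) + len(data[locs].compressed())

print(yearly_counts)  # {2000: 1, 2001: 2}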
Example No. 10
def make_hum_heat_vars(station_info,
                       restart_id="",
                       end_id="",
                       diagnostics=False,
                       plots=False):
    """
    Make the humidity and heat-stress variable netCDF files

    Make two sets of output files containing the humidity and heat-stress
    parameters calculated on an hourly basis from the QC'd HadISD data

    :param list station_info: station information list
    :param str restart_id: first station to process
    :param str end_id: last station to process
    :param bool diagnostics: verbose output to screen
    :param bool plots: make plots (placeholder)
    """

    # sort truncated run
    startindex = [0]
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex[0] != len(station_info) - 1:
            station_info = station_info[startindex[0]:endindex[0] + 1]
        else:
            station_info = station_info[startindex[0]:]
    else:
        station_info = station_info[startindex[0]:]

    for st, stat in enumerate(station_info):
        print(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S"))
        print("{:35s} {:d}/{:d}".format("Station Number : ", st + 1,
                                        len(station_info)))
        print("{:35s} {}".format("Station Identifier :", stat[0]))

        if plots or diagnostics:
            logfile = ""
        else:
            logfile = open(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a')
            logfile.write(
                dt.datetime.strftime(dt.datetime.now(),
                                     "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Calculating Humidity and Heat Stress variables\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :",
                                               stat[0]))
        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]),
                                float(stat[3]))
        if os.path.exists(
                os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc.gz")):
            # if gzip file, unzip here
            subprocess.call([
                "gunzip",
                os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc.gz")
            ])
            time.sleep(5)  # make sure it is unzipped before proceeding

        # read in the data
        ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"),
                   station,
                   process_vars,
                   diagnostics=diagnostics,
                   read_qc_flags=False,
                   read_flagged_obs=False)

        match_to_compress = utils.create_fulltimes(station,
                                                   process_vars,
                                                   DATASTART,
                                                   DATAEND,
                                                   do_qc_flags=False,
                                                   do_flagged_obs=False)

        # run through calculations, each one should add a new variable to object.
        """
        1) Use T and P to get e  [to get es, use Td]
        2) Use e, P, Td and T to get Tw
        3) If Tw < 0C, recalculate e w.r.t ice, and re-obtain Tw - keep both!
        4) Use e and P to calculate q
        5) Use e and es to get rh (use appropriate es too) - or q and qs

        what P to use if no measurement - using monthly mean probably isn't appropriate in this instance??
        """

        station = humidity.run_calcs(station, logfile)

        # run through heat stress calculations

        station = heat_stress.run_calcs(station, logfile)

        if diagnostics or plots: input("stop")

        # adjust this to work with the desired output file - will need a separate write function - output humidity in one set, heat indices in another?
        humidity_vars = [
            "temperatures", "dewpoints", "slp", "vapour_pressure",
            "saturation_vapour_pressure", "wetbulb_temperature",
            "specific_humidity", "relative_humidity"
        ]
        ncdfp.write(os.path.join(NETCDF_DATA_LOCS,
                                 station.id + "_humidity.nc"),
                    station,
                    humidity_vars,
                    os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                    compressed=match_to_compress,
                    processing_date='',
                    qc_code_version='',
                    write_QC_flags=False,
                    write_flagged_obs=False,
                    least_significant_digit=5)

        heat_stress_vars = [
            "temperatures", "dewpoints", "windspeeds", "THI", "WBGT",
            "humidex", "apparent_t", "heat_index"
        ]
        ncdfp.write(os.path.join(NETCDF_DATA_LOCS,
                                 station.id + "_heat_stress.nc"),
                    station,
                    heat_stress_vars,
                    os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                    compressed=match_to_compress,
                    processing_date='',
                    qc_code_version='',
                    write_QC_flags=False,
                    write_flagged_obs=False,
                    least_significant_digit=5)

        # gzip the raw file
        # subprocess.call(["gzip","-f",os.path.join(NETCDF_DATA_LOCS, station.id + "_humidity.nc")])
        # subprocess.call(["gzip","-f",os.path.join(NETCDF_DATA_LOCS, station.id + "_heat_stress.nc")])
        # subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc")])

        if logfile != "":
            logfile.write(
                dt.datetime.strftime(dt.datetime.now(),
                                     "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write(
                "processing took {:4.0f}s\n\n".format(time.time() -
                                                      process_start_time))
            logfile.close()

        print "Humidity and Heat Stress Indices calculated"

    return  # make_hum_heat_vars
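The calculation chain sketched in the docstring starts from vapour pressure; the exact formulations inside humidity.run_calcs are not shown here, so purely as an illustration, a sketch using the Magnus approximation (an assumption, not necessarily the formulation HadISD uses):

import math

def saturation_vapour_pressure(t_celsius):
    """Saturation vapour pressure (hPa) over water, via the Magnus approximation."""
    return 6.112 * math.exp(17.62 * t_celsius / (243.12 + t_celsius))

def relative_humidity(t_celsius, td_celsius):
    """Relative humidity (%) from air and dew point temperatures: e(Td) / es(T)."""
    return 100.0 * saturation_vapour_pressure(td_celsius) / saturation_vapour_pressure(t_celsius)

print(round(relative_humidity(20.0, 10.0), 1))  # ~52.6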
Example No. 11
def process_canadian_stations(all_stations, DATA_START):
    '''
    if in Canada_single - then single occurrence - USE
    if in Canada_onoff - then single occurrence with time limits - USE
    if in Canada_homogenisation - then multiple occurrences in a single location - USE
    if in Canada_goodmove - then well documented station move - USE - but only the section which matches the dates
    else DON'T USE (_overlap, _questionablemove, _rem, _dates)
    '''

    station_ids = np.array([stn.id for stn in all_stations])
    # can only work on 71???0-99999 station numbers
    canadian_ids = np.array([
        s for s, stn in enumerate(station_ids)
        if stn[:2] == "71" and stn[-7:] == "0-99999"
    ])

    use = np.zeros(len(canadian_ids), dtype=int)  # 1 - use, 0 - not tested, -1 - don't use
    # only reject those we are sure about, and keep those that we can't test as we don't know

    # single, onoff & homogenisation
    for category_files in [
            "Canada_single.dat", "Canada_onoff.dat",
            "Canada_homogenisation.dat"
    ]:

        test_ids, test_names, test_lats, test_lons, test_active, test_date = read_canada_info(
            category_files, DATA_START)

        for c, cid in enumerate(canadian_ids):
            id_to_compare = int(station_ids[cid][:5])

            if id_to_compare in test_ids:
                # ID present

                loc = np.where(test_ids == id_to_compare)[0][0]

                test_station = utils.Station(
                    test_ids[loc], test_lats[loc], test_lons[loc],
                    all_stations[cid].elev)  # fake the elev
                test_station.name = test_names[loc].strip()
                test_station.call = ""
                probs = sel_utils.do_match(test_station, all_stations[cid],
                                           sel_utils.LATITUDE_THRESHOLD,
                                           sel_utils.ELEVATION_THRESHOLD,
                                           sel_utils.DISTANCE_THRESHOLD)

                if np.prod(probs) > sel_utils.PROB_THRESHOLD:
                    use[c] = 1
                else:
                    print(all_stations[cid].name, all_stations[cid])
                    print(test_station.name.strip(), test_station)
                    print(probs, "\n")

    # overlap, questionablemove, rem, dates
    for category_files in [
            "Canada_rem.dat", "Canada_dates.dat",
            "Canada_questionablemove.dat", "Canada_overlap.dat"
    ]:

        test_ids, test_names, test_lats, test_lons, test_active, test_date = read_canada_info(
            category_files, DATA_START)

        for c, cid in enumerate(canadian_ids):
            id_to_compare = int(station_ids[cid][:5])

            if id_to_compare in test_ids:
                # ID present - don't use this station
                use[c] = -1

                # restrict start and end times so won't be selected.
                all_stations[cid].start = dt.datetime.today()
                all_stations[cid].end = dt.datetime.today()

    # goodmove
    for category_files in ["Canada_goodmove.dat"]:

        outfilename = os.path.join(INPUT_FILE_LOCS, "Canada_time_ranges.dat")
        try:
            os.remove(outfilename)
        except OSError:
            print "file does not exist ", outfilename
        outfile = file(outfilename, "w")

        test_ids, test_names, test_lats, test_lons, test_active, test_date = read_canada_info(
            category_files, DATA_START)

        for c, cid in enumerate(canadian_ids):
            id_to_compare = int(station_ids[cid][:5])

            if id_to_compare in test_ids:
                # ID present
                locs, = np.where(test_ids == id_to_compare)

                # test which locations match
                all_probs = []

                for loc in locs:
                    test_station = utils.Station(
                        test_ids[loc], test_lats[loc], test_lons[loc],
                        all_stations[cid].elev)  # fake the elev
                    test_station.name = test_names[loc]
                    test_station.call = ""
                    probs = sel_utils.do_match(test_station, all_stations[cid],
                                               sel_utils.LATITUDE_THRESHOLD,
                                               sel_utils.ELEVATION_THRESHOLD,
                                               sel_utils.DISTANCE_THRESHOLD)

                    all_probs += [np.prod(probs)]

                good_locs, = np.where(
                    np.array(all_probs) > sel_utils.PROB_THRESHOLD)

                # need to test which range of dates can be taken.
                start = dt.datetime(DATA_START, 1, 1, 0, 0)
                end = dt.datetime(dt.datetime.now().year + 1, 1, 1, 0, 0)

                # single "Active" and last entry - then use as start date
                if (len(good_locs)
                        == 1) and (good_locs[0] + 1 == len(all_probs)) and (
                            test_active[locs][good_locs[0]].strip()
                            == "Active"):
                    use[c] = 1
                    start = test_date[locs][good_locs[0]]

                    # adjust start date and write out useful range
                    all_stations[cid].start = start
                    outfile.write("{} {} {}\n".format(
                        station_ids[cid],
                        dt.datetime.strftime(start, "%Y-%m-%d %H:%M:%S"),
                        dt.datetime.strftime(end, "%Y-%m-%d %H:%M:%S")))

                elif len(good_locs) == 2:

                    # "Active" followed by "Inactive" - use as range 71100
                    if (test_active[locs][good_locs[0]].strip() == "Active"
                        ) and (test_active[locs][good_locs[1]].strip()
                               == "Inactive"):
                        use[c] = 1
                        start = test_date[locs][good_locs[0]]
                        end = test_date[locs][good_locs[1]]

                        # adjust start and end date and write out useful range
                        all_stations[cid].start = start
                        all_stations[cid].end = end
                        outfile.write("{} {} {}\n".format(
                            station_ids[cid],
                            dt.datetime.strftime(start, "%Y-%m-%d %H:%M:%S"),
                            dt.datetime.strftime(end, "%Y-%m-%d %H:%M:%S")))

                    # 2 "Active"s, then if in final place, use first as start date 71038
                    elif (test_active[locs][good_locs[0]].strip() == "Active"
                          ) and (test_active[locs][good_locs[1]].strip()
                                 == "Active") and (good_locs[0] + 1
                                                   == len(all_probs) -
                                                   1) and (good_locs[1] + 1
                                                           == len(all_probs)):
                        use[c] = 1
                        start = test_date[locs][good_locs[0]]

                        # adjust start date and write out useful range
                        all_stations[cid].start = start
                        outfile.write("{} {} {}\n".format(
                            station_ids[cid],
                            dt.datetime.strftime(start, "%Y-%m-%d %H:%M:%S"),
                            dt.datetime.strftime(end, "%Y-%m-%d %H:%M:%S")))

                # 3 "Active"s or combination of "Active" and "Inactive" - if no other IDs present, then use all
                elif len(locs) == len(good_locs):
                    use[c] = 1
                    # no change to start/end times

        outfile.close()

    print "{} Canadian stations processed - {} kept, {} not tested, {} rejected".format(
        len(use), len(use[use == 1]), len(use[use == 0]), len(use[use == -1]))

    return all_stations  # process_canadian_stations
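The matching above multiplies the per-criterion probabilities returned by sel_utils.do_match and keeps the candidate only if the joint value clears a threshold; a minimal sketch of that gate (the threshold value and the toy probabilities below are assumptions):

import numpy as np

PROB_THRESHOLD = 0.5  # assumed value; the real constant lives in sel_utils

def accept_match(probs, threshold=PROB_THRESHOLD):
    """Accept a candidate only if the joint probability across all matching
    criteria (latitude, elevation, distance, ...) exceeds the threshold."""
    return np.prod(probs) > threshold

print(accept_match([0.9, 0.95, 0.9]))  # True  (joint ~0.77)
print(accept_match([0.9, 0.40, 0.9]))  # False (joint ~0.32)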
Example No. 12
def main(restart_id="", end_id="", diagnostics=False):
    """
    Main plot function.

    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool diagnostics: print extra material to screen
    """

    obs_var_list = setup.obs_var_list

    # process the station list
    station_list = utils.get_station_list(restart_id=restart_id, end_id=end_id)

    station_IDs = station_list.id

    all_stations = {}

    # now spin through each ID in the curtailed list
    for st, station_id in enumerate(station_IDs):

        #        if st > 10:
        #            break

        print("{} {}".format(dt.datetime.now(), station_id))

        station = utils.Station(station_id, station_list.iloc[st].latitude,
                                station_list.iloc[st].longitude,
                                station_list.iloc[st].elevation)
        if diagnostics:
            print(station)

        try:
            flag_summary = flag_read(
                os.path.join(setup.SUBDAILY_FLAG_DIR,
                             "{}.flg".format(station_id)))
        except IOError:
            print("flag file missing for {}".format(station_id))
            continue

        #*************************
        # read QFF
        # try:
        #     station_df = io.read(os.path.join(setup.SUBDAILY_OUT_DIR, "{}.qff".format(station_id)))
        # except IOError:
        #     print("Missing station {}".format(station_id))
        #     continue

        for var in obs_var_list:

            setattr(
                station, var,
                utils.Meteorological_Variable("{}".format(var), utils.MDI, "",
                                              ""))
            obs_var = getattr(station, var)

            # flags = station_df["{}_QC_flag".format(var)].fillna("")

            for test in utils.QC_TESTS.keys():
                # locs = flags[flags.str.contains(test)]

                # setattr(obs_var, test, locs.shape[0]/flags.shape[0])
                # setattr(obs_var, "{}_counts".format(test), locs.shape[0])
                try:
                    setattr(obs_var, test, flag_summary[var][test])
                    setattr(obs_var, "{}_counts".format(test),
                            flag_summary[var]["{}_counts".format(test)])
                except KeyError:
                    setattr(obs_var, test, 0)
                    setattr(obs_var, "{}_counts".format(test), 0)

            # # for total, get number of clean obs and subtract
            # flagged, = np.where(flags != "")
            # setattr(obs_var, "All", flagged.shape[0]/flags.shape[0])
            # setattr(obs_var, "All_counts".format(test), flagged.shape[0])
            try:
                setattr(obs_var, "All", flag_summary[var]["All"])
                setattr(obs_var, "All_counts", flag_summary[var]["All_counts"])
            except KeyError:
                setattr(obs_var, "All", 0)
                setattr(obs_var, "All_counts", 0)

            if diagnostics:
                print("{} - {}".format(var, getattr(obs_var, "All_counts")))

        all_stations[station_id] = station

    # now spin through each var/test combo and make a plot
    for var in obs_var_list:
        for test in TESTS_FOR_VARS[var]:

            for suffix in ["", "_counts"]:

                lats, lons, flag_fraction = np.zeros(
                    station_IDs.shape[0]), np.zeros(
                        station_IDs.shape[0]), np.zeros(station_IDs.shape[0])

                for st, (ID, station) in enumerate(all_stations.items()):
                    lats[st] = station.lat
                    lons[st] = station.lon
                    obs_var = getattr(station, var)
                    flag_fraction[st] = getattr(obs_var,
                                                "{}{}".format(test, suffix))

                if suffix == "":
                    flag_fraction *= 100.  # convert to percent

                # do the plot
                plt.figure(figsize=(8, 5))
                plt.clf()
                ax = plt.axes([0.02, 0.02, 0.96, 0.96],
                              projection=ccrs.Robinson())
                ax.set_global()
                ax.coastlines('50m')
                ax.gridlines()

                # colors are the exact same RGB codes as in IDL
                colors = [(150, 150, 150), (41, 10, 216), (63, 160, 255), (170, 247, 255), \
                          (255, 224, 153), (247, 109, 94), (165, 0, 33), (0, 0, 0)]
                if suffix == "":
                    limits = [0.0, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 100.]
                elif suffix == "_counts":
                    limits = [0.0, 5., 10., 50., 100., 500., 1000., 5000.]

                for u, upper in enumerate(limits):

                    # sort the labels
                    if u == 0:
                        locs, = np.where(flag_fraction == 0)
                        label = "{}{}: {}".format(upper, UNITS[suffix],
                                                  len(locs))
                    else:
                        locs, = np.where(np.logical_and(flag_fraction <= upper, \
                                                        flag_fraction > limits[u-1]))
                        label = ">{} to {}{}: {}".format(
                            limits[u - 1], upper, UNITS[suffix], len(locs))
                        if upper == limits[-1]:
                            label = ">{}{}: {}".format(limits[u - 1],
                                                       UNITS[suffix],
                                                       len(locs))

                    # and plot
                    if len(locs) > 0:
                        ax.scatter(lons[locs], lats[locs], transform=ccrs.PlateCarree(), s=15, color=tuple([float(c)/255 for c in colors[u]]), \
                                   edgecolors="none", label = label)

                    else:
                        ax.scatter([0], [-90], transform=ccrs.PlateCarree(), s=15, color=tuple([float(c)/255 for c in colors[u]]), \
                                   edgecolors="none", label=label)

                if test == "All":
                    plt.title("{} - {}".format(
                        " ".join([v.capitalize() for v in var.split("_")]),
                        "All"))
                else:
                    plt.title("{} - {}".format(
                        " ".join([v.capitalize() for v in var.split("_")]),
                        utils.QC_TESTS[test]))

                watermarkstring="/".join(os.getcwd().split('/')[4:])+'/'+\
                    os.path.basename( __file__ )+"   "+dt.datetime.strftime(dt.datetime.now(), "%d-%b-%Y %H:%M")
                plt.figtext(0.01, 0.01, watermarkstring, size=5)

                leg=plt.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.12), frameon=False, title='', prop={'size':9}, \
                               labelspacing=0.15, columnspacing=0.5, numpoints=1)

                plt.savefig(
                    os.path.join(
                        IMAGE_LOCS, "All_fails_{}-{}{}_{}.png".format(
                            var, test, suffix, start_time_string)))
                plt.close()

    return  # main
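The try/except KeyError blocks above default a test's flag fraction and count to zero whenever that test never fired for a variable; the same lookup can be expressed with nested dict defaults (summary_value is a hypothetical helper, and the summary layout is assumed from the access pattern in the code):

def summary_value(flag_summary, var, key, default=0):
    """Fetch a per-variable entry from the flag summary, falling back to the
    default when the test never fired for that variable."""
    return flag_summary.get(var, {}).get(key, default)

flag_summary = {"temperature": {"o": 1.2, "o_counts": 34}}  # toy summary
print(summary_value(flag_summary, "temperature", "o_counts"))  # 34
print(summary_value(flag_summary, "wind_speed", "o_counts"))   # 0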
Example No. 13
def select_neighbours(station,
                      variable,
                      neighbour_info,
                      neighbours,
                      neighbour_distances,
                      neighbour_quadrants,
                      data_locs,
                      datastart,
                      dataend,
                      logfile,
                      diagnostics=False,
                      plots=False):
    '''
    From the list of nearby stations select the ones which will be good neighbours for the test.
    Select on basis of correlation, overlap of data points and bearing (quadrants)
    
    :param object station: station object
    :param str variable: which variable to process
    :param array neighbour_info: array of ID, lat, lon and elev
    :param array neighbours: which station sequence numbers are the nearby stations
    :param array neighbour_distances: distances to nearby stations
    :param array neighbour_quadrants: bearings to nearby stations (in 90deg bins)
    :param array data_locs: path to data files
    :param datetime datastart: start of data set
    :param datetime dataend: end of data set
    :param file logfile: logfile to store outputs
    :param boolean diagnostics: output diagnostic information
    :param boolean plots: make a plot

    :returns: final_locs - array of station sequence numbers to use.
    '''

    # set up storage arrays
    n_correlations = np.zeros(len(neighbours))
    n_distances = np.zeros(len(neighbours))
    n_quadrants = np.zeros(len(neighbours))
    n_overlaps = np.zeros(len(neighbours))
    combined_score = np.zeros(len(neighbours))

    # get station data
    st_var = getattr(station, variable)
    st_anomalies = hourly_daily_anomalies(st_var.data[:])

    # go through initial list and extract correlations and overlaps
    for nn, nn_loc in enumerate(neighbours):

        n_details = neighbour_info[nn]
        neigh = utils.Station(n_details[0], float(n_details[1]),
                              float(n_details[2]), float(n_details[3]))

        # read in this neighbour's own data file
        ncdfp.read(os.path.join(
            NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_internal.nc".format(
                LONG_VERSION, END_TIME, neigh.id)),
                   neigh, [variable],
                   diagnostics=diagnostics,
                   read_input_station_id=False)

        dummy = utils.create_fulltimes(neigh, [variable],
                                       datastart,
                                       dataend, [],
                                       do_input_station_id=False)

        # get the correlations of data to this neighbour
        neigh_var = getattr(neigh, variable)
        neigh_anomalies = hourly_daily_anomalies(neigh_var.data[:])
        # correlation = np.ma.corrcoef(neigh_var.data, st_var.data)[1,0]
        correlation = np.ma.corrcoef(neigh_anomalies, st_anomalies)[1, 0]

        # fraction of the station's valid observations where the neighbour also has data
        overlap = len(
            np.where(~np.logical_or(neigh_var.data.mask, st_var.data.mask))[0]
            ) / float(len(st_var.data.compressed()))

        if not math.isnan(correlation):
            n_correlations[nn] = correlation
            n_overlaps[nn] = overlap
            combined_score[nn] = correlation + overlap
            n_distances[nn] = neighbour_distances[nn]
            n_quadrants[nn] = neighbour_quadrants[nn]

        # clear up to save memory
        del dummy
        del neigh_var
        del neigh_anomalies
        gc.collect()
    # sort in order of the combination of correlation and overlap
    sort_order = np.argsort(combined_score)[::-1]

    # and select the best 10
    # final_selection = neighbours[sort_order][:10]

    # sort out the quadrants

    locs1 = neighbours[sort_order][n_quadrants[sort_order] == 1]
    locs2 = neighbours[sort_order][n_quadrants[sort_order] == 2]
    locs3 = neighbours[sort_order][n_quadrants[sort_order] == 3]
    locs4 = neighbours[sort_order][n_quadrants[sort_order] == 4]

    final_locs = np.concatenate((locs1[:2], locs2[:2], locs3[:2], locs4[:2]),
                                axis=0).reshape(-1)

    # and add the rest in order of combined score
    for index in neighbours[sort_order]:
        if index not in final_locs:
            final_locs = np.append(final_locs, index)

        if len(final_locs) == N_NEIGHBOURS:
            break

    # output table showing distances, correlations, overlaps, the combined score and which ones were selected
    if plots or diagnostics:
        print "{:14s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s}".format(
            "Neighbour", "Distance", "Elevation", "Correl'n", "Overlap",
            "Combined", "Quadrant", "Selected")
    else:
        logfile.write(
            "{:14s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s}\n".format(
                "Neighbour", "Distance", "Elevation", "Correl'n", "Overlap",
                "Combined", "Quadrant", "Selected"))

    selected_correlations = []
    selected_overlaps = []
    for nn, nn_loc in enumerate(neighbours[sort_order]):

        selected = ""
        if nn_loc in final_locs:
            selected = "Y"
            if plots:
                selected_correlations += [n_correlations[sort_order[nn]]]
                selected_overlaps += [n_overlaps[sort_order[nn]]]

        neigh_details = neighbour_info[sort_order][nn]
        if plots or diagnostics:
            print "{:14s} {:10.1f} {:10.1f} {:10.5f} {:10.3f} {:10.3f} {:10.0f} {:10s}".format(
                neigh_details[0], n_distances[sort_order][nn],
                float(neigh_details[3]), n_correlations[sort_order][nn],
                n_overlaps[sort_order][nn], combined_score[sort_order][nn],
                n_quadrants[sort_order][nn], selected)
        else:
            logfile.write(
                "{:14s} {:10.1f} {:10.1f} {:10.5f} {:10.3f} {:10.3f} {:10.0f} {:10s}\n"
                .format(neigh_details[0], n_distances[sort_order][nn],
                        float(neigh_details[3]),
                        n_correlations[sort_order][nn],
                        n_overlaps[sort_order][nn],
                        combined_score[sort_order][nn],
                        n_quadrants[sort_order][nn], selected))

    # plot of correlations and overlaps, with selected stations highlighted
    if plots:
        import matplotlib.pyplot as plt

        plt.clf()
        plt.plot(n_correlations, n_overlaps, 'bo')
        plt.plot(selected_correlations, selected_overlaps, 'ro')
        plt.xlabel("correlations")
        plt.ylabel("data overlap")
        plt.title("{} - {}".format(station.id, variable))
        plt.show()

    return final_locs  # select_neighbours
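The final selection above takes up to two best-scoring neighbours from each 90-degree bearing quadrant and then tops the list up purely by combined score; a compact sketch of that strategy (the array inputs and n_wanted mirror, but are not taken verbatim from, the code above):

import numpy as np

def quadrant_balanced_selection(neighbours, scores, quadrants, n_wanted=10, per_quadrant=2):
    """Pick per_quadrant best neighbours from each bearing quadrant, then fill
    the remainder in descending order of combined score."""
    sort_order = np.argsort(scores)[::-1]  # highest combined score first
    ordered = neighbours[sort_order]
    ordered_q = quadrants[sort_order]

    # up to per_quadrant best-scoring neighbours from each quadrant
    final = np.concatenate([ordered[ordered_q == q][:per_quadrant] for q in (1, 2, 3, 4)])

    # then top up in score order until n_wanted is reached
    for index in ordered:
        if len(final) >= n_wanted:
            break
        if index not in final:
            final = np.append(final, index)
    return final

neighbours = np.arange(8)
scores = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2])
quadrants = np.array([1, 1, 1, 2, 2, 3, 3, 4])
print(quadrant_balanced_selection(neighbours, scores, quadrants, n_wanted=8))
# quadrant picks come first; station 2 is only added in the score-order top-up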