def spc(station, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Run pressure cross checks on SLP and STNLP

    :param object station: station object
    :param array flag_col: which columns to fill in flag_array
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose information
    :param bool doMonth: account for incomplete months

    :returns:
    '''
    slp = getattr(station, 'slp')
    stnlp = getattr(station, 'stnlp')

    month_ranges = utils.month_starts_in_pairs(start, end)

    station.qc_flags[:, flag_col[0]] = spc_diff(slp, stnlp, station.qc_flags[:, flag_col[0]], month_ranges, start, end, logfile, plots=plots, diagnostics=diagnostics, doMonth=doMonth)

    station = utils.append_history(station, "Pressure Cross Check")

    return # spc
def run_calcs(station, logfile, plots=False, diagnostics=False):
    '''
    Run the humidity calculations and add the attributes to the station file

    :param object station: station object
    :param file logfile: logfile to store outputs
    :param boolean diagnostics: output diagnostic information
    :param boolean plots: make a plot

    :returns: station - updated with humidity variables
    '''
    temperatures = utils.apply_flags_to_mask(station, "temperatures")
    dewpoints = utils.apply_flags_to_mask(station, "dewpoints")

    # adjust from sea-level to station-level
    station_pressure = get_station_level_pressure(station)

    e_v = utils.set_MetVar_attributes("vapor_pressure", "Vapor pressure calculated w.r.t water", "water_vapor_pressure", "hPa", temperatures.mdi, np.dtype('float64'))
    e_s = utils.set_MetVar_attributes("saturation_vapor_pressure", "Saturation vapor pressure calculated w.r.t. water", "water_vapor_pressure", "hPa", temperatures.mdi, np.dtype('float64'))
    Tw = utils.set_MetVar_attributes("wet_bulb_temperature", "Wet bulb temperatures nearest to reporting hour", "wet_bulb_temperature", "C", temperatures.mdi, np.dtype('float64'))
    q = utils.set_MetVar_attributes("specific_humidity", "Specific humidity", "specific_humidity", "g/kg", temperatures.mdi, np.dtype('float64'))
    rh = utils.set_MetVar_attributes("relative_humidity", "Relative humidity", "relative_humidity", "%rh", temperatures.mdi, np.dtype('float64'))

    # sort the vapour pressures and wet-bulb --> ice or water?
    e_v.data, e_s.data, Tw.data = fix_wrt_ice_or_water(temperatures.data, dewpoints.data, station_pressure)

    # get relative and specific humidity
    q.data = calculate_q(e_v.data, station_pressure)
    rh.data = calculate_rh(e_v.data, e_s.data)

    if plots or diagnostics:
        print "Humidity variables calculated, setting attributes\n"
    else:
        logfile.write("Humidity variables calculated, setting attributes\n")

    setattr(station, "vapour_pressure", e_v)
    setattr(station, "saturation_vapour_pressure", e_s)
    setattr(station, "wetbulb_temperature", Tw)
    setattr(station, "specific_humidity", q)
    setattr(station, "relative_humidity", rh)

    station = utils.append_history(station, "Humidity Calculations")

    return station # run_calcs
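# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite): the standard
# Magnus-type conversions that calculate_q() and calculate_rh() above
# presumably implement.  Constants are the common WMO values; the repo's
# helpers may differ in detail, and _humidity_conversion_sketch is our name.
# ---------------------------------------------------------------------------
def _humidity_conversion_sketch():
    import numpy as np

    def magnus_e(t):
        # saturation vapour pressure (hPa) w.r.t. water at temperature t (deg C)
        return 6.112 * np.exp(17.62 * t / (243.12 + t))

    t, td, p = 20.0, 15.0, 1013.0  # dry bulb, dewpoint (deg C), station pressure (hPa)
    e = magnus_e(td)               # actual vapour pressure
    e_s = magnus_e(t)              # saturation vapour pressure

    q = 1000. * (0.622 * e) / (p - 0.378 * e)  # specific humidity (g/kg)
    rh = 100. * e / e_s                        # relative humidity (%rh)
    return q, rh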
def dgc(station, variable_list, flag_col, start, end, logfile, plots=False, diagnostics=False, idl=False, GH=False):
    '''Controller for two individual tests'''

    if plots:
        import matplotlib.pyplot as plt

    for v, variable in enumerate(variable_list):

        station.qc_flags[:, flag_col[v]] = dgc_monthly(station, variable, station.qc_flags[:, flag_col[v]], start, end, plots=plots, diagnostics=diagnostics, idl=idl)

        if variable == "slp":
            # need to send in windspeeds too
            station.qc_flags[:, flag_col[v]] = dgc_all_obs(station, variable, station.qc_flags[:, flag_col[v]], start, end, plots=plots, diagnostics=diagnostics, idl=idl, windspeeds=True, GH=GH)
        else:
            station.qc_flags[:, flag_col[v]] = dgc_all_obs(station, variable, station.qc_flags[:, flag_col[v]], start, end, plots=plots, diagnostics=diagnostics, idl=idl, GH=GH)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Distributional Gap", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Distributional Gap", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var = getattr(station, variable)
        st_var.flags[flag_locs] = 1

    # MATCHES IDL for 030660-99999, 2 flags in T, 30-06-2014

    station = utils.append_history(station, "Distributional Gap Check")

    return # dgc
def ccc(station, flag_col, logfile, diagnostics=False, plots=False):
    '''
    Call the logical cloud checks

    :param obj station: station object
    :param list flag_col: flag columns to use
    :param file logfile: logfile to store output
    :param bool diagnostics: diagnostic output (unused)
    :param bool plots: do the plots (unused)

    :returns:
    '''
    if len(flag_col) != 8:
        print "insufficient flag columns given"
        return

    unobservable(station, flag_col[0:4], logfile, plots=plots, diagnostics=diagnostics)

    total_lt_max(station, flag_col[4], logfile, plots=plots, diagnostics=diagnostics)

    low_full(station, flag_col[5], logfile, plots=plots, diagnostics=diagnostics)

    mid_full(station, flag_col[6], logfile, plots=plots, diagnostics=diagnostics)

    fix_cloud_base(station)

    negative_cloud(station, flag_col[7], logfile, plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Cloud - Logical Cross Check")

    return # ccc
def run_calcs(station, logfile, plots=False, diagnostics=False):
    '''
    Run the heat stress calculations and add the new attributes to the station

    :param obj station: station object
    :param file logfile: logfile to store outputs
    :param boolean diagnostics: output diagnostic information
    :param boolean plots: make a plot

    :returns: updated station object with heat stress values.
    '''
    temperatures = utils.apply_flags_to_mask(station, "temperatures")
    rh = getattr(station, "relative_humidity")  # no separate flags using fdi
    e_v = getattr(station, "vapour_pressure")   # no separate flags using fdi
    windspeeds = utils.apply_flags_to_mask(station, "windspeeds")

    thi = utils.set_MetVar_attributes("temperature_humidity_index", "Temperature Humidity Index (THI)", "temperature_humidity_index", "1", temperatures.mdi, np.dtype('float64'))
    wbgt = utils.set_MetVar_attributes("wet_bulb_globe_temperature", "Wet Bulb Globe Temperature (WBGT)", "wet_bulb_globe_temperature", "C", temperatures.mdi, np.dtype('float64'))
    humidex = utils.set_MetVar_attributes("humidex", "Humidex", "humidex", "1", temperatures.mdi, np.dtype('float64'))
    apparent_t = utils.set_MetVar_attributes("apparent_temperature", "Apparent Temperature", "apparent_temperature", "C", temperatures.mdi, np.dtype('float64'))
    heat_index = utils.set_MetVar_attributes("heat_index", "Heat Index", "heat_index", "C", temperatures.mdi, np.dtype('float64'))

    thi.data = calculate_thi(temperatures.data, rh.data)
    wbgt.data = calculate_wbgt(temperatures.data, e_v.data)
    humidex.data = calculate_humidex(temperatures.data, e_v.data)
    apparent_t.data = calculate_apparent_t(temperatures.data, e_v.data, windspeeds.data)
    heat_index.data = calculate_heat_index(temperatures.data, rh.data)

    if plots or diagnostics:
        print "Heat stress variables calculated, setting attributes\n"
    else:
        logfile.write("Heat stress variables calculated, setting attributes\n")

    setattr(station, "THI", thi)
    setattr(station, "WBGT", wbgt)
    setattr(station, "humidex", humidex)
    setattr(station, "apparent_t", apparent_t)
    setattr(station, "heat_index", heat_index)

    station = utils.append_history(station, "Heat Stress Calculations")

    return station # run_calcs
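# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite): two of the standard
# heat-stress formulations.  The repo's calculate_humidex() and
# calculate_apparent_t() plausibly follow these, but may use other variants;
# _heat_stress_sketch is our name.
# ---------------------------------------------------------------------------
def _heat_stress_sketch():
    import numpy as np

    t = np.array([30.0, 35.0])   # air temperature (deg C)
    e = np.array([28.0, 35.0])   # vapour pressure (hPa)
    ws = np.array([2.0, 5.0])    # 10 m wind speed (m/s)

    humidex = t + (5. / 9.) * (e - 10.)           # Masterton & Richardson (1979)
    apparent_t = t + 0.33 * e - 0.70 * ws - 4.00  # Steadman (1994), shade version
    return humidex, apparent_t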
def hcc(station, flag_col, start, end, logfile, diagnostics=False, plots=False):
    '''
    Run humidity cross checks on temperature and dewpoint temperature obs

    :param object station: station object
    :param array flag_col: which columns to fill in flag_array
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose information

    :returns:
    '''
    temperatures = getattr(station, 'temperatures')
    dewpoints = getattr(station, 'dewpoints')

    month_ranges = utils.month_starts_in_pairs(start, end)

    # Supersaturation
    station.qc_flags[:, flag_col[0]] = hcc_sss(temperatures.data, dewpoints.data, month_ranges, start, logfile, plots=plots, diagnostics=diagnostics)

    # Dew point depression
    precip1 = getattr(station, 'precip1_depth')
    precip2 = getattr(station, 'precip2_depth')
    precip3 = getattr(station, 'precip3_depth')
    precip6 = getattr(station, 'precip6_depth')
    precip9 = getattr(station, 'precip9_depth')
    precip12 = getattr(station, 'precip12_depth')
    precip15 = getattr(station, 'precip15_depth')
    precip18 = getattr(station, 'precip18_depth')
    precip24 = getattr(station, 'precip24_depth')

    cloudbase = getattr(station, 'cloud_base')
    past_sigwx = getattr(station, 'past_sigwx1')

    times = station.time.data

    # combine all the precips together
    precips = np.array([precip1.data, precip2.data, precip3.data, precip6.data, precip9.data, precip12.data, precip15.data, precip18.data, precip24.data])

    station.qc_flags[:, flag_col[1]] = hcc_dpd(times, temperatures.data, dewpoints.data, precips, cloudbase.data, past_sigwx.data, start, logfile, plots=plots, diagnostics=diagnostics)

    # Dew point cutoffs
    station.qc_flags[:, flag_col[2]] = hcc_cutoffs(temperatures.data, dewpoints.data, month_ranges, logfile, start, plots=plots, diagnostics=diagnostics)

    for col in range(3):
        flag_locs = np.where(station.qc_flags[:, flag_col[col]] != 0)
        station.dewpoints.flags[flag_locs] = 1

    station = utils.append_history(station, "Temperature-Humidity Cross Check")

    return # hcc
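# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite): the core
# supersaturation condition that hcc_sss() presumably tests - dewpoint above
# dry bulb is unphysical.  The real routine also works month by month and
# writes into the flag array; _supersaturation_sketch is our name.
# ---------------------------------------------------------------------------
def _supersaturation_sketch():
    import numpy as np

    temps = np.ma.masked_values([10.0, 5.0, -99.9, 3.0], -99.9)
    dewps = np.ma.masked_values([8.0, 6.0, 2.0, -99.9], -99.9)

    flags = np.zeros(len(temps))
    sss = np.ma.where(dewps > temps)  # masked values never compare True
    flags[sss] = 1
    return flags  # only index 1 is flagged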
def pcc(station, flag_col, start, end, logfile, diagnostics=False, plots=False):
    '''
    Run cross checks on precipitation obs

    :param object station: station object
    :param array flag_col: which columns to fill in flag_array
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose information

    :returns:
    '''
    precip1 = getattr(station, 'precip1_depth')
    precip2 = getattr(station, 'precip2_depth')
    precip3 = getattr(station, 'precip3_depth')
    precip6 = getattr(station, 'precip6_depth')
    precip9 = getattr(station, 'precip9_depth')
    precip12 = getattr(station, 'precip12_depth')
    precip15 = getattr(station, 'precip15_depth')
    precip18 = getattr(station, 'precip18_depth')
    precip24 = getattr(station, 'precip24_depth')

    times = station.time.data

    # combine all the precips together
    precips = np.ma.array([precip1.data, precip2.data, precip3.data, precip6.data, precip9.data, precip12.data, precip15.data, precip18.data, precip24.data])

    station.qc_flags[:, flag_col[0]] = pcc_accumulations(times, precips, start, logfile, plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Precipitation Cross Check")

    return # pcc
def clu(station, var_list, flag_cols, FLAG_COL_DICT, start, end, logfile, plots=False, diagnostics=False):
    '''
    Run the clean up for each variable

    :param object station: station object
    :param list var_list: list of variables to process
    :param list flag_cols: which columns to fill in the flag array
    :param dict FLAG_COL_DICT: look-up of the flag columns feeding each variable
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose information
    '''
    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        clean_up(st_var, station.qc_flags, FLAG_COL_DICT[variable], flag_cols[v], start, end, station.time.data, plots=plots)

        flag_locs = np.where(station.qc_flags[:, flag_cols[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Clean Up Months", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Clean Up Months", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Clean Up Months")

    return # clu
def krc(station, var_list, flag_col, logfile, diagnostics=False, plots=False):
    '''
    Run the known records check for each variable in list

    :param object station: station to process
    :param list var_list: list of variables to process
    :param list flag_col: which columns to use for which variable
    :param file logfile: logfile to store output
    :param bool diagnostics: diagnostic output (unused)
    :param bool plots: do the plots (unused)
    '''
    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        st_region = krc_get_wmo_region(station.id)

        all_filtered = utils.apply_filter_flags(st_var)

        too_high = np.where(all_filtered > maxes[variable][st_region])
        krc_set_flags(too_high, station.qc_flags, flag_col[v])

        # make sure we don't flag the missing values!
        too_low = np.where(np.logical_and(all_filtered < mins[variable][st_region], all_filtered.mask == False))
        krc_set_flags(too_low, station.qc_flags, flag_col[v])

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "World Record", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "World Record", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "World Record Check")

    return # krc
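# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the bounds test in
# krc above.  The numbers are the WMO global temperature extremes, for
# illustration only - the real maxes/mins dictionaries hold per-WMO-region
# limits for every variable; _world_record_sketch is our name.
# ---------------------------------------------------------------------------
def _world_record_sketch():
    import numpy as np

    maxes = {"temperatures": {"global": 56.7}}   # Furnace Creek, 1913 (deg C)
    mins = {"temperatures": {"global": -89.2}}   # Vostok, 1983 (deg C)

    obs = np.ma.masked_values([12.0, 61.0, -95.0, -99.9], -99.9)
    too_high = np.where(obs > maxes["temperatures"]["global"])
    # the mask test stops missing values being caught by the lower bound
    too_low = np.where(np.logical_and(obs < mins["temperatures"]["global"], obs.mask == False))
    return too_high, too_low  # indices 1 and 2 respectively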
def rsc(station, var_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Wrapper for the four individual repeating streak check tests
    '''
    times = station.time.data

    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        if len(utils.apply_filter_flags(st_var).compressed()) > 0:

            wind = False
            if variable == "windspeeds":
                wind = True
            winddir = False
            if variable == "winddirs":
                winddir = True

            reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end), winddir=winddir, plots=plots)
            limits = limits_dict[variable][reporting_resolution]

            # need to apply flags to st_var.flags each time for filtering
            station.qc_flags[:, flag_col[v][0]] = rsc_straight_strings(st_var, times, limits[0], limits[1], start, end, reporting=reporting_resolution, wind=wind, diagnostics=diagnostics, plots=plots, dynamic=True, doMonth=doMonth)

            # no effect of final incomplete year ("month" option) as limits[2] and limits[3] fixed
            station.qc_flags[:, flag_col[v][1]], station.qc_flags[:, flag_col[v][2]] = rsc_hourly_repeats(st_var, times, limits[2], limits[3], diagnostics=diagnostics, plots=plots)

            for streak_type in range(3):
                flag_locs = np.where(station.qc_flags[:, flag_col[v][streak_type]] != 0)
                utils.print_flagged_obs_number(logfile, "Streak Check", variable, len(flag_locs[0]), noWrite=diagnostics)

                # copy flags into attribute
                st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Streak Check")

    return # rsc
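# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the repeated-value
# streak idea behind rsc_straight_strings(): find runs of identical
# consecutive values and flag any run longer than a limit.  The real limits
# depend on variable and reporting resolution; _streak_sketch is our name.
# ---------------------------------------------------------------------------
def _streak_sketch():
    import numpy as np

    obs = np.array([3.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 7.0])
    limit = 3  # illustrative only

    change = np.where(np.diff(obs) != 0)[0] + 1
    starts = np.concatenate([[0], change])
    lengths = np.diff(np.concatenate([starts, [len(obs)]]))

    flags = np.zeros(len(obs))
    for s, l in zip(starts, lengths):
        if l > limit:
            flags[s:s + l] = 1
    return flags  # the four consecutive 5.0s are flagged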
def dcc(station, variable_list, full_variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    The diurnal cycle check.

    :param object station: the station object to be processed
    :param list variable_list: the variables to be processed
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column in the qc_flags array to work on
    :param file logfile: logfile to store outputs
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output

    :returns:
    '''
    # list of flags for each variable
    diurnal_flags = []

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # is this needed 21/08/2014
        # reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags, but discount incomplete year - so that we can test values against these later
        all_data = utils.apply_filter_flags(st_var)
        all_data = all_data.reshape(-1, 24)  # working in fulltimes

        # apply flags - also apply to final incomplete year so that best values only use complete years
        filtered_data = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)
        filtered_data = filtered_data.reshape(-1, 24)  # working in fulltimes

        number_of_days = filtered_data.shape[0]

        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plot_data = np.ma.zeros(filtered_data.shape)
            plot_data.mask = True
            # best_estimate_counter = np.zeros(HOURS)

        diurnal_filtered_fits = np.zeros(filtered_data.shape[0], dtype=(int))
        diurnal_filtered_fits.fill(INTMDI)
        diurnal_best_fits = np.zeros(st_var.data.shape[0], dtype=(int))
        diurnal_best_fits.fill(INTMDI)
        diurnal_uncertainties = np.zeros(filtered_data.shape[0])
        diurnal_uncertainties.fill(INTMDI)

        for d, day in enumerate(all_data):

            '''enough observations and have large enough diurnal range'''
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)

                        '''Find differences for each shifted sine --> cost function'''
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

                        # and keep this for testing against the average value later
                        diurnal_best_fits[d] = np.argmin(diffs)

        for d, day in enumerate(filtered_data):

            '''enough observations and have large enough diurnal range'''
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)

                        '''Find differences for each shifted sine --> cost function'''
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

                        diurnal_filtered_fits[d] = np.argmin(diffs)

                        # default uncertainty is the average time resolution of the data
                        diurnal_uncertainties[d] = round(float(HOURS) / len(day.compressed()))

                        if DYNAMIC_DIURNAL:
                            critical_value = min(diffs) + ((max(diffs) - min(diffs)) * 0.33)

                            # centre so minimum in middle
                            diffs = np.roll(diffs, 11 - diurnal_filtered_fits[d])

                            uncertainty = 1
                            while uncertainty < 11:
                                if (diffs[11 - uncertainty] > critical_value) and \
                                        (diffs[11 + uncertainty] > critical_value):
                                    # break if both sides greater than critical difference
                                    # when counting outwards
                                    # see diurnal_example.py
                                    break
                                uncertainty += 1

                            # check if uncertainty greater than time resolution for day
                            if uncertainty > diurnal_uncertainties[d]:
                                diurnal_uncertainties[d] = uncertainty

                        if plots:
                            # best_estimate_counter[np.argmin(diffs)] += 1
                            # scale daily data to range -1 -> 1, plot with random scatter for clarity
                            plot_data[d] = ((2 * (day - min(day.compressed())) / obs_daily_range) - 1.)
                            plt.plot(np.arange(24) + np.random.randn(24) * 0.25, plot_data[d] + np.random.randn(24) * 0.05, 'k,')

        if plots:
            plt.plot(np.arange(24), np.roll(dcc_make_sine(), np.argmax(np.bincount(diurnal_filtered_fits[np.where(diurnal_filtered_fits != INTMDI)]))), 'r-')
            plt.xlim([-1, 25])
            plt.ylim([-1.2, 1.2])
            plt.show()

        # dumb copy of IDL
        '''For each uncertainty range (1-6h) find median of cycle offset'''
        filtered_fits = np.zeros(6)
        filtered_fits.fill(-9)
        for h in range(6):
            locs = np.where(diurnal_uncertainties == h + 1)

            if len(locs[0]) > 300:
                # filtered_fits[h] = int(np.median(diurnal_filtered_fits[locs]))
                # Numpy median gives average of central two values which may not be integer
                # 25/11/2014 use IDL style which gives lower value
                filtered_fits[h] = utils.idl_median(diurnal_filtered_fits[locs])

        '''Build up range of cycles incl. uncertainty to find where best of best located'''
        hours = np.arange(24)
        hour_matches = np.zeros(24)
        diurnal_peak = -9
        number_estimates = 0
        for h in range(6):
            if filtered_fits[h] != -9:

                '''Store lowest uncertainty best fit as first guess'''
                if diurnal_peak == -9:
                    diurnal_peak = filtered_fits[h]
                    hours = np.roll(hours, 11 - int(diurnal_peak))
                    hour_matches[11 - (h + 1):11 + (h + 2)] = 1
                    number_estimates += 1

                centre, = np.where(hours == filtered_fits[h])

                if (centre[0] - (h + 1)) >= 0:
                    if (centre[0] + h + 1) <= 23:
                        hour_matches[centre[0] - (h + 1):centre[0] + h + 2] += 1
                    else:
                        hour_matches[centre[0] - (h + 1):] += 1
                        hour_matches[:centre[0] + h + 2 - 24] += 1
                else:
                    hour_matches[:centre[0] + h + 2] += 1
                    hour_matches[centre[0] - (h + 1):] += 1

                number_estimates += 1

        '''If value at lowest uncertainty not found in all others, then see what value is found by all others'''
        if hour_matches[11] != number_estimates:  # central estimate at 12 o'clock
            all_match = np.where(hour_matches == number_estimates)

            # if one is, then use it
            if len(all_match[0]) > 0:
                diurnal_peak = all_match[0][0]
            else:
                diurnal_peak = -9

        '''Now have value for best fit diurnal offset'''
        potentially_spurious = np.zeros(number_of_days)
        potentially_spurious.fill(INTMDI)

        if diurnal_peak != -9:
            hours = np.arange(24)
            hours = np.roll(hours, 11 - int(diurnal_peak))
            for d in range(number_of_days):
                # and now going back to the unfiltered data
                if diurnal_best_fits[d] != INTMDI:

                    '''Checks if global falls inside daily value+/-range
                    rather than seeing if each day falls in global value+/-range'''
                    min_range = 11 - diurnal_uncertainties[d]
                    max_range = 11 + diurnal_uncertainties[d]
                    maxloc = np.where(hours == diurnal_best_fits[d])[0][0]

                    if maxloc < min_range or maxloc > max_range:
                        potentially_spurious[d] = 1
                    else:
                        potentially_spurious[d] = 0

            # count number of good, missing and not-bad days
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points = 0
            total_not_miss = 0
            to_flag = np.zeros(number_of_days)

            for d in range(number_of_days):

                if potentially_spurious[d] == 1:
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points += 1
                    total_not_miss += 1
                else:
                    if potentially_spurious[d] == 0:
                        n_good += 1
                        n_not_bad += 1
                        if n_miss != 0:
                            n_miss = 0
                        total_not_miss += 1
                    if potentially_spurious[d] == -999:
                        n_miss += 1
                        n_not_bad += 1
                        if n_good != 0:
                            n_good = 0
                    total_points += 1

                    if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):

                        if total_points >= 30:
                            if float(total_not_miss) / total_points >= 0.5:
                                to_flag[d - total_points:d] = 1

                        n_good = 0
                        n_miss = 0
                        n_not_bad = 0
                        total_points = 0
                        total_not_miss = 0

            dcc_flags = np.zeros(filtered_data.shape)

            for d in range(number_of_days):
                if to_flag[d] == 1:
                    good = np.where(filtered_data.mask[d, :] == False)
                    if len(good[0]) >= 1:
                        dcc_flags[d, good] = 1

            if diagnostics:
                print len(np.where(dcc_flags == 1)[0])
                print "currently matches IDL, but should all hours in days have flags set, not just the missing/flagged ones?"

            diurnal_flags += [dcc_flags]
        else:
            diurnal_flags += [np.zeros(filtered_data.shape)]

        station.qc_flags[:, flag_col[v]] = np.array(diurnal_flags).reshape(-1)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]), noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD

    utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Diurnal Cycle", plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Diurnal Cycle Check")

    return # dcc
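# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the shifted-sine
# cost function at the heart of dcc: scale a 24-point sine to the day's
# range, roll it through all 24 phase offsets, and keep the offset with the
# smallest summed absolute difference.  The plain np.sin below is one assumed
# form of dcc_make_sine(); _diurnal_fit_sketch is our name.
# ---------------------------------------------------------------------------
def _diurnal_fit_sketch():
    import numpy as np

    HOURS = 24
    day = 15. + 5. * np.sin(2. * np.pi * (np.arange(HOURS) - 8) / HOURS)  # synthetic day

    sine = np.sin(2. * np.pi * np.arange(HOURS) / HOURS)
    obs_range = day.max() - day.min()
    scaled_sine = ((sine + 1.) / 2. * obs_range) + day.min()

    diffs = np.zeros(HOURS)
    for h in range(HOURS):
        diffs[h] = np.sum(np.abs(day - scaled_sine))
        scaled_sine = np.roll(scaled_sine, 1)

    return np.argmin(diffs)  # best-fit phase offset: 8 for this synthetic day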
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, second=False):
    '''
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    :param bool second: run for second time

    :returns:
    '''
    print "refactor"

    for v, variable in enumerate(variable_list):

        flags = station.qc_flags[:, flag_col[v]]

        prev_flag_number = 0
        if second:
            # count currently existing flags:
            prev_flag_number = len(flags[flags != 0])

        st_var = getattr(station, variable)

        all_filtered = utils.apply_filter_flags(st_var)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1:
            reporting_resolution = 1

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        good = np.where(all_filtered.mask == False)

        full_time_diffs = np.ma.zeros(len(all_filtered))
        full_time_diffs.mask = all_filtered.mask
        full_time_diffs[good] = station.time.data[good][1:] - station.time.data[good][:-1]

        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        print "sort the differencing if values were flagged rather than missing"

        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = all_filtered.mask
        full_filtered_diffs[good] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1]

        # test all values
        good_to_uncompress = np.where(st_var.data.mask == False)
        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = st_var.data.mask
        full_value_diffs[good_to_uncompress] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        time_diffs = full_time_diffs.compressed()
        filtered_diffs = full_filtered_diffs.compressed()
        flags = flags[good_to_uncompress]

        critical_values = np.zeros([9, 12])
        critical_values.fill(st_var.mdi)

        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape)

        for month in range(12):
            for year in range(month_ranges.shape[0]):

                if year == 0:
                    this_month_time_diff = full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                    this_month_filtered_diff = full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                else:
                    this_month_time_diff = np.ma.concatenate([this_month_time_diff, full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])
                    this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff, full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])

                month_locs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] = month

            for delta in range(1, 9):

                locs = np.ma.where(this_month_time_diff == delta)

                if len(locs[0]) >= 100:

                    iqr = utils.IQR(this_month_filtered_diff[locs])

                    if iqr == 0. and delta == 1:
                        critical_values[delta - 1, month] = 6.
                    elif iqr == 0:
                        critical_values[delta - 1, month] = st_var.mdi
                    else:
                        critical_values[delta - 1, month] = 6. * iqr

                    # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD

                    if plots:
                        import calendar
                        title = "{}, {}-hr differences".format(calendar.month_name[month + 1], delta)
                        line_label = st_var.name
                        xlabel = "First Difference Magnitudes"
                    else:
                        title, line_label, xlabel = "", "", ""

                    threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin=0, binwidth=0.5, plots=plots, diagnostics=diagnostics, title=title, line_label=line_label, xlabel=xlabel, old_threshold=critical_values[delta - 1, month])

                    if threshold < critical_values[delta - 1, month]:
                        critical_values[delta - 1, month] = threshold

                    if plots or diagnostics:
                        print critical_values[delta - 1, month], iqr, 6 * iqr

        month_locs = month_locs[good_to_uncompress]
        if diagnostics:
            print critical_values[0, :]

        # not less than 5x reporting accuracy
        good_critical_values = np.where(critical_values != st_var.mdi)
        low_critical_values = np.where(critical_values[good_critical_values] <= 5. * reporting_resolution)
        temporary = critical_values[good_critical_values]
        temporary[low_critical_values] = 5. * reporting_resolution
        critical_values[good_critical_values] = temporary

        if diagnostics:
            print critical_values[0, :], 5. * reporting_resolution

        # check hourly against 2 hourly, if <2/3 then increase to avoid crazy rejection rate
        for month in range(12):
            if critical_values[0, month] != st_var.mdi and critical_values[1, month] != st_var.mdi:
                if critical_values[0, month] / critical_values[1, month] <= 0.66:
                    critical_values[0, month] = 0.66 * critical_values[1, month]

        if diagnostics:
            print critical_values[0, :]

        # get time differences for unfiltered data
        full_time_diffs = np.ma.zeros(len(st_var.data))
        full_time_diffs.mask = st_var.data.mask
        full_time_diffs[good_to_uncompress] = station.time.data[good_to_uncompress][1:] - station.time.data[good_to_uncompress][:-1]
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds

        # spikes at the beginning or ends of sections
        for t in np.arange(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3):
                # 10 days before but short gap thereafter

                next_values = st_var.data[good_to_uncompress[0][t + 1:]]
                good, = np.where(next_values.mask == False)

                next_median = np.ma.median(next_values[good[:10]])

                next_diff = np.abs(value_diffs[t])  # out of spike
                median_diff = np.abs(next_median - st_var.data[good_to_uncompress[0][t]])  # are the remaining ones

                if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi):

                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and \
                            (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t + 1], start, variable, plots=plots)

            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240):
                # 10 days after but short gap before

                prev_values = st_var.data[good_to_uncompress[0][:t - 1]]
                good, = np.where(prev_values.mask == False)

                prev_median = np.ma.median(prev_values[good[-10:]])

                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[0][t]])

                if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi):

                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and \
                            (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t + 1], start, variable, plots=plots)

        '''this isn't the nicest way, but a direct copy from IDL
        masked arrays might help remove some of the lines
        Also, this is relatively slow'''
        for t in np.arange(len(time_diffs)):
            for spk_len in [1, 2, 3]:
                if t >= spk_len and t < len(time_diffs) - spk_len:

                    # check if time differences are appropriate, for multi-point spikes
                    if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and \
                            (np.abs(time_diffs[t]) <= spk_len * 3) and \
                            (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and \
                            (time_diffs[t + 1] - 1 < spk_len * 3) and \
                            ((spk_len == 1) or
                             ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or
                             ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))):

                        # check if differences are valid
                        if (value_diffs[t - spk_len] != st_var.mdi) and \
                                (value_diffs[t - spk_len] != st_var.fdi) and \
                                (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi):

                            # if exceed critical values
                            if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]):

                                # are signs of two differences different
                                if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])):

                                    # are within-spike differences small
                                    if (spk_len == 1) or \
                                            ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.)) or \
                                            ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.) and (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] - 1, month_locs[t]] / 2.)):

                                        # check if following value is valid
                                        if (value_diffs[t] != st_var.mdi) and (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and \
                                                (value_diffs[t] != st_var.fdi):

                                            # and if at least critical value
                                            if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]):

                                                # test if surrounding differences below 1/2 critical value
                                                if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.):
                                                    if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.):

                                                        # set the flags
                                                        flags[t - spk_len + 1:t + 1] = 1

                                                        if plots or diagnostics:
                                                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t - spk_len + 1], good_to_uncompress[0][t + 1], start, variable, plots=plots)

        station.qc_flags[good_to_uncompress, flag_col[v]] = flags

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number, noWrite=True)  # additional flags
        else:
            utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number)  # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        do_interactive = False
        if plots and do_interactive == True:
            import matplotlib.pyplot as plt
            plot_times = utils.times_hours_to_datetime(station.time.data, start)

            plt.clf()
            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            flg = np.where(flags[:, flag_col[v]] == 1)
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)
            plt.show()

    station = utils.append_history(station, "Spike Check")

    return # sc
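# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the spike-threshold
# logic above: the critical value for a given month and time gap is 6 * IQR
# of the filtered first differences, floored at 5x the reporting resolution.
# utils.IQR is assumed to return q75 - q25; _spike_threshold_sketch is our name.
# ---------------------------------------------------------------------------
def _spike_threshold_sketch():
    import numpy as np

    diffs = np.random.randn(500) * 0.8  # stand-in for one month's first differences
    reporting_resolution = 0.1

    q75, q25 = np.percentile(diffs, [75, 25])
    critical = 6. * (q75 - q25)

    if critical <= 5. * reporting_resolution:  # not less than 5x reporting accuracy
        critical = 5. * reporting_resolution
    return critical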
def neighbour_checks(station_info, restart_id="", end_id="", distances=np.array([]), angles=np.array([]), second=False, masking=False, doZip=False, plots=False, diagnostics=False):
    """
    Run through neighbour checks on list of stations passed

    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations.
    """
    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncating the array
    neighbour_elevations = np.array(station_info[:, 3], dtype=float)
    neighbour_ids = np.array(station_info[:, 0])
    neighbour_info = np.array(station_info[:, :])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex:endindex + 1]
            distances = distances[startindex:endindex + 1, :]
            angles = angles[startindex:endindex + 1, :]
        else:
            station_info = station_info[startindex:]
            distances = distances[startindex:, :]
            angles = angles[startindex:, :]
    else:
        station_info = station_info[startindex:]
        distances = distances[startindex:, :]
        angles = angles[startindex:, :]

    # process each station
    for st, stat in enumerate(station_info):

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a')  # append to file if second iteration
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # select neighbours
        neighbour_distances = distances[st, :]
        neighbour_bearings = angles[st, :]

        # have to add in start index so that can use location in distance file
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st + startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour", "Distance", "Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n])
        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour", "Distance", "Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                #        but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second=second, diagnostics=diagnostics, plots=plots)

                    # now read in final set of neighbours and process
                    neigh_flags = np.zeros(len(station.time.data))  # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data))  # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data))  # dewpoint depression flags from neighbours
                    reporting_accuracies = np.zeros(len(neighbours))  # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)])  # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id=False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance=neighbour_distances[nn_loc], diagnostics=diagnostics, plots=plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh, variable).data)

                        dpd_flags += neigh.qc_flags[:, 31]
                    # gone through all neighbours

                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3), (neigh_flags[some_flags].astype("float") / neigh_count[some_flags] > 2. / 3.)))

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1
                    else:
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))

                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots=plots, diagnostics=diagnostics)

                else:
                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))

            # variable loop
        else:
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")

        # clean up months
        qc_tests.clean_up.clu(station, ["temperatures", "dewpoints", "windspeeds", "winddirs", "slp"], [44, 45, 46, 47, 48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots=plots)

        if diagnostics or plots:
            raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile)

            # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            logfile.close()

    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):
            if first:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask.nc")])
            elif second:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal2.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external2.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask2.nc")])

    print "Neighbour Checks completed\n"

    return # neighbour_checks
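# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the voting rule in
# neighbour_checks above: an observation is an outlier when at least 3
# neighbours report at that time and more than 2/3 of them flag it; times
# with fewer than 3 reporting neighbours get a tentative -1.
# _neighbour_vote_sketch is our name.
# ---------------------------------------------------------------------------
def _neighbour_vote_sketch():
    import numpy as np

    neigh_flags = np.array([0, 2, 3, 1, 4])  # neighbours flagging each time step
    neigh_count = np.array([5, 2, 4, 4, 5])  # neighbours reporting at each time step
    qc_col = np.zeros(5)

    some_flags, = np.where(neigh_flags > 0)
    outlier_locs, = np.where(np.logical_and(neigh_count[some_flags] >= 3, neigh_flags[some_flags].astype("float") / neigh_count[some_flags] > 2. / 3.))

    locs = np.where(neigh_count[some_flags] < 3)
    qc_col[some_flags[locs]] = -1
    qc_col[some_flags[outlier_locs]] = 1
    return qc_col  # [0., -1., 1., 0., 1.]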
def fvc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Check for certain values occurring more frequently than would be expected

    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots
    :param bool doMonth: ignore months after last complete year/season for distribution
    '''
    MIN_DATA_REQUIRED = 500  # to create histogram for complete record
    MIN_DATA_REQUIRED_YEAR = 100  # to create histogram

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges_years = month_ranges.reshape(-1, 12, 2)

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)

        for season in range(5):  # Year, MAM, JJA, SON, JF+D

            if season == 0:
                # all year
                season_data = np.ma.masked_values(filtered_data.compressed(), st_var.fdi)
                thresholds = [30, 20, 10]
            else:
                thresholds = [20, 15, 10]
                season_data = np.ma.array([])

                for y, year in enumerate(month_ranges_years):
                    # churn through months extracting data, accounting for fdi and concatenating together
                    if season == 1:
                        # mam
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[2][0]:year[4][-1]], st_var.fdi)])
                    elif season == 2:
                        # jja
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[5][0]:year[7][-1]], st_var.fdi)])
                    elif season == 3:
                        # son
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[8][0]:year[10][-1]], st_var.fdi)])
                    elif season == 4:
                        # d+jf
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[0][0]:year[1][-1]], st_var.fdi)])
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[-1][0]:year[-1][-1]], st_var.fdi)])

            season_data = season_data.compressed()

            if len(season_data) > MIN_DATA_REQUIRED:

                if 0 < reporting_accuracy <= 0.5:  # -1 used as missing value
                    bins, bincenters = utils.create_bins(season_data, 0.5)
                else:
                    bins, bincenters = utils.create_bins(season_data, 1.0)

                hist, binEdges = np.histogram(season_data, bins=bins)

                if plots:
                    plot_hist, bincenters = fvc_plot_setup(season_data, hist, binEdges, st_var.name, title="%s" % (SEASONS[season]))

                bad_bin = np.zeros(len(hist))

                # scan through bin values and identify bad ones
                for e, element in enumerate(hist):
                    if e > 3 and e <= (len(hist) - 3):
                        # don't bother with first three or last three bins
                        seven_bins = hist[e - 3:e + 3 + 1]
                        if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                            # is local maximum and != zero
                            if (seven_bins[3] / float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= thresholds[0]):
                                # contains >50% of data and is greater than threshold
                                bad_bin[e] = 1
                            else:
                                # for plotting remove good bins
                                if plots: plot_hist[e] = 1e-1
                        else:
                            if plots: plot_hist[e] = 1e-1
                    else:
                        if plots: plot_hist[e] = 1e-1

                if plots:
                    import matplotlib.pyplot as plt
                    plt.step(bincenters, plot_hist, 'r-', where='mid')
                    plt.show()

                # having identified possible bad bins, check each year in turn, on unfiltered data
                for y, year in enumerate(month_ranges_years):

                    if season == 0:
                        # year
                        year_data = np.ma.masked_values(st_var.data[year[0][0]:year[-1][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]]
                    elif season == 1:
                        # mam
                        year_data = np.ma.masked_values(st_var.data[year[2][0]:year[4][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[2][0]:year[4][-1], flag_col[v]]
                    elif season == 2:
                        # jja
                        year_data = np.ma.masked_values(st_var.data[year[5][0]:year[7][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[5][0]:year[7][-1], flag_col[v]]
                    elif season == 3:
                        # son
                        year_data = np.ma.masked_values(st_var.data[year[8][0]:year[10][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[8][0]:year[10][-1], flag_col[v]]
                    elif season == 4:
                        # d+jf
                        year_data = np.ma.concatenate([np.ma.masked_values(st_var.data[year[0][0]:year[1][-1]], st_var.fdi),
                                                       np.ma.masked_values(st_var.data[year[-1][0]:year[-1][-1]], st_var.fdi)])
                        year_flags = np.append(station.qc_flags[year[0][0]:year[1][-1], flag_col[v]], station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]])

                    if len(year_data.compressed()) > MIN_DATA_REQUIRED_YEAR:

                        hist, binEdges = np.histogram(year_data.compressed(), bins=bins)

                        if plots:
                            plot_hist, bincenters = fvc_plot_setup(year_data.compressed(), hist, binEdges, st_var.name, title="%s - %s" % (y + start.year, SEASONS[season]))

                        for e, element in enumerate(hist):

                            if bad_bin[e] == 1:
                                # only look at pre-identified bins

                                if e >= 3 and e <= (len(hist) - 3):
                                    # don't bother with first three or last three bins
                                    seven_bins = hist[e - 3:e + 3 + 1].astype('float')
                                    if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                                        # is local maximum and != zero
                                        if (seven_bins[3] / seven_bins.sum() >= 0.5 and seven_bins[3] >= thresholds[1]) \
                                                or (seven_bins[3] / seven_bins.sum() >= 0.9 and seven_bins[3] >= thresholds[2]):
                                            # contains >50% or >90% of data and is greater than appropriate threshold

                                            # flag these data
                                            bad_points = np.where((year_data >= binEdges[e]) & (year_data < binEdges[e + 1]))
                                            year_flags[bad_points] = 1
                                        else:
                                            # for plotting remove good bins
                                            if plots: plot_hist[e] = 1e-1
                                    else:
                                        if plots: plot_hist[e] = 1e-1
                                else:
                                    if plots: plot_hist[e] = 1e-1
                            else:
                                if plots: plot_hist[e] = 1e-1

                        if diagnostics or plots:
                            nflags = len(np.where(year_flags != 0)[0])
                            print "{} {}".format(y + start.year, nflags)

                        if plots:
                            if nflags > 0:
                                plt.step(bincenters, plot_hist, 'r-', where='mid')
                                plt.show()
                            else:
                                plt.clf()

                    # copy flags back
                    if season == 0:
                        station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]] = year_flags
                    elif season == 1:
                        station.qc_flags[year[2][0]:year[4][-1], flag_col[v]] = year_flags
                    elif season == 2:
                        station.qc_flags[year[5][0]:year[7][-1], flag_col[v]] = year_flags
                    elif season == 3:
                        station.qc_flags[year[8][0]:year[10][-1], flag_col[v]] = year_flags
                    elif season == 4:
                        split = len(station.qc_flags[year[0][0]:year[1][-1], flag_col[v]])
                        station.qc_flags[year[0][0]:year[1][-1], flag_col[v]] = year_flags[:split]
                        station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]] = year_flags[split:]

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]), noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Frequent Values Check")

    return # fvc
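# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite) of the seven-bin test
# in fvc above: a histogram bin is suspect when it is the non-zero local
# maximum of its seven-bin window and holds at least half of the window's
# observations while exceeding the season threshold.
# _frequent_value_sketch is our name.
# ---------------------------------------------------------------------------
def _frequent_value_sketch():
    import numpy as np

    hist = np.array([2, 3, 1, 2, 4, 3, 60, 2, 3, 1, 2, 4])
    threshold = 30
    bad_bin = np.zeros(len(hist))

    for e in range(len(hist)):
        if e > 3 and e <= (len(hist) - 3):  # skip the edge bins, as in fvc
            seven_bins = hist[e - 3:e + 3 + 1]
            if seven_bins[3] == seven_bins.max() and seven_bins[3] != 0:
                if (seven_bins[3] / float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= threshold):
                    bad_bin[e] = 1
    return np.where(bad_bin == 1)  # bin 6 stands out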
def dcc(station, variable_list, full_variable_list, flag_col, logfile, plots=False, diagnostics=False):
    '''
    The diurnal cycle check.

    :param object station: the station object to be processed
    :param list variable_list: the variables to be processed
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column in the qc_flags array to work on
    :param file logfile: logfile to store outputs
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output
    :returns:
    '''

    # list of flags for each variable
    diurnal_flags = []

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # is this needed 21/08/2014
        # reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var)
        filtered_data = filtered_data.reshape(-1, 24)  # working in fulltimes.

        number_of_days = filtered_data.shape[0]

        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plot_data = np.ma.zeros(filtered_data.shape)
            plot_data.mask = True
            # best_estimate_counter = np.zeros(HOURS)

        diurnal_best_fits = np.zeros(filtered_data.shape[0], dtype=(int))
        diurnal_best_fits.fill(INTMDI)
        diurnal_uncertainties = np.zeros(filtered_data.shape[0])
        diurnal_uncertainties.fill(INTMDI)

        for d, day in enumerate(filtered_data):

            # enough observations and a large enough diurnal range
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)

                        # find differences for each shifted sine --> cost function
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

                        diurnal_best_fits[d] = np.argmin(diffs)

                        # default uncertainty is the average time resolution of the data
                        diurnal_uncertainties[d] = round(float(HOURS) / len(day.compressed()))

                        if DYNAMIC_DIURNAL:
                            critical_value = min(diffs) + ((max(diffs) - min(diffs)) * 0.33)

                            # centre so minimum in middle
                            diffs = np.roll(diffs, 11 - diurnal_best_fits[d])

                            uncertainty = 1
                            while uncertainty < 11:
                                if (diffs[11 - uncertainty] > critical_value) and \
                                        (diffs[11 + uncertainty] > critical_value):
                                    # break if both sides greater than critical difference
                                    # when counting outwards
                                    # see diurnal_example.py
                                    break
                                uncertainty += 1

                            # check if uncertainty greater than time resolution for day
                            if uncertainty > diurnal_uncertainties[d]:
                                diurnal_uncertainties[d] = uncertainty

                        if plots:
                            # best_estimate_counter[np.argmin(diffs)] += 1
                            # scale daily data to range -1 -> 1, plot with random scatter for clarity
                            plot_data[d] = ((2 * (day - min(day.compressed())) / obs_daily_range) - 1.)
                            plt.plot(np.arange(24) + np.random.randn(24) * 0.25, plot_data[d] + np.random.randn(24) * 0.05, 'k,')

        if plots:
            plt.plot(np.arange(24), np.roll(dcc_make_sine(), np.argmax(np.bincount(diurnal_best_fits[np.where(diurnal_best_fits != INTMDI)]))), 'r-')
            plt.xlim([-1, 25])
            plt.ylim([-1.2, 1.2])
            plt.show()

        # dumb copy of IDL
        # for each uncertainty range (1-6h) find median of cycle offset
        best_fits = np.zeros(6)
        best_fits.fill(-9)
        for h in range(6):
            locs = np.where(diurnal_uncertainties == h + 1)

            if len(locs[0]) > 300:
                # best_fits[h] = int(np.median(diurnal_best_fits[locs]))
                # Numpy median gives average of central two values which may not be integer
                # 25/11/2014 use IDL style which gives lower value
                best_fits[h] = utils.idl_median(diurnal_best_fits[locs])

        # build up range of cycles incl. uncertainty to find where best of best located
        hours = np.arange(24)
        hour_matches = np.zeros(24)
        diurnal_peak = -9
        number_estimates = 0
        for h in range(6):
            if best_fits[h] != -9:

                # store lowest uncertainty best fit as first guess
                if diurnal_peak == -9:
                    diurnal_peak = best_fits[h]
                    hours = np.roll(hours, 11 - int(diurnal_peak))
                    hour_matches[11 - (h + 1):11 + (h + 2)] = 1
                    number_estimates += 1

                centre = np.where(hours == best_fits[h])

                if (centre[0] - h + 1) >= 0:
                    if (centre[0] + h + 1) <= 23:
                        hour_matches[centre[0] - (h + 1): centre[0] + h + 2] += 1
                    else:
                        hour_matches[centre[0] - (h + 1):] += 1
                        hour_matches[:centre[0] + h + 2 - 24] += 1
                else:
                    hour_matches[:centre[0] + h + 2] += 1
                    hour_matches[centre[0] - (h + 1):] += 1

                number_estimates += 1

        # if value at lowest uncertainty not found in all others,
        # then see what value is found by all others
        if hour_matches[11] != number_estimates:  # central estimate at 12 o'clock
            all_match = np.where(hour_matches == number_estimates)

            # if one is, then use it
            if len(all_match[0]) > 0:
                diurnal_peak = all_match[0][0]
            else:
                diurnal_peak = -9

        # now have value for best fit diurnal offset
        potentially_spurious = np.zeros(number_of_days)
        potentially_spurious.fill(INTMDI)

        if diurnal_peak != -9:
            hours = np.arange(24)
            hours = np.roll(hours, 11 - int(diurnal_peak))
            for d in range(number_of_days):
                if diurnal_best_fits[d] != INTMDI:

                    # checks if global falls inside daily value +/- range
                    # rather than seeing if each day falls in global value +/- range
                    min_range = 11 - diurnal_uncertainties[d]
                    max_range = 11 + diurnal_uncertainties[d]
                    maxloc = np.where(hours == diurnal_best_fits[d])[0][0]

                    if maxloc < min_range or maxloc > max_range:
                        potentially_spurious[d] = 1
                    else:
                        potentially_spurious[d] = 0

            # count number of good, missing and not-bad days
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points = 0
            total_not_miss = 0
            to_flag = np.zeros(number_of_days)

            for d in range(number_of_days):

                if potentially_spurious[d] == 1:
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points += 1
                    total_not_miss += 1
                else:
                    if potentially_spurious[d] == 0:
                        n_good += 1
                        n_not_bad += 1
                        if n_miss != 0:
                            n_miss = 0
                        total_not_miss += 1

                    if potentially_spurious[d] == -999:
                        n_miss += 1
                        n_not_bad += 1
                        if n_good != 0:
                            n_good = 0

                    total_points += 1

                    if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):
                        if total_points >= 30:
                            if float(total_not_miss) / total_points >= 0.5:
                                to_flag[d - total_points:d] = 1
                        n_good = 0
                        n_miss = 0
                        n_not_bad = 0
                        total_points = 0
                        total_not_miss = 0

            dcc_flags = np.zeros(filtered_data.shape)

            for d in range(number_of_days):
                if to_flag[d] == 1:
                    good = np.where(filtered_data.mask[d, :] == False)
                    if len(good[0]) >= 1:
                        dcc_flags[d, good] = 1

            if diagnostics:
                print len(np.where(dcc_flags == 1)[0])
                print "currently matches IDL, but should all hours in days have flags set, not just the missing/flagged ones?"

            diurnal_flags += [dcc_flags]
        else:
            diurnal_flags += [np.zeros(filtered_data.shape)]

        station.qc_flags[:, flag_col[v]] = np.array(diurnal_flags).reshape(-1)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD

    utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Diurnal Cycle", plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Diurnal Cycle Check")

    return  # dcc
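# A compact sketch of the fitting step inside dcc: scale a 24-point sine to the
# day's observed range, then score every cyclic shift by total absolute
# difference and keep the best one. For simplicity this assumes a day with no
# missing hours; the check itself works on masked arrays.
import numpy as np

def best_sine_shift(day):
    '''Return (best_shift, costs) for a 24-value day against a shifted sine.'''
    sine = np.sin(2. * np.pi * np.arange(24) / 24.)
    day_range = day.max() - day.min()
    scaled_sine = ((sine + 1.) / 2. * day_range) + day.min()
    costs = np.array([np.sum(np.abs(day - np.roll(scaled_sine, h))) for h in range(24)])
    return np.argmin(costs), costs

# a synthetic day peaking 6 hours later than the template recovers that offset
example_day = 10. + 5. * np.sin(2. * np.pi * (np.arange(24) - 6) / 24.)
shift, costs = best_sine_shift(example_day)  # shift == 6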
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: extra verbose output
    :param bool plots: do plots
    :param bool doMonth: account for incomplete months
    :returns:
    '''
    print "refactor"

    for v, variable in enumerate(variable_list):

        flags = station.qc_flags[:, flag_col[v]]

        st_var = getattr(station, variable)

        # if incomplete year, mask all obs for the incomplete bit
        all_filtered = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1:
            reporting_resolution = 1

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        good, = np.where(all_filtered.mask == False)

        full_time_diffs = np.ma.zeros(len(all_filtered), dtype=int)
        full_time_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_time_diffs[good[:-1]] = station.time.data[good[1:]] - station.time.data[good[:-1]]

        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        print "sort the differencing if values were flagged rather than missing"

        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_filtered_diffs[good[:-1]] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1]

        # test all values
        good_to_uncompress, = np.where(st_var.data.mask == False)

        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_value_diffs[good_to_uncompress[:-1]] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        time_diffs = full_time_diffs.compressed()
        filtered_diffs = full_filtered_diffs.compressed()
        flags = flags[good_to_uncompress]

        critical_values = np.zeros([9, 12])
        critical_values.fill(st_var.mdi)

        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape, dtype=int)

        for month in range(12):
            for year in range(month_ranges.shape[0]):

                if year == 0:
                    this_month_time_diff = full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                    this_month_filtered_diff = full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                else:
                    this_month_time_diff = np.ma.concatenate([this_month_time_diff,
                                                              full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])
                    this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff,
                                                                  full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])

                month_locs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] = month

            for delta in range(1, 9):

                locs = np.ma.where(this_month_time_diff == delta)

                if len(locs[0]) >= 100:

                    iqr = utils.IQR(this_month_filtered_diff[locs])

                    if iqr == 0. and delta == 1:
                        critical_values[delta - 1, month] = 6.
                    elif iqr == 0:
                        critical_values[delta - 1, month] = st_var.mdi
                    else:
                        critical_values[delta - 1, month] = 6. * iqr

                    # January 2015 - changed to dynamically calculating the
                    # thresholds if less than IQR method ^RJHD
                    if plots:
                        import calendar
                        title = "{}, {}-hr differences".format(calendar.month_name[month + 1], delta)
                        line_label = st_var.name
                        xlabel = "First Difference Magnitudes"
                    else:
                        title, line_label, xlabel = "", "", ""

                    threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin=0, binwidth=0.5,
                                                          plots=plots, diagnostics=diagnostics, title=title,
                                                          line_label=line_label, xlabel=xlabel,
                                                          old_threshold=critical_values[delta - 1, month])

                    if threshold < critical_values[delta - 1, month]:
                        critical_values[delta - 1, month] = threshold

                    if plots or diagnostics:
                        print critical_values[delta - 1, month], iqr, 6 * iqr

        month_locs = month_locs[good_to_uncompress]
        if diagnostics:
            print critical_values[0, :]

        # not less than 5x reporting accuracy
        good_critical_values = np.where(critical_values != st_var.mdi)
        low_critical_values = np.where(critical_values[good_critical_values] <= 5. * reporting_resolution)
        temporary = critical_values[good_critical_values]
        temporary[low_critical_values] = 5. * reporting_resolution
        critical_values[good_critical_values] = temporary

        if diagnostics:
            print critical_values[0, :], 5. * reporting_resolution

        # check hourly against 2 hourly, if < 2/3 then increase to avoid crazy rejection rate
        for month in range(12):
            if critical_values[0, month] != st_var.mdi and critical_values[1, month] != st_var.mdi:
                if critical_values[0, month] / critical_values[1, month] <= 0.66:
                    critical_values[0, month] = 0.66 * critical_values[1, month]

        if diagnostics:
            print "critical values"
            print critical_values[0, :]

        # get time differences for unfiltered data
        full_time_diffs = np.ma.zeros(len(st_var.data), dtype=int)
        full_time_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_time_diffs[good_to_uncompress[:-1]] = station.time.data[good_to_uncompress[1:]] - station.time.data[good_to_uncompress[:-1]]
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds

        # spikes at the beginning or ends of sections
        for t in np.arange(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3):
                # 10 days before but short gap thereafter

                next_values = st_var.data[good_to_uncompress[t + 1:]]
                good, = np.where(next_values.mask == False)

                next_median = np.ma.median(next_values[good[:10]])

                next_diff = np.abs(value_diffs[t])  # out of spike
                median_diff = np.abs(next_median - st_var.data[good_to_uncompress[t]])  # are the remaining ones

                if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi):

                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and \
                            (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t], good_to_uncompress[t + 1], start, variable, plots=plots)

            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240):
                # 10 days after but short gap before

                prev_values = st_var.data[good_to_uncompress[:t - 1]]
                good, = np.where(prev_values.mask == False)

                prev_median = np.ma.median(prev_values[good[-10:]])

                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[t]])

                if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi):

                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and \
                            (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t], good_to_uncompress[t + 1], start, variable, plots=plots)

        # this isn't the nicest way, but a direct copy from IDL
        # masked arrays might help remove some of the lines
        # Also, this is relatively slow
        for t in np.arange(len(time_diffs)):
            for spk_len in [1, 2, 3]:
                if t >= spk_len and t < len(time_diffs) - spk_len:

                    # check if time differences are appropriate, for multi-point spikes
                    if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and \
                            (np.abs(time_diffs[t]) <= spk_len * 3) and \
                            (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and \
                            (time_diffs[t + 1] - 1 < spk_len * 3) and \
                            ((spk_len == 1) or
                             ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or
                             ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))):

                        # check if differences are valid
                        if (value_diffs[t - spk_len] != st_var.mdi) and \
                                (value_diffs[t - spk_len] != st_var.fdi) and \
                                (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi):

                            # if exceed critical values
                            if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]):

                                # are signs of two differences different
                                if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])):

                                    # are within-spike differences small
                                    if (spk_len == 1) or \
                                            ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.)) or \
                                            ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.) and
                                             (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] - 1, month_locs[t]] / 2.)):

                                        # check if following value is valid
                                        if (value_diffs[t] != st_var.mdi) and \
                                                (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and \
                                                (value_diffs[t] != st_var.fdi):

                                            # and if at least critical value
                                            if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]):

                                                # test if surrounding differences below 1/2 critical value
                                                if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.):
                                                    if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.):

                                                        # set the flags
                                                        flags[t - spk_len + 1:t + 1] = 1

                                                        if plots or diagnostics:
                                                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t - spk_len + 1], good_to_uncompress[t + 1], start, variable, plots=plots)

        station.qc_flags[good_to_uncompress, flag_col[v]] = flags

        flag_locs, = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs), noWrite=diagnostics)  # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        do_interactive = False
        if plots and do_interactive == True:
            import matplotlib.pyplot as plt
            plot_times = utils.times_hours_to_datetime(station.time.data, start)

            plt.clf()
            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            flg = np.where(flags[:, flag_col[v]] == 1)
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)
            plt.show()

    station = utils.append_history(station, "Spike Check")

    return  # sc
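# A minimal sketch of how sc derives its spike thresholds above: for one
# calendar month and one time separation at a time, take the interquartile
# range of the first differences and use a multiple of it (6x, as above) as the
# critical jump size. np.percentile stands in for utils.IQR to keep this
# standalone; the dynamic get_critical_values refinement is omitted.
import numpy as np

def critical_value(first_diffs, factor=6.):
    '''Critical spike size from the spread of first differences.'''
    spread = np.percentile(first_diffs, 75) - np.percentile(first_diffs, 25)
    if spread == 0.:
        return None  # constant differences cannot set a threshold
    return factor * spread

# hourly differences mostly within +/-1 give a threshold of about 6
example_diffs = np.random.uniform(-1., 1., 500)
threshold = critical_value(example_diffs)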
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, idl=False):
    '''
    Climatological outlier check - compare hourly anomalies from the monthly mean
    diurnal cycle against a fitted Gaussian, flagging values beyond the threshold
    (firm flags = 1, tentative flags = 2).

    :param object station: station object
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: extra verbose information
    :param bool plots: do plots
    :param bool idl: match IDL output
    '''

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)

        # is this needed 13th Nov 2014 RJHD
        # reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        for month in range(12):

            hourly_climatologies = np.zeros(24)
            hourly_climatologies.fill(st_var.mdi)

            # append all e.g. Januaries together
            this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:, month, :], st_var.data, hours=True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:, month, :], all_filtered, hours=True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs
            this_month = np.ma.array(this_month)
            this_month = this_month.reshape(-1, 24)

            this_month_filtered = np.ma.array(this_month_filtered)
            this_month_filtered = this_month_filtered.reshape(-1, 24)

            # get hourly climatology for each month
            for hour in range(24):

                this_hour = this_month[:, hour]

                # need to have data if this is going to work!
                if len(this_hour.compressed()) > 0:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl=idl)
                        hourly_climatologies[hour] = np.ma.sum(this_hour) / (len(this_hour) - 1)
                    else:
                        this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl=idl)
                        hourly_climatologies[hour] = np.ma.mean(this_hour)

            if len(this_month.compressed()) > 0:
                # can get stations with few obs in a particular variable.

                # anomalise each hour over month appropriately
                anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0], 1))
                anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0], 1))

                if len(anomalies.compressed()) >= 10:
                    iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2.  # to match IDL
                    if iqr < 1.5:
                        iqr = 1.5
                else:
                    iqr = st_var.mdi

                normed_anomalies = anomalies / iqr
                normed_anomalies_filtered = anomalies_filtered / iqr

                # get average anomaly for year
                year_ids = np.array(year_ids)
                monthly_vqvs = np.ma.zeros(month_ranges.shape[0])
                monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])]
                for year in range(month_ranges.shape[0]):
                    year_locs = np.where(year_ids == year)
                    this_year = normed_anomalies_filtered[year_locs, :]

                    if len(this_year.compressed()) > 0:
                        # need to have data for this to work!
                        if idl:
                            monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1))
                        else:
                            monthly_vqvs[year] = np.ma.median(this_year)
                    else:
                        monthly_vqvs.mask[year] = True

                # low pass filter
                normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0])

                # copy from distributional_gap.py - refactor!
                # get the threshold value
                bins, bincenters = utils.create_bins(normed_anomalies, 1.)

                hist, binEdges = np.histogram(normed_anomalies, bins=bins)

                gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(normed_anomalies), sig=np.std(normed_anomalies))
                minimum_threshold = round(1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

                if diagnostics:
                    print iqr, minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)
                    print gaussian
                    print hist

                if plots:
                    coc_set_up_plot(bincenters, hist, gaussian, variable, threshold=minimum_threshold, sub_par="observations")

                uppercount = len(np.where(normed_anomalies > minimum_threshold)[0])
                lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0])

                these_flags = station.qc_flags[:, flag_col[v]]
                gap_plot_values, tentative_plot_values = [], []

                # find the gaps and apply the flags
                gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size=1)  # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :], normed_anomalies, these_flags, year_ids,
                                             minimum_threshold, gap_start,
                                             upper=True, plots=plots, gpv=gap_plot_values, tpv=tentative_plot_values)

                gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size=1)  # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :], normed_anomalies, these_flags, year_ids,
                                             minimum_threshold, gap_start,
                                             upper=False, plots=plots, gpv=gap_plot_values, tpv=tentative_plot_values)

                station.qc_flags[:, flag_col[v]] = these_flags

                if uppercount + lowercount > 1000:
                    # print "not sorted spurious stations yet"
                    pass

                if plots:
                    import matplotlib.pyplot as plt
                    hist, binEdges = np.histogram(tentative_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, c='orange', ls='-', label='tentative', where='mid')

                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label='flagged', where='mid')

                    import calendar
                    plt.text(0.1, 0.9, calendar.month_name[month + 1], transform=plt.gca().transAxes)
                    leg = plt.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.2), frameon=False,
                                     prop={'size': 13}, labelspacing=0.15, columnspacing=0.5)
                    plt.setp(leg.get_title(), fontsize=14)
                    plt.show()
                    # plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite=True)
            print "where\n"
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags, noWrite=True)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags, noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]))
            logfile.write("where\n")
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags)

        # firm flags match 030220

    station = utils.append_history(station, "Climatological Check")

    return  # coc
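# A standalone sketch of the anomaly step at the heart of coc: build an hourly
# climatology for one calendar month, subtract it, and normalise by half the
# IQR (floored at 1.5, as above). The winsorizing and IDL-compatible branches
# are omitted for brevity.
import numpy as np

def normed_hourly_anomalies(month_obs):
    '''month_obs: masked array of shape (ndays, 24) for one calendar month.'''
    clim = np.ma.mean(month_obs, axis=0)         # hourly climatology
    anomalies = month_obs - clim[np.newaxis, :]  # remove the diurnal cycle
    spread = (np.percentile(anomalies.compressed(), 75) -
              np.percentile(anomalies.compressed(), 25)) / 2.
    if spread < 1.5:
        spread = 1.5
    return anomalies / spread

# e.g. 30 days of a clean diurnal cycle plus noise gives small normed anomalies
obs = np.ma.masked_invalid(20. + 5. * np.sin(2. * np.pi * np.arange(24) / 24.) + np.random.randn(30, 24))
normed = normed_hourly_anomalies(obs)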
def evc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, idl=False):
    '''
    Excess variance check - flag individual months in years where the variance of
    the hourly anomalies is much larger than in the remaining years, unless a storm
    or deep low-pressure system is the likely cause.

    :param object station: station object
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: extra verbose information
    :param bool plots: do plots
    :param bool idl: match IDL output
    '''
    if plots or diagnostics:
        import matplotlib.pyplot as plt
        import calendar

    # very similar to climatological check - ensure that not duplicating

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        reporting_freq = utils.reporting_frequency(utils.apply_filter_flags(st_var))

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        month_data_count = np.zeros(month_ranges.shape[0:2])

        # for each month
        for month in range(12):

            # set up hourly climatologies
            hourly_clims = np.zeros(24)
            hourly_clims.fill(st_var.data.fill_value)

            this_month, year_ids, month_data_count[:, month] = utils.concatenate_months(month_ranges[:, month, :], st_var.data, hours=True)

            # # extract each year and append together
            # year_ids = [] # counter to determine which year each day corresponds to
            # for year in range(month_ranges.shape[0]):
            #     this_year = st_var.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
            #     if year == 0:
            #         # store so can access each hour of day separately
            #         this_month = this_year.reshape(-1,24)
            #         year_ids = [year for x in range(this_month.shape[0])]
            #         month_data_count[year,month] = len(this_year.compressed())
            #     else:
            #         this_year = this_year.reshape(-1,24)
            #         this_month = np.ma.concatenate((this_month, this_year), axis = 0)
            #         year_ids.extend([year for x in range(this_year.shape[0])])
            #         month_data_count[year,month] = len(this_year.compressed())

            # winsorize and get hourly climatology
            for h in range(24):

                this_hour = this_month[:, h]

                if len(this_hour.compressed()) > 100:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour_winsorized = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl=idl)
                        hourly_clims[h] = np.ma.sum(this_hour_winsorized) / (len(this_hour_winsorized) - 1)
                    else:
                        this_hour_winsorized = utils.winsorize(this_hour.compressed(), 0.05, idl=idl)
                        hourly_clims[h] = np.ma.mean(this_hour_winsorized)

            hourly_clims = np.ma.masked_where(hourly_clims == st_var.data.fill_value, hourly_clims)
            anomalies = this_month - np.tile(hourly_clims, (this_month.shape[0], 1))

            # extract IQR of anomalies (using 1/2 value to match IDL)
            if len(anomalies.compressed()) >= 10:
                iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2.  # to match IDL
                if iqr < 1.5:
                    iqr = 1.5
            else:
                iqr = st_var.mdi

            normed_anomalies = anomalies / iqr

            variances = np.ma.zeros(month_ranges.shape[0])
            variances.mask = [False for i in range(month_ranges.shape[0])]
            rep_accuracies = np.zeros(month_ranges.shape[0])
            rep_freqs = np.zeros(month_ranges.shape[0])

            variances.fill(st_var.mdi)
            rep_accuracies.fill(st_var.mdi)
            rep_freqs.fill(st_var.mdi)

            year_ids = np.array(year_ids)

            # extract variance of normalised anomalies for each year
            for y, year in enumerate(range(month_ranges.shape[0])):

                year_locs = np.where(year_ids == y)

                this_year = normed_anomalies[year_locs, :]
                this_year = this_year.reshape(-1)

                # end of similarity with Climatological check

                if len(this_year.compressed()) >= 30:

                    variances[y] = utils.mean_absolute_deviation(this_year, median=True)

                    rep_accuracies[y] = utils.reporting_accuracy(this_year)
                    rep_freqs[y] = utils.reporting_frequency(this_year)
                else:
                    variances.mask[y] = True

            good = np.where(month_data_count[:, month] >= 100)

            # get median and IQR of variance for all years for this month
            if len(good[0]) >= 10:

                median_variance = np.median(variances[good])

                iqr_variance = utils.IQR(variances[good]) / 2.  # to match IDL

                if iqr_variance < 0.01:
                    iqr_variance = 0.01
            else:
                median_variance = st_var.mdi
                iqr_variance = st_var.mdi

            # if SLP, then get median and MAD of SLP and windspeed for month
            if variable in ["slp", "windspeeds"]:

                winds = getattr(station, "windspeeds")
                slp = getattr(station, "slp")

                # refactor this as similar in style to how target data extracted
                for y, year in enumerate(range(month_ranges.shape[0])):

                    if y == 0:
                        winds_year = winds.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                        winds_month = winds_year.reshape(-1, 24)

                        slp_year = slp.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                        slp_month = slp_year.reshape(-1, 24)
                    else:
                        winds_year = winds.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                        winds_year = winds_year.reshape(-1, 24)
                        winds_month = np.ma.concatenate((winds_month, winds_year), axis=0)

                        slp_year = slp.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                        slp_year = slp_year.reshape(-1, 24)
                        slp_month = np.ma.concatenate((slp_month, slp_year), axis=0)

                median_wind = np.ma.median(winds_month)
                median_slp = np.ma.median(slp_month)

                wind_MAD = utils.mean_absolute_deviation(winds_month.compressed())
                slp_MAD = utils.mean_absolute_deviation(slp_month.compressed())

                if diagnostics:
                    print "median windspeed {} m/s, MAD = {}".format(median_wind, wind_MAD)
                    print "median slp {} hPa, MAD = {}".format(median_slp, slp_MAD)

            # now test to see if variance exceeds expected range
            for y, year in enumerate(range(month_ranges.shape[0])):

                if (variances[y] != st_var.mdi) and (iqr_variance != st_var.mdi) and \
                        (median_variance != st_var.mdi) and (month_data_count[y, month] >= DATA_COUNT_THRESHOLD):

                    # if SLP, then need to test if deep low pressure ("hurricane/storm") present
                    # as this will increase the variance for this month + year
                    if variable in ["slp", "windspeeds"]:

                        iqr_threshold = 6.

                        # increase threshold if reporting frequency and resolution of this
                        # year doesn't match average
                        if (rep_accuracies[y] != reporting_resolution) and \
                                (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 8.

                        if diagnostics:
                            print np.abs(variances[y] - median_variance) / iqr_variance, variances[y], median_variance, iqr_variance, iqr_threshold, month + 1, year + start.year

                        if np.abs((variances[y] - median_variance) / iqr_variance) > iqr_threshold:

                            # check for storms
                            winds_month = winds.data[month_ranges[year, month][0]:month_ranges[year, month][1]]
                            slp_month = slp.data[month_ranges[year, month][0]:month_ranges[year, month][1]]

                            storm = False
                            if (len(winds_month.compressed()) >= 1) and (len(slp_month.compressed()) >= 1):
                                # find max wind & min SLP
                                # max_wind_loc = np.where(winds_month == np.max(winds_month))[0][0]
                                # min_slp_loc = np.where(slp_month == np.min(slp_month))[0][0]
                                # if these are above thresholds and within one day of each other,
                                # then it likely was a storm
                                # print "fix this in case of multiple max/min locations"
                                # if (np.abs(max_wind_loc - min_slp_loc) <= 24) and \
                                #     (((np.max(winds_month) - median_wind) / wind_MAD) > MAD_THRESHOLD) and \
                                #     (((median_slp - np.min(slp_month)) / slp_MAD) > MAD_THRESHOLD):

                                # locations where winds greater than threshold
                                high_winds, = np.where((winds_month - median_wind) / wind_MAD > MAD_THRESHOLD)
                                # and where SLP less than threshold
                                low_slps, = np.where((median_slp - slp_month) / slp_MAD > MAD_THRESHOLD)

                                # if any locations match, then it's a storm
                                match_loc = high_winds[np.in1d(high_winds, low_slps)]

                                if len(match_loc) > 0:
                                    storm = True
                            else:
                                print "write spurious"

                            # check the SLP first difference series
                            # to ensure a drop down and climb out of minimum SLP
                            # or climb up and down from maximum wind speed
                            if variable == "slp":
                                diffs = np.diff(slp_month.compressed())
                            elif variable == "windspeeds":
                                diffs = np.diff(winds_month.compressed())

                            negs, poss = 0, 0
                            biggest_neg, biggest_pos = 0, 0

                            for diff in diffs:

                                if diff > 0:
                                    if negs > biggest_neg: biggest_neg = negs
                                    negs = 0
                                    poss += 1
                                else:
                                    if poss > biggest_pos: biggest_pos = poss
                                    poss = 0
                                    negs += 1

                            if (biggest_neg < 10) and (biggest_pos < 10) and not storm:

                                # not a hurricane, so mask
                                station.qc_flags[month_ranges[year, month, 0]:month_ranges[year, month, 1], flag_col[v]] = 1
                                if plots or diagnostics:
                                    print "No Storm or Hurricane in %i %i - flagging\n" % (month + 1, y + start.year)
                                else:
                                    logfile.write("No Storm or Hurricane in %i %i - flagging\n" % (month + 1, y + start.year))
                            else:
                                # hurricane
                                if plots or diagnostics:
                                    print "Storm or Hurricane in %i %i - not flagging\n" % (month + 1, y + start.year)
                                else:
                                    logfile.write("Storm or Hurricane in %i %i - not flagging\n" % (month + 1, y + start.year))

                            if plots:
                                # plot showing the pressure, pressure first differences and the wind speeds
                                plot_times = utils.times_hours_to_datetime(station.time.data[month_ranges[year, month][0]:month_ranges[year, month][1]], start)

                                evc_plot_slp_wind(plot_times, slp_month, diffs, median_slp, slp_MAD, winds_month, median_wind, wind_MAD)

                    else:
                        iqr_threshold = 8.

                        if (rep_accuracies[y] != reporting_resolution) and \
                                (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 10.

                        if np.abs(variances[y] - median_variance) / iqr_variance > iqr_threshold:

                            if diagnostics:
                                print "flagging {} {}".format(year + start.year, calendar.month_name[month + 1])
                            # remove the data
                            station.qc_flags[month_ranges[year, month, 0]:month_ranges[year, month, 1], flag_col[v]] = 1

            if plots:
                plot_variances = (variances - median_variance) / iqr_variance
                plot_variances = np.ma.masked_where(month_data_count[:, month] < DATA_COUNT_THRESHOLD, plot_variances)

                evc_plot_hist(plot_variances, iqr_threshold, "Variance Check - %s - %s" % (variable, calendar.month_name[month + 1]))

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 for T, D and SLP 21/8/2014

    station = utils.append_history(station, "Excess Variance Check")

    return  # evc
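# An isolated sketch of the storm screen used by evc above: a deep low needs a
# long run of falling pressure followed by a long run of rising pressure, so
# count the longest runs of same-signed first differences. The thresholds
# mirror the "< 10" test in the code.
import numpy as np

def longest_runs(series):
    '''Longest consecutive negative and positive runs in the first differences.'''
    diffs = np.diff(series)
    longest_neg, longest_pos, negs, poss = 0, 0, 0, 0
    for diff in diffs:
        if diff > 0:
            longest_neg = max(longest_neg, negs)
            negs = 0
            poss += 1
        else:
            longest_pos = max(longest_pos, poss)
            poss = 0
            negs += 1
    return max(longest_neg, negs), max(longest_pos, poss)

# a smooth 12-step drop and recovery in SLP registers as two long runs
example_slp = np.concatenate([np.linspace(1010, 980, 13), np.linspace(980, 1010, 13)])
neg_run, pos_run = longest_runs(example_slp)  # both >= 10, so storm-like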
def neighbour_checks(station_info, restart_id="", end_id="", distances=np.array([]), angles=np.array([]), second=False, masking=False, doZip=False, plots=False, diagnostics=False):
    """
    Run through neighbour checks on list of stations passed

    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param str restart_id: station ID to restart a truncated run from
    :param str end_id: station ID to end a truncated run at
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations
    :param bool doZip: gzip the output files
    :param bool plots: do plots
    :param bool diagnostics: extra verbose output
    """
    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncate the array
    neighbour_elevations = np.array(station_info[:, 3], dtype=float)
    neighbour_ids = np.array(station_info[:, 0])
    neighbour_info = np.array(station_info[:, :])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex: endindex + 1]
            distances = distances[startindex:endindex + 1, :]
            angles = angles[startindex:endindex + 1, :]
        else:
            station_info = station_info[startindex:]
            distances = distances[startindex:, :]
            angles = angles[startindex:, :]
    else:
        station_info = station_info[startindex:]
        distances = distances[startindex:, :]
        angles = angles[startindex:, :]

    # process each station
    for st, stat in enumerate(station_info):

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a')  # append to file if second iteration
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:

            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:

            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5)  # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # select neighbours
        neighbour_distances = distances[st, :]
        neighbour_bearings = angles[st, :]

        # have to add in start index so that can use location in distance file
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st + startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour", "Distance", "Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n])
        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour", "Distance", "Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                # but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second=second, diagnostics=diagnostics, plots=plots)

                    # now read in final set of neighbours and process

                    neigh_flags = np.zeros(len(station.time.data))  # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data))  # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data))  # number of neighbours with DPD set at each time stamp
                    reporting_accuracies = np.zeros(len(neighbours))  # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)])  # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id=False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance=neighbour_distances[nn_loc], diagnostics=diagnostics, plots=plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh, variable).data)

                        dpd_flags += neigh.qc_flags[:, 31]
                    # gone through all neighbours

                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3), (neigh_flags[some_flags].astype("float") / neigh_count[some_flags] > 2. / 3.)))

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1
                    else:
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))

                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots=plots, diagnostics=diagnostics)

                else:
                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))

            # variable loop
        else:
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")

        # clean up months
        qc_tests.clean_up.clu(station, ["temperatures", "dewpoints", "slp", "windspeeds", "winddirs"], [44, 45, 46, 47, 48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots=plots, diagnostics=diagnostics)

        if diagnostics or plots: raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile, FLAG_COL_DICT)

            # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            logfile.close()

    # looped through all stations

    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):
            if first:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask.nc")])
            elif second:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal2.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external2.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask2.nc")])

    print "Neighbour Checks completed\n"

    return  # neighbour_checks
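# A minimal sketch of the neighbour voting rule applied above: an observation
# is an outlier when at least three neighbours report at that time stamp and
# more than two thirds of them flag it; with fewer than three reporting
# neighbours the point is only marked as unchecked (-1).
import numpy as np

def neighbour_vote(neigh_flags, neigh_count):
    '''Return +1 outlier / -1 unchecked / 0 pass for each time stamp.'''
    votes = np.zeros(len(neigh_flags), dtype=int)
    flagged, = np.where(neigh_flags > 0)
    enough = neigh_count[flagged] >= 3
    majority = neigh_flags[flagged].astype(float) / neigh_count[flagged] > 2. / 3.
    votes[flagged[np.logical_and(enough, majority)]] = 1
    votes[flagged[~enough]] = -1
    return votes

# 3 of 4 neighbours agreeing trips the flag; 1 of 2 cannot be checked
votes = neighbour_vote(np.array([3, 1, 0]), np.array([4, 2, 4]))  # -> [1, -1, 0]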
def occ(station, variable_list, flag_col, datastart, logfile, diagnostics=False, plots=False, second=False):
    '''
    Check for odd clusters of data surrounded by missing values - clusters of up
    to 6 observations within 24hr, with at least 48hr of missing data on each side

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime datastart: dataset start time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: do extra verbose output
    :param bool plots: do plots
    :param bool second: run for second time
    :returns:
    '''

    # the four options of what to do with each observation
    # the dict values are subroutines, selected by the current state and called below
    # all subroutines have to take the same set of inputs
    options = {0: occ_normal, 1: occ_start_cluster, 2: occ_in_cluster, 3: occ_after_cluster}

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        filtered_data = utils.apply_filter_flags(st_var)

        var_flags = station.qc_flags[:, flag_col[v]]

        prev_flag_number = 0
        if second:
            # count currently existing flags:
            prev_flag_number = len(var_flags[var_flags != 0])

        # using IDL copy as method to ensure reproducibility (initially)
        oc_details = OddCluster(st_var.mdi, st_var.mdi, 0, st_var.mdi, st_var.mdi, -1)

        obs_type = 1

        for time in station.time.data:

            if filtered_data.mask[time] == False:
                # process observation point using subroutines, called from the dict above

                if plots and (obs_type == 3) and (time - oc_details.end >= 48):
                    # do plotting if matches flagging criteria
                    oc_plots(station, oc_details, time, datastart, filtered_data, variable)

                oc_details, obs_type = options[obs_type](oc_details, obs_type, time, var_flags)
            else:
                # have missing data
                if obs_type == 2:
                    obs_type = 3
                elif obs_type == 0:
                    obs_type = 1

        station.qc_flags[:, flag_col[v]] = var_flags

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Odd Cluster", variable, len(flag_locs[0]) - prev_flag_number, noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Odd Cluster", variable, len(flag_locs[0]) - prev_flag_number)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    # matches 032070 temperature 26/8/2014
    station = utils.append_history(station, "Isolated Odd Cluster Check")

    return  # occ
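# A toy walk-through of the dispatch-table pattern occ drives above: observed
# points are routed through a dict of handlers keyed on the current state, and
# missing points move state 2 (in cluster) to 3 (after cluster) and 0 (normal)
# to 1 (possible cluster start). These lambda handlers just return an assumed
# next state; the real ones (occ_normal etc.) also inspect time gaps and update
# the OddCluster details and flags, so this is a simplification.
handlers = {0: lambda t: 0,   # normal data: stay normal
            1: lambda t: 2,   # data after a long gap: start a candidate cluster
            2: lambda t: 2,   # still inside the candidate cluster
            3: lambda t: 0}   # data resumed after the cluster: back to normal

state = 1
for obs_present in [True, True, False, False, True]:  # a short synthetic record
    if obs_present:
        state = handlers[state](None)
    elif state == 2:
        state = 3
    elif state == 0:
        state = 1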
def dmc(station, variable_list, full_variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False):
    '''
    Method copied from check_duplicates.pro

    :param obj station: station object with suitable attributes (see netcdf_procs.py)
    :param list variable_list: list of netcdf variables to process
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column to set in flag array
    :param datetime start: data start
    :param datetime end: data end
    :param file logfile: logfile to store outputs
    :param bool diagnostics: extra verbosity
    :param bool plots: do plots
    '''
    MIN_DATA_REQUIRED = 20  # obs per month

    # get array of Nx2 start/end pairs
    month_ranges = utils.month_starts_in_pairs(start, end)

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # double loop structure - not ideal
        duplicated = np.zeros(len(month_ranges))

        for sm, source_month in enumerate(month_ranges):

            if diagnostics:
                print "Month %i of %i" % (sm + 1, len(month_ranges))

            source_data = st_var.data[source_month[0]:source_month[1]]

            if duplicated[sm] == 0:
                # don't repeat if already a duplicated month

                for tm, target_month in enumerate(month_ranges[sm + 1:]):

                    target_data = st_var.data[target_month[0]:target_month[1]]

                    # match the data periods
                    overlap = np.min([len(source_data), len(target_data)])
                    s_data, t_data = source_data[:overlap], target_data[:overlap]
                    s_valid, t_valid = np.where(s_data.compressed() != st_var.fdi), \
                        np.where(t_data.compressed() != st_var.fdi)

                    # if enough of an overlap
                    if (len(s_valid[0]) >= MIN_DATA_REQUIRED) and \
                            (len(t_valid[0]) >= MIN_DATA_REQUIRED):

                        if len(s_valid[0]) < len(t_valid[0]):
                            duplicated = duplication_test(source_data, target_data, s_valid, sm, tm, source_month, target_month, duplicated, diagnostics, station.qc_flags, flag_col[v])
                        else:
                            # swap the list of valid points
                            duplicated = duplication_test(source_data, target_data, t_valid, sm, tm, source_month, target_month, duplicated, diagnostics, station.qc_flags, flag_col[v])

                        if plots:
                            dmc_plot(s_data, t_data, start, source_month[0], target_month[0], st_var.name)

                # target month
        # source month

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile, "Duplicate Month", variable, len(flag_locs[0]), noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1
    # variable list

    utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Duplicate Months", plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Duplicate Months Check")

    return  # dmc
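# A self-contained sketch of the month-against-month comparison dmc performs:
# truncate the two months to a common length and test whether every valid pair
# of values matches. The real duplication_test also handles flagged values and
# writes into the qc_flags array; NaN stands in here for missing data.
import numpy as np

def months_duplicated(source, target, min_required=20):
    '''True if the overlapping, valid parts of two monthly series are identical.'''
    overlap = min(len(source), len(target))
    s, t = source[:overlap], target[:overlap]
    valid = ~(np.isnan(s) | np.isnan(t))
    if valid.sum() < min_required:  # mirrors MIN_DATA_REQUIRED above
        return False
    return bool(np.all(s[valid] == t[valid]))

january = np.array([2.5, 3.0, np.nan, 4.0] * 10)
suspect = np.array([2.5, 3.0, 9.9, 4.0] * 10)  # differs only where january is missing
# months_duplicated(january, suspect) -> True, so this pair would be flagged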
def do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, start, logfile, plots=False, diagnostics=False):
    '''
    Set up and run the unflagging process for the specified tests

    :param MetVar station: station object
    :param string variable: variable to process
    :param array all_data: array containing all neighbour obs for full time period
    :param array reporting_accuracies: reporting accuracy for each neighbour
    :param array neigh_count: number of neighbours with data at each time stamp
    :param array dpd_flags: number of neighbours that have DPD set at each time stamp
    :param dict FLAG_COL_DICT: look-up dictionary from variable to its flag-array columns
    :param datetime start: start of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    :param bool diagnostics: extra verbose output
    '''

    # unflagging using neighbours

    # np.ma.median is known to be slow, hence bn_median - see:
    #   https://github.com/astropy/ccdproc/issues/74
    #   https://github.com/astropy/ccdproc/blob/122cdbd5713140174f057eaa8fdb6f9ce03312df/docs/ccdproc/bottleneck_example.rst
    mean_of_neighbours = bn_median(all_data, axis=0)
    std_of_neighbours = median_absolute_deviation(all_data, axis=0)

    # where the spread of neighbour observations is less than 1/2
    # of the maximum reporting accuracy, use that value as a floor
    std_of_neighbours[std_of_neighbours < 0.5 * max(reporting_accuracies)] = 0.5 * max(reporting_accuracies)

    # create series of normalised differences of obs from neighbour mean
    st_var = getattr(station, variable)
    normalised_differences = np.ma.abs(st_var.data - mean_of_neighbours) / std_of_neighbours

    for qc_test in ["climatological", "gap", "odd", "dpd"]:

        if qc_test == "dpd" and variable == "dewpoints":
            flags = station.qc_flags[:, UNFLAG_COL_DICT[qc_test][variable]]
            unset_locs = unflagging_locs(normalised_differences, flags, neigh_count, dpd_count=dpd_flags)
        elif qc_test == "dpd":
            # only unflag DPD on dewpoints
            continue
        elif qc_test == "gap" and variable != "slp":
            # only unflag gap check on slp observations
            continue
        else:
            flags = station.qc_flags[:, UNFLAG_COL_DICT[qc_test][variable]]
            if qc_test == "gap" or qc_test == "climatological":
                # only tentative flags
                unset_locs = unflagging_locs(normalised_differences, flags, neigh_count, flag_value=2)
            else:
                unset_locs = unflagging_locs(normalised_differences, flags, neigh_count)

        if len(unset_locs) > 0:
            station.qc_flags[unset_locs, UNFLAG_COL_DICT[qc_test][variable]] = 0

            # need to unflag attribute if and only if no other flags are set
            subset_flags = station.qc_flags[:, FLAG_COL_DICT[variable]]
            total_flags = np.sum(subset_flags[unset_locs, :], axis=1)
            clean_locs = np.where(total_flags == 0)
            st_var.flags[unset_locs[clean_locs]] = 0

        # and print result
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Unflagging " + qc_test, variable, len(unset_locs), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Unflagging " + qc_test, variable, len(unset_locs))

        if plots:
            if len(unset_locs) > 0:
                plot_outlier(station, variable, unset_locs, all_data, start)

    station = utils.append_history(station, "Unflagging - " + variable)

    return # do_unflagging
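# bn_median() and median_absolute_deviation() are provided elsewhere in the
# codebase; bn_median is assumed to be a bottleneck-backed stand-in for the
# slow np.ma.median, along the lines of the ccdproc example linked above.
# A minimal sketch (requires the bottleneck package; not necessarily the
# exact helper used here):

import bottleneck as bn

def _sketch_bn_median(masked_array, axis=None):
    # fill masked entries with NaN so bottleneck's nanmedian skips them
    data = masked_array.filled(fill_value=np.NaN)
    med = bn.nanmedian(data, axis=axis)
    # re-mask any slices that were entirely missing
    return np.ma.array(med, mask=np.isnan(med))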
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    '''
    Climatological outlier check - flag observations far from the hourly
    climatology for their calendar month

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: do extra verbose output
    :param bool plots: do plots
    :param bool idl: match IDL behaviour

    :returns:
    '''

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)

        # is this needed 13th Nov 2014 RJHD
        #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        for month in range(12):

            hourly_climatologies = np.zeros(24)
            hourly_climatologies.fill(st_var.mdi)

            # append all e.g. Januaries together
            this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:, month, :], st_var.data, hours = True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:, month, :], all_filtered, hours = True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs
            this_month = np.ma.array(this_month)
            this_month = this_month.reshape(-1, 24)

            this_month_filtered = np.ma.array(this_month_filtered)
            this_month_filtered = this_month_filtered.reshape(-1, 24)

            # get hourly climatology for each month
            for hour in range(24):

                this_hour = this_month[:, hour]

                # need to have data if this is going to work!
                if len(this_hour.compressed()) > 0:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.sum(this_hour) / (len(this_hour) - 1)
                    else:
                        this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.mean(this_hour)

            if len(this_month.compressed()) > 0:
                # can get stations with few obs in a particular variable

                # anomalise each hour over month appropriately
                anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0], 1))
                anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0], 1))

                if len(anomalies.compressed()) >= 10:
                    iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2. # to match IDL
                    if iqr < 1.5:
                        iqr = 1.5
                else:
                    iqr = st_var.mdi

                normed_anomalies = anomalies / iqr
                normed_anomalies_filtered = anomalies_filtered / iqr

                # get average anomaly for year
                year_ids = np.array(year_ids)
                monthly_vqvs = np.ma.zeros(month_ranges.shape[0])
                monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])]
                for year in range(month_ranges.shape[0]):
                    year_locs = np.where(year_ids == year)
                    this_year = normed_anomalies_filtered[year_locs, :]

                    if len(this_year.compressed()) > 0:
                        # need to have data for this to work!
                        if idl:
                            monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1))
                        else:
                            monthly_vqvs[year] = np.ma.median(this_year)
                    else:
                        monthly_vqvs.mask[year] = True

                # low pass filter
                normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0])

                # copy from distributional_gap.py - refactor!
                # get the threshold value
                bins, bincenters = utils.create_bins(normed_anomalies, 1.)

                hist, binEdges = np.histogram(normed_anomalies, bins = bins)

                gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(normed_anomalies), sig = np.std(normed_anomalies))
                minimum_threshold = round(1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

                if diagnostics:
                    print iqr, minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)
                    print gaussian
                    print hist

                if plots:
                    coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = minimum_threshold, sub_par = "observations")

                uppercount = len(np.where(normed_anomalies > minimum_threshold)[0])
                lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0])

                these_flags = station.qc_flags[:, flag_col[v]]
                gap_plot_values, tentative_plot_values = [], []

                # find the gaps and apply the flags

                gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size = 1) # in DGC it is 2
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :], normed_anomalies, these_flags, year_ids,
                                             minimum_threshold, gap_start,
                                             upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size = 1) # in DGC it is 2
                these_flags, gap_plot_values, tentative_plot_values = \
                    coc_find_and_apply_flags(month_ranges[:, month, :], normed_anomalies, these_flags, year_ids,
                                             minimum_threshold, gap_start,
                                             upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                station.qc_flags[:, flag_col[v]] = these_flags

                if uppercount + lowercount > 1000:
                    #print "not sorted spurious stations yet"
                    pass

                if plots:
                    import matplotlib.pyplot as plt
                    hist, binEdges = np.histogram(tentative_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, c = 'orange', ls = '-', label = 'tentative', where = 'mid')

                    hist, binEdges = np.histogram(gap_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where = 'mid')

                    import calendar
                    plt.text(0.1, 0.9, calendar.month_name[month + 1], transform = plt.gca().transAxes)
                    leg = plt.legend(loc = 'lower center', ncol = 4, bbox_to_anchor = (0.5, -0.2), frameon = False, prop = {'size':13}, labelspacing = 0.15, columnspacing = 0.5)
                    plt.setp(leg.get_title(), fontsize = 14)

                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite = True)
            print "where\n"
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags, noWrite = True)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags, noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]))
            logfile.write("where\n")
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags)

    # firm flags match 030220

    station = utils.append_history(station, "Climatological Check")

    return # coc
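# utils.invert_gaussian() lives in the shared utilities; as a worked
# illustration of the threshold derivation in coc() above, inverting the
# fitted curve p(x) = A * exp(-(x - mu)**2 / (2 * sig**2)) for the
# positive-side x at which it falls to a given frequency y gives
#   x = mu + sig * sqrt(-2 * ln(y / A))
# A hypothetical standalone helper, parameterised like the fit above
# (amplitude, mean, width):

def _sketch_invert_gaussian(y, amplitude, mu, sig):
    # distance beyond the mean at which the Gaussian drops to height y
    return mu + sig * np.sqrt(-2. * np.log(y / amplitude))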