Code Example #1
def site_iter_process(valid_refs, c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    #for ref_i in range(len(valid_refs)):
    data_valid = True

    site_ref = valid_refs[c]
    print 'Current Ref is = ', site_ref, c

    s_files = glob.glob(
        '/work/home/db876/observations/surface/%s/CAPMON/ozon_smpls_%s*' %
        (species, site_ref))
    site_files = []
    for y in year_array:
        for f in s_files:
            if str(y) in f:
                site_files.append(f)

    site_files = modules.natsorted(site_files)

    yymmdd = []
    hhmm = []
    vals = []

    for file_i in range(len(site_files)):

        count = 0
        meta_start = -99999
        start_read_1 = False
        start_read_2 = False

        with open(site_files[file_i], 'rb') as f:
            reader = csv.reader(f, delimiter=',')
            print site_files[file_i]
            for row in reader:
                #print count
                #break out of loop at bottom of file
                if (start_read_2 == True) & (row[0] == '*TABLE ENDS'):
                    break

                #get metadata
                try:
                    if (row[0] == '*TABLE NAME') & (row[1]
                                                    == 'Site information'):
                        meta_start = count + 2
                except:
                    pass
                if count == meta_start:
                    siteid_i = row.index('Site ID: standard')
                    sitename_i = row.index('Description')
                    lat_i = row.index('Latitude: decimal degrees')
                    lon_i = row.index('Longitude: decimal degrees')
                    try:
                        alt_i = row.index(
                            'Ground elevation: above mean sea level')
                    except:
                        alt_i = row.index('Ground altitude')
                    class_i = row.index('Site land use')

                if count == (meta_start + 6):
                    latitude = row[lat_i]
                    longitude = row[lon_i]
                    altitude = row[alt_i]
                    raw_class_name = row[class_i]
                    site_name = row[sitename_i]

                #get data
                if start_read_2 == True:
                    #read dates, times, and vals
                    date = row[8]
                    time = row[9]
                    yymmdd.append(date[:4] + date[5:7] + date[8:])
                    hhmm.append(time[:2] + time[3:])
                    quality_code = row[13]
                    #if flag not equal to V0 then make -99999
                    if quality_code == 'V0':
                        vals = np.append(vals, np.float64(row[12]))
                    else:
                        vals = np.append(vals, -99999)

                try:
                    if (row[0] == '*TABLE NAME') & (row[1] == 'OZONE_HOURLY'):
                        start_read_1 = True
                except:
                    pass

                if (start_read_1 == True) & (row[0] == '*TABLE COLUMN UNITS'):
                    unit = row[12]

                if (start_read_1 == True) & (row[0] == '*TABLE BEGINS'):
                    start_read_2 = True
                count += 1

    #add to n_obs_all
    n_all += len(vals)
    n_after_nometa += len(vals)

    #convert data < 0 to -99999
    test_inv = vals < 0
    vals[test_inv] = -99999

    #create max possible grid
    full_data = np.empty(n_hours)
    full_data_after_flagsandlod = np.empty(n_hours)
    big_n_dup_array = np.zeros(n_hours)
    full_data[:] = -99999
    full_data_after_flagsandlod[:] = -99999

    #put vals into full grid
    date_con = np.array(yymmdd).astype(int)
    time_con = np.array(hhmm).astype(int)

    #remove data < 1970 and >= 2015
    test_inds = (date_con >= 19700101) & (date_con < 20150101)
    date_con = date_con[test_inds]
    time_con = time_con[test_inds]
    vals = vals[test_inds]

    #set st_big and mm_big
    st_big = ['continuous'] * len(vals)
    mm_big = ['ultraviolet photometry'] * len(vals)

    #get obs valid
    test = vals != -99999
    n_obs_valid = len(vals[test])
    n_after_flagsandlod += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    converted_time = modules.date_process(date_con, time_con, start_year)
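    #round to 5 decimal places so the fractional-day times compare exactly equal to the synthetic grid values below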
    converted_time = np.round(converted_time, decimals=5)
    syn_grid_time = np.arange(0, n_days, 1. / 24)
    syn_grid_time = np.round(syn_grid_time, decimals=5)
    raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    vals = np.array(vals)
    full_data_after_flagsandlod[raw_indices] = vals
    raw_st = np.copy(st_big)
    raw_mm = np.copy(mm_big)

    # test and remove duplicate and overlap points
    converted_time, vals, mm_big, st_big, na = modules.remove_duplicate_points(
        site_ref, converted_time, vals, mm_big, st_big, 'blank', output_res)
    test = vals >= 0
    n_obs_valid = int(len(vals[test]))
    n_after_duplicate += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    full_data[indices] = vals

    #get metadata
    try:
        lat = np.float32(latitude)
    except:
        lat = 'na'
    try:
        lon = np.float32(longitude)
    except:
        lon = 'na'
    try:
        alt = np.float32(altitude)
    except:
        alt = 'na'
    unit = str(unit)
    raw_class_name = str(raw_class_name)
    site_name = str(site_name)
    country = 'Canada'
    contact = 'Dave MacTavish, 4905 Dufferin St., Toronto ON, CANADA, M3H 5T4, [email protected]'

    #set data tz - all CAPMON times are UTC
    data_tz = 0
    all_tz = [data_tz]

    key_meta = [lat, lon, alt]

    #set site file resolution
    file_res = 'H'

    #get sampling/instrument grids
    raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(
        site_ref, process_group, species, raw_st, raw_mm,
        full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list,
        unknown_mm_refs_list, no2_type)

    #do quality checks
    data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control(
        site_ref, species, file_res, no2_type, grid_dates, full_data,
        big_n_dup_array, 0, raw_st_grid, p_st_grid,
        p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid,
        p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta,
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod,
        n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata,
        inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution,
        inv_badmeasurementmethod, n_after_badmeasurementmethod)
    if data_valid == False:
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na'
        ]
        return c, ['na'], ['na'], ['na'], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros(1)

    #set processed unit
    p_unit = 'ppbv'

    #get local timezone
    try:
        local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True)
        pytz_obj = pytz.timezone(local_tz_name)
        datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1))
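        #note: for a negative UTC offset, utcoffset() returns e.g. timedelta(days=-1, seconds=68400),
        #so .seconds is always non-negative and the whole-hour offset is recovered as -(24 - seconds/3600)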
        if datetime_offset < datetime.timedelta(0):
            local_tz = -(24 - int(datetime_offset.seconds / 60 / 60))
        else:
            local_tz = int(datetime_offset.seconds / 60 / 60)
    except:
        local_tz = 'na'
        print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref)
        unknown_local_tz_list.append(site_ref)

    #pack meta
    meta = [
        lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz,
        local_tz, site_name, country, contact
    ]

    #if blank strings in meta then convert to 'na'
    for i in range(len(meta)):
        try:
            if meta[i].strip() == '':
                meta[i] = 'na'
        except:
            pass

    print set(raw_st_grid)
    print set(raw_mm_grid)
    print set(p_st_grid)
    print set(p_mm_grid)
    print meta

    exit_c_list = np.array([
        inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
        inv_badmeasurementmethod
    ])
    n_c_list = np.array([
        n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
        n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
        n_after_badmeasurementmethod
    ])
    print 'exit counts = ', exit_c_list
    print 'n obs counts = ', n_c_list

    unknown_list = [
        unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
    ]

    return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array
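
Throughout these examples, observations are placed onto a fixed hourly time axis by matching fractional-day timestamps against a synthetic grid with np.searchsorted. Below is a minimal, self-contained sketch of that step with made-up timestamps and values; in the real code the times come from modules.date_process and n_days covers the whole output period.

import numpy as np

n_days = 3                                              #assumed grid length in days
syn_grid_time = np.round(np.arange(0, n_days, 1. / 24), decimals=5)

#observation times in fractional days since the grid start, already rounded to 5 decimals
converted_time = np.round(np.array([0.0, 0.04167, 1.5, 2.95833]), decimals=5)
obs_vals = np.array([30.1, 31.5, 28.7, 33.2])

#place each observation in its matching hourly slot; untouched slots stay at -99999
full_data = np.full(n_days * 24, -99999.)
indices = np.searchsorted(syn_grid_time, converted_time, side='left')
full_data[indices] = obs_vals
print indices                                           #[ 0  1 36 71]
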
Code Example #2
def site_iter_process(valid_refs,c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    #read files site at a time
    #for ref_i in range(len(valid_refs)):
    site_ref = valid_refs[c]

    all_latitudes = []
    all_longitudes = []
    all_altitudes = []
    all_unit = []
    all_site_name = []
    all_country = []
    all_contact = []
    mm_big = []
    meta_valid_list = []

    data_valid = True

    print 'Current Ref is = ', site_ref,c
    #find if sites have full valid range from start year and finishing in end year
    s_files = glob.glob('/work/home/db876/observations/surface/%s/EMEP/%s*'%(fname_species,site_ref))
    year_files = [file.replace("/work/home/db876/observations/surface/%s/EMEP/"%(fname_species), "") for file in s_files]
    cut_year_files = [file[8:12] for file in year_files]
    site_files = []
    for y in year_array:
        for i in range(len(s_files)):
            if str(y) in cut_year_files[i]:
                site_files.append(s_files[i])
                  
    site_files = modules.natsorted(site_files)
    
    #test for duplicate file years, if duplicates break processing
    file_years = []
    for file in site_files:
        last_file_split = file.split('/')[-1]
        file_years=np.append(file_years,last_file_split[8:12])
    for y in year_array:
        test = file_years == str(y)
        if len(file_years[test]) > 1:
            print 'Site has duplicate files for %s. Breaking processing'%(y)
            1+'a'

    if site_files == []:
        print 'No valid files for site\n'
        return
    
    #remove daily/monthly files if necessary
    if output_res == 'H':
        del_i = []
        for i in range(len(site_files)):
            if '.1d.' in site_files[i]:
                del_i.append(i)
            elif '.1mo.' in site_files[i]:
                del_i.append(i)
        site_files=np.delete(site_files,del_i)
    elif output_res == 'HD':
        del_i = []
        for i in range(len(site_files)):
            if '.1mo.' in site_files[i]:
                del_i.append(i)
        site_files=np.delete(site_files,del_i)
    
    for y in year_array:
        bad_meta = False
        got_year = False
        for file in site_files:
            last_file_split = file.split('/')[-1]
            if str(y) in last_file_split[8:12]:
                got_year = True
                break
        if got_year == False:
            #fill in data for missing year
            timedelta_diff = datetime.date(y+1, 1, 1) - datetime.date(y, 1, 1)
            ndays_missing = timedelta_diff.days       
            continue
    
        count = 0
        with open(file, 'rb') as f:
            reader = csv.reader(f,delimiter=' ')
            print file
            for row in reader:
                try:
                    row = filter(lambda a: a != '', row)
                except:
                    pass
                try:
                    row = filter(lambda a: a != ',', row)
                except:
                    pass
                                
                #get start date of file
                if row[0] == 'Startdate:':
                    data = row[1]
                    s_yyyy = data[:4]
                    s_mm = data[4:6]
                    s_dd = data[6:8]
                    s_hh = data[8:10]
                    s_min = data[10:12]
                    start_datetime = datetime.datetime(int(s_yyyy),1,1,0,0)
                
                #get unit
                if row[0] == 'Unit:':
                    try:
                        if len(row) == 3:
                            unit_part1 = row[1]
                            unit_part2 = row[2]
                            unit = unit_part1+'_'+unit_part2
                        
                        elif len(row) == 2:
                            unit = row[1] 
                        all_unit.append(unit)
                    except:
                        bad_meta = True
        
                #get resolution
                if row[0] == 'Resolution':
                    if row[1] == 'code:':
                        file_res = row[2]
                        print 'Resolution = %s'%file_res
                
                #get latitude
                if row[0] == 'Station':
                    if row[1] == 'latitude:':
                        latitude = row[2]
                        all_latitudes.append(latitude)
            
                #get longitude
                if row[0] == 'Station':
                    if row[1] == 'longitude:':
                        longitude = row[2]
                        all_longitudes.append(longitude)
                    
                #get altitude
                if row[0] == 'Station':
                    if row[1] == 'altitude:':
                        altitude = row[2][:-1]
                        all_altitudes.append(altitude)
                        
                #get site name
                if row[0] == 'Station':
                    if row[1] == 'name:':
                        site_name = row[2]
                        all_site_name.append(site_name)
            
                #get period
                if row[0] == 'Period':
                    period_code = row[2]
                
                #get stats method
                if row[0] == 'Statistics:':
                    try:
                        st = row[1] + row[2]
                        if st != 'arithmeticmean':
                            print 'Not Arithmetic Mean!'
                            print row[1]
                            print 1+'a'  
                    except:
                        print 'Not Arithmetic Mean!'
                        print row[1]
                        print 1+'a'
            
                #get instrument method and name
                if row[0] == 'Instrument':
                    if row[1] == 'type:':
                        mm_list = row[2:]
                        if len(mm_list) > 1:
                            site_mm = ''
                            for x in range(len(mm_list)):
                                site_mm = site_mm+mm_list[x]+' '
                            site_mm = site_mm.strip()
                        else:
                            site_mm = mm_list[0]
                
                    if row[1] == 'name:':
                        mn_list = row[2:]
                        if len(mn_list) > 1:
                            site_mn = ''
                            for x in range(len(mn_list)):
                                site_mn = site_mn+mn_list[x]+' '
                            site_mn = site_mn.strip()
                        else:
                            site_mn = mn_list[0]
                
                #get method ref
                if row[0] == 'Method':
                    if row[1] == 'ref:':
                        try:
                            mf_list = row[2:]
                            if len(mf_list) > 1:
                                site_mf = ''
                                for x in range(len(mf_list)):
                                    site_mf = site_mf+mf_list[x]+' '
                                site_mf = site_mf.strip()
                            else:
                                site_mf = mf_list[0]
                        except:
                            site_mf = ''
                
                    #put together instrument type+instrument_name+method_ref
                    mm = site_mm+site_mn+site_mf
                
                #get contact
                if row[0] == 'Originator:':
                    try:
                        contact_list = row[1:]
                        if len(contact_list) > 1:
                            site_contact = ''
                            for x in range(len(contact_list)):
                                site_contact = site_contact+contact_list[x]+' '
                            site_contact = site_contact.strip()
                        else:
                            site_contact = contact_list[0]
                    except:
                        site_contact = ''
                    all_contact.append(site_contact)
                
                #get country
                site_country = EMEP_COUNTRIES(file.split('/')[-1][:2])
                all_country.append(site_country)
                
                if row[0] == 'starttime':
                    skip_n = count+1
                    if species == 'ISOP':
                        spec_ind = row.index('C5H8')
                        try:
                            flag_ind = row.index('flag_C5H8')
                        except:
                            flag_ind = row.index('flag')
                    else:
                        spec_ind = row.index(species)
                        try:
                            flag_ind = row.index('flag_'+species)
                        except:
                            flag_ind = row.index('flag')
                    
                count+=1
            
        read = np.loadtxt(file,dtype="f8,f8,f8,f8",skiprows=skip_n,usecols=(0,1,spec_ind,flag_ind),unpack=True)
        read = np.array(read)
        times_since_start = read[0,:]
        endtimes_since_start = read[1,:]
        conc = read[2,:]
        conc = np.array(conc).astype('float64')
        flags = read[3,:]

        dates = []
        times = []
        enddates = []
        endtimes = []
        times_since_start = np.float64(times_since_start)   
        endtimes_since_start = np.float64(endtimes_since_start)  
        for x in range(len(times_since_start)):
            days_since_start = math.trunc(times_since_start[x])
            enddays_since_start = math.trunc(endtimes_since_start[x])
            remainder = times_since_start[x] - days_since_start
            remainder_end = endtimes_since_start[x] - enddays_since_start
            unrounded_hour = remainder*24
            unrounded_hour_end = remainder_end*24
            hour = np.round(unrounded_hour)
            hour_end = np.round(unrounded_hour_end)
            time_delta = datetime.timedelta(days = days_since_start,hours = hour)
            time_delta_end = datetime.timedelta(days = enddays_since_start,hours = hour_end)
            calc_datetime = start_datetime + time_delta
            calc_datetime_end = start_datetime + time_delta_end
            calc_yyyymmdd = calc_datetime.strftime("%Y%m%d") 
            calc_hhmm = calc_datetime.strftime("%H%M")  
            end_calc_yyyymmdd = calc_datetime_end.strftime("%Y%m%d") 
            end_calc_hhmm = calc_datetime_end.strftime("%H%M")
            dates.append(calc_yyyymmdd)
            times.append(calc_hhmm)
            enddates.append(end_calc_yyyymmdd)
            endtimes.append(end_calc_hhmm)
            
        conc = np.float64(conc)
        flags = np.float64(flags)
        
        #add to n_obs_all
        n_all += len(conc)
        
        #IF bad_meta == True then set all file vals as nans
        if bad_meta == True:
            conc[:] = np.NaN
        meta_valid_list.append(bad_meta)
        
        #DO INLINE INVALID AND FLAG CONVERT to NaN
        test = conc < 0
        conc[test] = np.NaN
        
        test = flags != 0
        conc[test] = np.NaN
            
        #convert units by line (only if value is >= 0)
        try:
            if (unit.lower() != 'ppb') & (unit.lower() != 'ppbv'):
                if unit == 'ug/m3':
                    #calculate conversion factor from ug/m3 assuming 293 K and 1013 hPa - in EU LAW
                    #conv_fact = R/MW * T(K) / (P(hPa)/10)
                    conv_fact = 8.3144/mol_mass*(293.)/(1013./10.)
                    conc = conv_fact*conc
                elif unit == 'ug_N/m3':
                    conv_fact = 8.3144/14.00674*(293.)/(1013./10.)
                    conc = conv_fact*conc
                elif (unit == 'ppm') or (unit == 'ppmv'):
                    conc = conc*1e3
                    #print 'Converting Units from ppmv to ppbv'
                elif (unit == 'ppt') or (unit == 'pptv'):
                    conc = conc/1e3
                    #print 'Converting Units from pptv to ppbv'
                else:
                    print 'Unknown Unit'
                    1+'a'
        except:
            pass
        
        #remove 9.999 from ISOP dataset
        if species == 'ISOP':
            test = conc == 9.999
            conc[test] = np.NaN
        
        #if file resolution is daily or monthly then replicate times after point, to fill hourly data array.
        count=0
        if file_res == '1h':
            n_dups = np.zeros(len(conc))
        elif file_res == '1d':
            n_dups = []
            #if measurement method is flask, then leave the flask measurement in as an hourly measurement at the first hour of the period
            file_hours = len(dates)
            for i in range(file_hours):
                current_year = int(dates[count][:4])
                current_month = int(dates[count][4:6])
                current_day = int(dates[count][6:])
                current_hh = int(times[count][:2])
                current_mm = int(times[count][2:])
        
                next_year = int(enddates[i][:4])
                next_month = int(enddates[i][4:6])
                next_day = int(enddates[i][6:])
                next_hh = int(endtimes[i][:2])
                next_mm =  int(endtimes[i][2:])
                
                s = datetime.datetime(year = current_year, month = current_month, day = current_day, hour = current_hh, minute = current_mm)
                e = datetime.datetime(year = next_year, month = next_month, day = next_day, hour = next_hh, minute = next_mm)
                day_dates = [d.strftime('%Y%m%d') for d in pd.date_range(s,e,freq='H')][1:-1]
                day_hours = [d.strftime('%H%M') for d in pd.date_range(s,e,freq='H')][1:-1]

                dates = np.insert(dates,count+1,day_dates)
                times = np.insert(times,count+1,day_hours)
                conc = np.insert(conc,count+1,[conc[count]]*len(day_dates))

                #append to n duplicated array
                n_dups=np.append(n_dups,0)
                n_dups=np.append(n_dups,[1]*len(day_dates))

                count +=(len(day_dates)+1)
        
        elif file_res == '1mo':
            n_dups = []
            #if measurement method is flask, then leave the flask measurement in as an hourly measurement at the first hour of the period
            file_hours = len(dates)
            for i in range(file_hours):
                current_year = int(dates[count][:4])
                current_month = int(dates[count][4:6])
                current_day = int(dates[count][6:])
                current_hh = int(times[count][:2])
                current_mm = int(times[count][2:])
    
                next_year = int(enddates[i][:4])
                next_month = int(enddates[i][4:6])
                next_day = int(enddates[i][6:])
                next_hh = int(endtimes[i][:2])
                next_mm =  int(endtimes[i][2:])
    
                s = datetime.datetime(year = current_year, month = current_month, day = current_day, hour = current_hh, minute = current_mm)
                e = datetime.datetime(year = next_year, month = next_month, day = next_day, hour = next_hh, minute = next_mm)
        
                day_dates = [d.strftime('%Y%m%d') for d in pd.date_range(s,e,freq='H')][1:-1]
                day_hours = [d.strftime('%H%M') for d in pd.date_range(s,e,freq='H')][1:-1]
                dates = np.insert(dates,count+1,day_dates)
                times = np.insert(times,count+1,day_hours)
                conc = np.insert(conc,count+1,[conc[count]]*len(day_dates))
                
                #append to n duplicated array
                n_dups=np.append(n_dups,0)
                n_dups=np.append(n_dups,[1]*len(day_dates))
                
                count += (len(day_dates)+1)
        
        data = [dates,times,conc,n_dups]
        
        #put measurement methods into big list, one entry per time
        mm_big=np.append(mm_big,[mm]*len(dates))
      
        try:
            big_list = np.hstack((big_list,data))
        except:
            big_list = np.array(data)
                
    if (y == year_array[-1]):    

        #get dates and times
        date_con = big_list[0,:]
        time_con = big_list[1,:]
          
        #get vals
        vals = np.array(big_list[2,:]).astype('float64')
        
        #get n dup array
        n_dup_array = np.array(big_list[3,:]).astype(float).astype(int)

        #if all files have missing key meta then exit
        if all(i == True for i in meta_valid_list) == True:
            inv_nometa += 1
            print 'Site Invalid. No Metadata for ref'
            if no2_type == 'MOLYBDENUM':
                n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_obs_after_anyvaliddata,inv_nokeymeta,n_obs_after_nokeymeta,inv_resolution,n_obs_after_resolution,inv_badmeasurementmethod,n_obs_after_badmeasurementmethod = 0,0,0,0,0,0,0,0,0,0,0,0,0
            exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod])
            n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod])
            unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list]
            meta = ['na','na','na','na','na','na','na','na','na','na','na','na']
            exit_r = 'nometa'
            return c,['na'],['na'],['na'],False,meta,exit_c_list,n_c_list,unknown_list,exit_r,np.zeros(1)
        valid_hours_dup = np.sum(n_dup_array)
        n_after_nometa += (len(vals)-valid_hours_dup)

        #delete big list
        del big_list

        date_con = np.array(date_con).astype(int)
        time_con = np.array(time_con).astype(int)
        
        #remove data < 1970 and >= 2015
        test_inds = (date_con >= 19700101) & (date_con < 20150101)
        date_con = date_con[test_inds]
        time_con = time_con[test_inds]
        vals = vals[test_inds]
        mm_big = mm_big[test_inds]
        n_dup_array = n_dup_array[test_inds]
        
        #set st_big as 'continuous'
        st_big = ['continuous']*len(vals)
        
        #convert all Nans back to -99999
        test = np.isnan(vals)
        vals[test] = -99999
        
        #get obs valid
        test = vals >= 0
        valid_hours_dup = np.sum(n_dup_array[test])
        n_obs_valid = int(len(vals[test]) - valid_hours_dup)
        n_after_flagsandlod += n_obs_valid
        
        #create max possible species grid, measurement method and sampling type grids
        full_data = np.empty(n_hours)
        full_data_after_flagsandlod = np.empty(n_hours)
        big_n_dup_array = np.zeros(n_hours)
        full_data[:] = -99999
        full_data_after_flagsandlod[:] = -99999
        
        #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
        converted_time = modules.date_process(date_con,time_con,start_year)
        converted_time = np.round(converted_time,decimals=5)
        syn_grid_time = np.arange(0,n_days,1./24)
        syn_grid_time = np.round(syn_grid_time,decimals=5)
        raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
        vals = np.array(vals)
        full_data_after_flagsandlod[raw_indices] = vals
        raw_st = np.copy(st_big)
        raw_mm = np.copy(mm_big)
        
        # test and remove duplicate and overlap points
        converted_time,vals,mm_big,st_big,n_dup_array = modules.remove_duplicate_points(site_ref,converted_time,vals,mm_big,st_big,n_dup_array,output_res)
        test = vals >= 0
        valid_hours_dup = np.sum(n_dup_array[test])
        n_obs_valid = int(len(vals[test]) - valid_hours_dup)
        n_after_duplicate += n_obs_valid
        
        #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
        indices = np.searchsorted(syn_grid_time, converted_time, side='left')
        full_data[indices] = vals 
        big_n_dup_array[indices] = n_dup_array
    
        #get mode of metadata
        try:
            lat = np.float32(stats.mode(all_latitudes)[0][0]) 
        except:
            lat = 'na'
        try:
            lon = np.float32(stats.mode(all_longitudes)[0][0])  
        except:
            lon = 'na'
        try:
            alt = np.float32(stats.mode(all_altitudes)[0][0]) 
        except:
            alt = 'na'
        unit = stats.mode(all_unit)[0][0]
        #remove empty strings from extra meta before mode test
        try:
            site_name = stats.mode(filter(None, all_site_name))[0][0]
        except:
            site_name = 'na'
        try:
            country = stats.mode(filter(None, all_country))[0][0]
        except:
            country = 'na'
        try:
            contact = stats.mode(filter(None, all_contact))[0][0] 
        except:
            contact = 'na'
    
        #set data tz - all EMEP times are UTC
        data_tz = 0
        all_tz = [data_tz]
    
        key_meta = [lat,lon,alt]
        
        #convert file res to standard format
        if file_res == '1h':
            file_res = 'H'
        elif file_res == '1d':
            file_res = 'D'
        elif file_res == '1mo':
            file_res = 'M'
    
        #get sampling/instrument grids
        raw_st_grid,p_st_grid,p_st_grid_after_flagsandlod,raw_mm_grid,p_mm_grid,p_mm_grid_after_flagsandlod,unknown_mm_list,unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(site_ref,process_group,species,raw_st,raw_mm,full_data_after_flagsandlod,full_data,raw_indices,unknown_mm_list,unknown_mm_refs_list,no2_type)

        #do quality checks
        data_valid,full_data,valid_hours_dup,p_st_grid,p_mm_grid,n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_after_anyvaliddata,inv_nokeymeta,n_after_nokeymeta,inv_resolution,n_after_resolution,inv_badmeasurementmethod,n_after_badmeasurementmethod,exit_r = modules.primary_quality_control(site_ref,species,file_res,no2_type,grid_dates,full_data,big_n_dup_array,valid_hours_dup,raw_st_grid,p_st_grid,p_st_grid_after_flagsandlod,raw_mm_grid,p_mm_grid,p_mm_grid_after_flagsandlod,data_resolution,n_obs_valid,key_meta,n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_after_anyvaliddata,inv_nokeymeta,n_after_nokeymeta,inv_resolution,n_after_resolution,inv_badmeasurementmethod,n_after_badmeasurementmethod)
        if data_valid == False:
            exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod])
            n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod])
            unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list]
            meta = [lat,lon,alt,'na','na','na','na','na','na','na','na','na']
            return c,['na'],['na'],['na'],False,meta,exit_c_list,n_c_list,unknown_list,exit_r,np.zeros(1)

        #set metadata not available as na
        raw_class_name = 'na'
    
        #set processed unit
        p_unit = 'ppbv'
    
        #get local timezone
        try:
            local_tz_name = tz_root.tzNameAt(lat,lon,forceTZ=True)
            pytz_obj = pytz.timezone(local_tz_name)
            datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000,1,1))
            if datetime_offset < datetime.timedelta(0):
                local_tz = -(24-int(datetime_offset.seconds/60/60))
            else:
                local_tz = int(datetime_offset.seconds/60/60)
        except:
            local_tz = 'na'
            print 'TIMEZONE NOT KNOWN, SITE IS %s'%(site_ref)
            unknown_local_tz_list.append(site_ref)

        #pack meta
        meta = [lat,lon,alt,raw_class_name,file_res,unit,p_unit,data_tz,local_tz,site_name,country,contact]
    
        #if blank strings in meta then convert to 'na'
        for i in range(len(meta)):
            try:
                if meta[i].strip() == '':
                    meta[i] = 'na'
            except:
                pass
    
        print set(raw_st_grid)
        print set(raw_mm_grid)
        print set(p_st_grid)
        print set(p_mm_grid)
        print meta
    
        exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod])
        n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod])
        print 'exit counts = ', exit_c_list
        print 'n obs counts = ', n_c_list

        unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list]

        return c,full_data,p_st_grid,p_mm_grid,data_valid,meta,exit_c_list,n_c_list,unknown_list,'na',big_n_dup_array
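
The unit handling above converts ug/m3 to ppbv with conv_fact = R/MW * T(K) / (P(hPa)/10), evaluated at 293 K and 1013 hPa. A minimal sketch of that calculation follows; mol_mass is a global in the original script, so a molar mass of 48.0 g/mol (ozone) is assumed here purely for illustration.

import numpy as np

mol_mass = 48.0                                 #assumed molar mass of O3 in g/mol
conc_ugm3 = np.array([10.0, 60.0, 120.0])       #made-up concentrations in ug/m3

#conv_fact = R/MW * T(K) / (P(hPa)/10)
conv_fact = 8.3144 / mol_mass * 293. / (1013. / 10.)
conc_ppbv = conv_fact * conc_ugm3
print conc_ppbv                                 #roughly [5.0, 30.1, 60.1]
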
Code Example #3
def site_iter_process(valid_refs, c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    site_resolutions = []

    site_ref = valid_refs[c]

    data_valid = True
    print 'ref = ', site_ref, c

    if species != 'ISOP':
        site_test = all_refs == site_ref
        site_yyyymmdd = yyyymmdd[site_test]
        site_hhmm = hhmm[site_test]
        site_vals = vals[site_test]
        n_dup_array = np.array([0] * len(site_vals))
    else:
        if site_ref[0] == '0':
            site_ref = site_ref[1:]
        files = []
        site_yyyymmdd = []
        site_hhmm = []
        site_vals = []
        n_dup_array = []
        for y in all_years:
            try:
                files.append(
                    glob.glob('../CANADANAPS/VOC%s/S%s*' % (y, site_ref)))
            except:
                pass
        files = [item for sublist in files for item in sublist]
        for f in files:
            print f
            all_data = get_data(f)
            all_data = all_data.values()
            test_header_range = range(0, 10)
            for x in test_header_range:
                headers = all_data[0][x]
                if 'Isoprene' in headers:
                    header_x = x
                    break
            data_cut = all_data[0][header_x + 1:]
            var_i = headers.index('Isoprene')
            #date_i = headers.index('Sample Date')
            date_i = headers.index('Compounds')
            time_i = headers.index('START TIME')
            duration_i = headers.index('DURATION')

            for i in range(len(data_cut)):
                row_cut = data_cut[i]

                try:
                    dur = float(row_cut[duration_i])
                    if dur.is_integer() == False:
                        dur = round(dur, 0)
                except:
                    #round to nearest hour if necessary
                    if float(row_cut[duration_i].strftime("%M")) != 0:
                        if float(row_cut[duration_i].strftime("%M")) >= 30:
                            dur = float(row_cut[duration_i].strftime("%H")) + 1
                        else:
                            dur = float(row_cut[duration_i].strftime("%H"))
                    else:
                        dur = float(row_cut[duration_i].strftime("%H"))

                if dur.is_integer() == False:
                    print 'duration is float'
                    1 + 'a'

                try:
                    val = np.float64(row_cut[var_i])
                except:
                    val = -99999

                if dur == 1:
                    site_resolutions.append('H')

                    #if (val >= 0.01):
                    #    site_vals.append([val])
                    #else:
                    #    site_vals.append([-99999])
                    site_vals.append([val])

                    n_dup_array.append([0])
                    site_yyyymmdd.append([row_cut[date_i].strftime("%Y%m%d")])
                    try:
                        site_hhmm.append(
                            [row_cut[time_i][:2] + row_cut[time_i][3:5]])
                    except:
                        #round to nearest hour if necessary
                        ti = row_cut[time_i].strftime("%H%M")
                        if float(row_cut[time_i].strftime("%M")) != 0:
                            print 'non whole time = ', row_cut[time_i]
                            if float(row_cut[time_i].strftime("%M")) >= 30:
                                site_hhmm.append([
                                    datetime.time(hour=int(ti[:2]) + 1,
                                                  minute=0).strftime("%H%M")
                                ])
                            else:
                                site_hhmm.append([
                                    datetime.time(hour=int(ti[:2]),
                                                  minute=0).strftime("%H%M")
                                ])

                        else:
                            site_hhmm.append(
                                [row_cut[time_i].strftime("%H%M")])
                #deal with sample lengths > 1 hour
                else:
                    if output_res == 'H':
                        continue
                    else:
                        site_resolutions.append('D')

                        #if (val >= 0.01):
                        #    site_vals.append([val])
                        #else:
                        #    site_vals.append([-99999])
                        site_vals.append([val])

                        n_dup_array.append([0])

                        try:
                            site_yyyymmdd.append(
                                [row_cut[date_i].strftime("%Y%m%d")])
                        except:
                            print row_cut[date_i]
                            1 + 'a'
                        try:
                            site_hhmm.append(
                                [row_cut[time_i][:2] + row_cut[time_i][3:5]])
                        except:
                            #round to nearest hour if necessary
                            ti = row_cut[time_i].strftime("%H%M")
                            if float(row_cut[time_i].strftime("%M")) != 0:
                                print 'non whole time = ', row_cut[time_i]
                                if float(row_cut[time_i].strftime("%M")) >= 30:
                                    site_hhmm.append([
                                        datetime.time(
                                            hour=int(ti[:2]) + 1,
                                            minute=0).strftime("%H%M")
                                    ])
                                else:
                                    site_hhmm.append([
                                        datetime.time(
                                            hour=int(ti[:2]),
                                            minute=0).strftime("%H%M")
                                    ])

                            else:
                                site_hhmm.append(
                                    [row_cut[time_i].strftime("%H%M")])

                        current_year = int(site_yyyymmdd[-1][0][:4])
                        current_month = int(site_yyyymmdd[-1][0][4:6])
                        current_day = int(site_yyyymmdd[-1][0][6:])
                        current_hh = int(site_hhmm[-1][0][:2])
                        current_mm = int(site_hhmm[-1][0][2:])

                        s = datetime.datetime(year=current_year,
                                              month=current_month,
                                              day=current_day,
                                              hour=current_hh,
                                              minute=current_mm)
                        e = s + datetime.timedelta(hours=dur)
                        day_dates = [
                            d.strftime('%Y%m%d')
                            for d in pd.date_range(s, e, freq='H')
                        ][1:-1]
                        day_hours = [
                            d.strftime('%H%M')
                            for d in pd.date_range(s, e, freq='H')
                        ][1:-1]

                        site_yyyymmdd.append(day_dates)
                        site_hhmm.append(day_hours)
                        site_vals.append([site_vals[-1][0]] * len(day_dates))

                        #append to n duplicated array
                        n_dup_array.append([0])
                        n_dup_array.append([1] * len(day_dates))

    if species == 'ISOP':
        site_yyyymmdd = [item for sublist in site_yyyymmdd for item in sublist]
        site_hhmm = [item for sublist in site_hhmm for item in sublist]
        site_vals = [item for sublist in site_vals for item in sublist]
        n_dup_array = np.array(
            [item for sublist in n_dup_array for item in sublist])
        if len(site_ref) == 5:
            site_ref = '0' + site_ref

    site_vals = np.float64(site_vals)

    #add val to total obs count
    n_all += len(site_vals)

    #test if site_ref in meta_refs, if not then exit
    if site_ref not in meta_refs:
        print site_ref
        inv_nometa += 1
        print 'Site Invalid. No Metadata for ref'
        if no2_type == 'MOLYBDENUM':
            n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_obs_after_anyvaliddata, inv_nokeymeta, n_obs_after_nokeymeta, inv_resolution, n_obs_after_resolution, inv_badmeasurementmethod, n_obs_after_badmeasurementmethod = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na',
            'na'
        ]
        exit_r = 'nometa'
        return c, ['na'], ['na'], ['na'], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros(1)
    n_after_nometa += len(site_vals)

    #convert all invalids to -99999
    test_inv = site_vals < 0
    site_vals[test_inv] = -99999

    #create max possible grid
    full_data = np.empty(n_hours)
    full_data_after_flagsandlod = np.empty(n_hours)
    big_n_dup_array = np.zeros(n_hours)
    full_data[:] = -99999
    full_data_after_flagsandlod[:] = -99999

    #get meta
    meta_index = meta_refs.index(site_ref)
    data_tz = np.float32(meta_tz[meta_index])
    all_tz = [data_tz]
    try:
        lat = np.float32(meta_lats[meta_index])
    except:
        lat = 'na'
    try:
        lon = np.float32(meta_lons[meta_index])
    except:
        lon = 'na'
    try:
        alt = np.float32(meta_alts[meta_index])
    except:
        alt = 'na'
    raw_class_name = meta_class[meta_index]
    site_name = meta_sitenames[meta_index]
    unit = 'na'
    contact = meta_contacts[meta_index]
    country = meta_countries[meta_index]

    #adjust dates and times if tz is not equal to 0
    tz = int(data_tz)
    if tz != 0:
        for i in range(len(site_yyyymmdd)):
            #create datetime
            dt = datetime.datetime(int(site_yyyymmdd[i][:4]),
                                   int(site_yyyymmdd[i][4:6]),
                                   int(site_yyyymmdd[i][6:]),
                                   int(site_hhmm[i][:2]),
                                   int(site_hhmm[i][2:]))
            if tz > 0:
                dt = dt - datetime.timedelta(hours=int(tz))
            elif tz < 0:
                dt = dt + datetime.timedelta(hours=np.abs(int(tz)))
            site_yyyymmdd[i] = dt.strftime("%Y%m%d")
            site_hhmm[i] = dt.strftime("%H%M")

    #put vals into full grid
    date_con = np.array(site_yyyymmdd).astype(int)
    time_con = np.array(site_hhmm).astype(int)

    #remove data < 1970 and >= 2015
    test_inds = (date_con >= 19700101) & (date_con < 20150101)
    date_con = date_con[test_inds]
    time_con = time_con[test_inds]
    site_vals = site_vals[test_inds]
    n_dup_array = n_dup_array[test_inds]

    #set st_big and mm_big
    st_big = ['continuous'] * len(site_vals)

    if species == 'O3':
        mm_big = ['ultraviolet photometry'] * len(site_vals)
    elif species == 'NO':
        mm_big = ['chemiluminescence'] * len(site_vals)
    elif species == 'NO2':
        mm_big = ['chemiluminescence (conversion-molybdenum)'] * len(site_vals)
    elif species == 'CO':
        mm_big = ['non-dispersive infrared spectrometry'] * len(site_vals)
    elif species == 'ISOP':
        mm_big = ['gas chromatography mass selective detection'
                  ] * len(site_vals)

    #get obs valid after flagsandlod
    test = site_vals != -99999
    valid_hours_dup = np.sum(n_dup_array[test])
    n_obs_valid = len(site_vals[test]) - valid_hours_dup
    n_after_flagsandlod += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    converted_time = modules.date_process(date_con, time_con, start_year)
    converted_time = np.round(converted_time, decimals=5)
    syn_grid_time = np.arange(0, n_days, 1. / 24)
    syn_grid_time = np.round(syn_grid_time, decimals=5)
    raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    site_vals = np.array(site_vals)
    full_data_after_flagsandlod[raw_indices] = site_vals
    raw_st = np.copy(st_big)
    raw_mm = np.copy(mm_big)

    # test and remove duplicate and overlap points
    converted_time, site_vals, mm_big, st_big, n_dup_array = modules.remove_duplicate_points(
        site_ref, converted_time, site_vals, mm_big, st_big, n_dup_array,
        output_res)
    test = site_vals != -99999
    valid_hours_dup = np.sum(n_dup_array[test])
    n_obs_valid = int(len(site_vals[test]) - valid_hours_dup)
    print 'n obs valid = ', n_obs_valid
    n_after_duplicate += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    full_data[indices] = site_vals
    big_n_dup_array[indices] = n_dup_array

    #if species is CO then convert units from ppmv to ppbv
    if species == 'CO':
        valid_inds = full_data != -99999
        full_data[valid_inds] = full_data[valid_inds] * 1e3

    #if species is ISOP then convert units from ug/m3 to ppbv
    if species == 'ISOP':
        #calculate conversion factor from ug/m3 assuming 25 degC and 1 atm
        #conv_fact = R/MW * T(K) / (P(hPa)/10)
        conv_fact = 8.3144 / mol_mass * (273.15 + 25) / (1013.25 / 10)
        valid_inds = full_data != -99999
        full_data[valid_inds] = full_data[valid_inds] * conv_fact

    key_meta = [lat, lon, alt]

    #set site file resolution
    if species in ('O3', 'CO', 'NO', 'NO2'):
        file_res = 'H'
    else:
        # if no valid data then site res does not matter
        if len(site_resolutions) == 0:
            file_res = 'na'
        else:
            #if all site resolutions are the same then take the first file_res
            all_same = all(x == site_resolutions[0] for x in site_resolutions)
            if all_same == True:
                file_res = site_resolutions[0]
            else:
                #otherwise take highest frequency res as file_res
                if 'M' in site_resolutions:
                    file_res = 'M'
                elif 'D' in site_resolutions:
                    file_res = 'D'
                else:
                    file_res = 'H'

    #get sampling/instrument grids
    raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(
        site_ref, process_group, species, raw_st, raw_mm,
        full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list,
        unknown_mm_refs_list, no2_type)

    #do quality checks
    data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control(
        site_ref, species, file_res, no2_type, grid_dates, full_data,
        big_n_dup_array, valid_hours_dup, raw_st_grid, p_st_grid,
        p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid,
        p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta,
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod,
        n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata,
        inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution,
        inv_badmeasurementmethod, n_after_badmeasurementmethod)
    if data_valid == False:
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na'
        ]
        return c, ['na'], ['na'], ['na'], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros(1)

    #make tz int after checks
    data_tz = np.float32(data_tz)

    #set processed unit
    p_unit = 'ppbv'

    #get local timezone
    try:
        local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True)
        pytz_obj = pytz.timezone(local_tz_name)
        datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1))
        if datetime_offset < datetime.timedelta(0):
            local_tz = -(24 - int(datetime_offset.seconds / 60 / 60))
        else:
            local_tz = int(datetime_offset.seconds / 60 / 60)
    except:
        local_tz = 'na'
        print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref)
        unknown_local_tz_list.append(site_ref)

    #pack meta
    meta = [
        lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz,
        local_tz, site_name, country, contact
    ]

    #if blank strings in meta then convert to 'na'
    for i in range(len(meta)):
        try:
            if meta[i].strip() == '':
                meta[i] = 'na'
        except:
            pass

    print set(raw_st_grid)
    print set(raw_mm_grid)
    print set(p_st_grid)
    print set(p_mm_grid)
    print meta

    exit_c_list = np.array([
        inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
        inv_badmeasurementmethod
    ])
    n_c_list = np.array([
        n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
        n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
        n_after_badmeasurementmethod
    ])
    print 'exit counts = ', exit_c_list
    print 'n obs counts = ', n_c_list

    unknown_list = [
        unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
    ]

    return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array
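
Examples #2 and #3 both replicate a daily (or other multi-hour) sample across every hour of its sampling period with pd.date_range, flagging the replicated hours in an n_dups array so they can be discounted later. A minimal sketch of that expansion, using made-up start and end times:

import datetime
import numpy as np
import pandas as pd

s = datetime.datetime(2010, 6, 1, 0, 0)             #made-up sample start
e = s + datetime.timedelta(hours=24)                #sample end, i.e. a daily sample

#hours strictly between start and end receive duplicated copies of the sample value
day_dates = [d.strftime('%Y%m%d') for d in pd.date_range(s, e, freq='H')][1:-1]
day_hours = [d.strftime('%H%M') for d in pd.date_range(s, e, freq='H')][1:-1]

dates = ['20100601']
times = ['0000']
conc = np.array([31.2])
n_dups = [0]

dates = np.insert(dates, 1, day_dates)
times = np.insert(times, 1, day_hours)
conc = np.insert(conc, 1, [conc[0]] * len(day_dates))
n_dups = np.append(n_dups, [1] * len(day_dates))

print len(dates), len(conc), int(np.sum(n_dups))    #24 hourly entries, 23 flagged as duplicates
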
Code Example #4
def site_iter_process(valid_refs, c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    data_valid = True
    site_ref = valid_refs[c]
    print 'ref = ', site_ref, c
    site_test = all_refs == site_ref

    site_yyyymmdd = yyyymmdd[site_test]
    site_hhmm = hhmm[site_test]
    site_vals = vals[site_test]
    site_vals = np.array(site_vals)

    #add val to total obs count
    n_all += len(site_vals)

    #test if site_ref in meta_refs, if not then exit
    if site_ref not in meta_refs:
        inv_nometa += 1
        print 'Site Invalid. No Metadata for ref'
        if no2_type == 'MOLYBDENUM':
            n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_obs_after_anyvaliddata, inv_nokeymeta, n_obs_after_nokeymeta, inv_resolution, n_obs_after_resolution, inv_badmeasurementmethod, n_obs_after_badmeasurementmethod = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na',
            'na'
        ]
        exit_r = 'nometa'
        return c, ['na'], ['na'], [
            'na'
        ], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros(
            1)
    n_after_nometa += len(site_vals)

    #convert blank values to -99999
    test_inv = site_vals == ''
    site_vals[test_inv] = -99999

    #convert number invalids to -99999
    test_inv = site_vals < 0
    site_vals[test_inv] = -99999

    #create max possible grid
    full_data = np.empty(n_hours)
    full_data_after_flagsandlod = np.empty(n_hours)
    big_n_dup_array = np.zeros(n_hours)
    full_data[:] = -99999
    full_data_after_flagsandlod[:] = -99999

    #get meta
    meta_index = meta_refs.index(site_ref)
    data_tz = np.float32(meta_tz[meta_index])
    all_tz = [data_tz]
    try:
        lat = np.float32(meta_lats[meta_index])
    except:
        lat = 'na'
    try:
        lon = np.float32(meta_lons[meta_index])
    except:
        lon = 'na'
    try:
        alt = np.float32(meta_alts[meta_index])
    except:
        alt = 'na'
    unit = 'na'
    raw_class_name = meta_class[meta_index]
    site_name = meta_sitename[meta_index]
    country = 'United States'
    contact = '*****@*****.**'

    #adjust dates and times if tz is not equal to 0
    tz = int(data_tz)
    if tz != 0:
        for i in range(len(site_yyyymmdd)):
            #create datetime
            dt = datetime.datetime(int(site_yyyymmdd[i][:4]),
                                   int(site_yyyymmdd[i][4:6]),
                                   int(site_yyyymmdd[i][6:]),
                                   int(site_hhmm[i][:2]),
                                   int(site_hhmm[i][2:]))
            if tz > 0:
                dt = dt - datetime.timedelta(hours=int(tz))
            elif tz < 0:
                dt = dt + datetime.timedelta(hours=np.abs(int(tz)))
            site_yyyymmdd[i] = dt.strftime("%Y%m%d")
            site_hhmm[i] = dt.strftime("%H%M")

    #put vals into full grid
    date_con = np.array(site_yyyymmdd).astype(int)
    time_con = np.array(site_hhmm).astype(int)

    #remove data < 1970 and >= 2015
    test_inds = (date_con >= 19700101) & (date_con < 20150101)
    date_con = date_con[test_inds]
    time_con = time_con[test_inds]
    site_vals = site_vals[test_inds]

    #set st_big and mm_big
    st_big = ['continuous'] * len(site_vals)
    if species == 'O3':
        mm_big = ['ultraviolet photometry'] * len(site_vals)
    elif (species == 'NO'):
        mm_big = ['chemiluminescence'] * len(site_vals)
    elif (species == 'CO'):
        mm_big = ['non-dispersive infrared spectroscopy'] * len(site_vals)

    #get obs valid
    test = site_vals >= 0
    n_obs_valid = len(site_vals[test])
    n_after_flagsandlod += n_obs_valid
    print site_vals, n_after_flagsandlod

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    converted_time = modules.date_process(date_con, time_con, start_year)
    converted_time = np.round(converted_time, decimals=5)
    syn_grid_time = np.arange(0, n_days, 1. / 24)
    syn_grid_time = np.round(syn_grid_time, decimals=5)
    raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    site_vals = np.array(site_vals)
    full_data_after_flagsandlod[raw_indices] = site_vals
    raw_st = np.copy(st_big)
    raw_mm = np.copy(mm_big)

    # test and remove duplicate and overlap points
    converted_time, site_vals, mm_big, st_big, na = modules.remove_duplicate_points(
        site_ref, converted_time, site_vals, mm_big, st_big, 'blank',
        output_res)
    test = site_vals >= 0
    n_obs_valid = int(len(site_vals[test]))
    n_after_duplicate += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    full_data[indices] = site_vals

    key_meta = [lat, lon, alt]

    #set site file resolution
    file_res = 'H'

    #get sampling/instrument grids
    raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(
        site_ref, process_group, species, raw_st, raw_mm,
        full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list,
        unknown_mm_refs_list, no2_type)

    #do quality checks
    data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control(
        site_ref, species, file_res, no2_type, grid_dates, full_data,
        big_n_dup_array, 0, raw_st_grid, p_st_grid,
        p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid,
        p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta,
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod,
        n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata,
        inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution,
        inv_badmeasurementmethod, n_after_badmeasurementmethod)
    if data_valid == False:
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na'
        ]
        return c, ['na'], ['na'], [
            'na'
        ], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros(
            1)

    #make tz int after checks
    data_tz = np.float32(data_tz)

    #set processed unit
    p_unit = 'pbbv'

    #get local timezone
    try:
        local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True)
        pytz_obj = pytz.timezone(local_tz_name)
        datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1))
        if datetime_offset < datetime.timedelta(0):
            local_tz = -(24 - int(datetime_offset.seconds / 60 / 60))
        else:
            local_tz = int(datetime_offset.seconds / 60 / 60)
    except:
        local_tz = 'na'
        print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref)
        unknown_local_tz_list.append(site_ref)

    #pack meta
    meta = [
        lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz,
        local_tz, site_name, country, contact
    ]

    #if blank strings in meta then convert to 'na'
    for i in range(len(meta)):
        try:
            if meta[i].strip() == '':
                meta[i] = 'na'
        except:
            pass

    print set(raw_st_grid)
    print set(raw_mm_grid)
    print set(p_st_grid)
    print set(p_mm_grid)
    print meta

    exit_c_list = np.array([
        inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
        inv_badmeasurementmethod
    ])
    n_c_list = np.array([
        n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
        n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
        n_after_badmeasurementmethod
    ])
    print 'exit counts = ', exit_c_list
    print 'n obs counts = ', n_c_list

    unknown_list = [
        unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
    ]

    return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array
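
The gridding step in this and the following examples maps each observation onto a fixed hourly axis: timestamps are converted to fractional days since 1 January of start_year, rounded to 5 decimals to avoid float jitter, and matched to the synthetic grid with np.searchsorted. modules.date_process is not shown in these listings, so the sketch below uses a hypothetical stand-in with the same meaning to illustrate the indexing.

import datetime
import numpy as np

def days_since_start(start_year, dates, times):
    #fractional days since 1 Jan start_year for integer YYYYMMDD / HHMM arrays
    #(illustrative stand-in for modules.date_process, which is not shown here)
    t0 = datetime.datetime(start_year, 1, 1)
    out = []
    for d, t in zip(dates, times):
        d, t = int(d), int(t)
        dt = datetime.datetime(d // 10000, (d // 100) % 100, d % 100, t // 100, t % 100)
        out.append((dt - t0).total_seconds() / 86400.)
    return np.round(np.array(out), decimals=5)

#place two observations into a 2-day hourly grid, exactly as the readers above do
n_days = 2
syn_grid_time = np.round(np.arange(0, n_days, 1. / 24), decimals=5)
full_data = np.full(len(syn_grid_time), -99999.)
obs_time = days_since_start(2000, [20000101, 20000102], [100, 1300])
full_data[np.searchsorted(syn_grid_time, obs_time, side='left')] = [31.2, 28.7]
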
Code Example #5
def site_iter_process(valid_refs, c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    data_valid = True

    site_data = data[c]
    site_meta = site_data[0]
    file_res = resolutions[c]

    #get data and metadata
    try:
        lat = np.float32(site_meta['LATITUDE'])
    except:
        lat = 'na'
    try:
        lon = np.float32(site_meta['LONGITUDE'])
    except:
        lon = 'na'
    try:
        alt = np.float32(site_meta['ALTITUDE'])
    except:
        alt = 'na'
    land_use_class = site_meta['LAND_USE']
    if pd.isnull(land_use_class) == True:
        land_use_class = 'na'
    station_class = site_meta['STATION CATEGORY']
    if pd.isnull(station_class) == True:
        station_class = 'na'
    raw_class_name = land_use_class + ' ' + station_class
    mm = site_meta['MEASUREMENT METHOD']
    if pd.isnull(mm) == True:
        mm = ''
    country = site_meta['COUNTRY/TERRITORY']
    if pd.isnull(country) == True:
        country = 'na'
    site_name = site_meta['STATION NAME']
    if pd.isnull(site_name) == True:
        site_name = 'na'
    continuous_check = site_meta['MEASUREMENT AUTOMATIC']
    if pd.isnull(continuous_check) == True:
        continuous_check = 'na'
    unit = site_meta['MEASUREMENT UNIT']
    #integration_time = site_meta['TIME INTERVAL']
    tz = site_meta['TIME ZONE']
    contact = '*****@*****.**'
    #convert timezone from str to int
    tzd = {'UTC': 0, 'CET': 1, 'EET': 2}
    data_tz = tzd[tz]
    all_tz = [data_tz]

    if (file_res == 'hr') or (file_res == 'da'):
        var = np.array(site_data[1].values.tolist())
    elif file_res == 'mo':
        all_var = np.array(site_data[1].values.tolist())
        var = np.array(all_var[:, 1]).astype('float64')
        end_times = all_var[:, 0]
        end_date_con = [d[:4] + d[5:7] + d[8:10] for d in end_times]
        end_time_con = [d[11:13] + d[14:] for d in end_times]

    times = site_data[1].index
    date_con = [d.strftime('%Y%m%d') for d in times]
    time_con = [d.strftime('%H%M') for d in times]

    #get ref
    site_ref = valid_refs[c]
    site_group = group_codes[c]

    print 'ref == %s, %s' % (site_ref, c)
    print 'res = ', file_res

    #add var to total obs count
    n_all += len(var)
    n_after_nometa += len(var)

    #if file resolution is daily or monthly then replicate times after point, to fill hourly data array.
    count = 0
    if file_res == 'hr':
        n_dup_array = np.zeros(len(var))

    elif file_res == 'da':
        n_dup_array = []
        file_hours = len(date_con)
        for i in range(file_hours):
            current_hh = int(time_con[count][:2])
            current_mm = int(time_con[count][2:])
            s = datetime.datetime(year=start_year,
                                  month=1,
                                  day=1,
                                  hour=current_hh,
                                  minute=current_mm)
            e = datetime.datetime(year=start_year,
                                  month=1,
                                  day=2,
                                  hour=current_hh,
                                  minute=current_mm)
            day_hours = [
                d.strftime('%H%M') for d in pd.date_range(s, e, freq='H')
            ][1:-1]

            date_con = np.insert(date_con, count + 1, [date_con[count]] * 23)
            time_con = np.insert(time_con, count + 1, day_hours)
            var = np.insert(var, count + 1, [var[count]] * 23)

            #append to n duplicated array
            n_dup_array = np.append(n_dup_array, 0)
            n_dup_array = np.append(n_dup_array, [1] * 23)

            count += 24

    elif file_res == 'mo':
        n_dup_array = []
        file_hours = len(date_con)

        for i in range(file_hours):
            current_year = int(date_con[count][:4])
            current_month = int(date_con[count][4:6])
            current_day = int(date_con[count][6:])
            current_hour = int(time_con[count][:2])
            current_min = int(time_con[count][2:])

            next_year = int(end_date_con[i][:4])
            next_month = int(end_date_con[i][4:6])
            next_day = int(end_date_con[i][6:])
            next_hour = int(end_time_con[i][:2])
            next_min = int(end_time_con[i][2:])

            s = datetime.datetime(year=current_year,
                                  month=current_month,
                                  day=current_day,
                                  hour=current_hour,
                                  minute=current_min)
            e = datetime.datetime(year=next_year,
                                  month=next_month,
                                  day=next_day,
                                  hour=next_hour,
                                  minute=next_min)

            day_date = [
                d.strftime('%Y%m%d') for d in pd.date_range(s, e, freq='H')
            ][1:-1]
            day_hour = [
                d.strftime('%H%M') for d in pd.date_range(s, e, freq='H')
            ][1:-1]
            date_con = np.insert(date_con, count + 1, day_date)
            time_con = np.insert(time_con, count + 1, day_hour)
            var = np.insert(var, count + 1, [var[count]] * len(day_date))

            #append to n duplicated array
            n_dup_array = np.append(n_dup_array, 0)
            n_dup_array = np.append(n_dup_array, [1] * len(day_date))

            count += (len(day_date) + 1)

    date_con = np.array(date_con).astype(int)
    time_con = np.array(time_con).astype(int)

    #remove data < 1970 and >= 2015
    test_inds = (date_con >= 19700101) & (date_con < 20150101)
    date_con = date_con[test_inds]
    time_con = time_con[test_inds]
    var = var[test_inds]
    n_dup_array = n_dup_array[test_inds]

    #convert nans to -99999's
    nan_inds = np.isnan(var)
    var[nan_inds] = -99999

    if continuous_check == 'yes':
        st_big = ['continuous'] * len(var)
    else:
        st_big = ['filter'] * len(var)
    mm_big = [mm] * len(var)

    #get obs valid
    test = var >= 0
    valid_hours_dup = np.sum(n_dup_array[test])
    n_obs_valid = int(len(var[test]) - valid_hours_dup)
    n_after_flagsandlod += n_obs_valid

    #create max possible grid
    full_data = np.empty(len(grid_dates))
    full_data_after_flagsandlod = np.empty(n_hours)
    big_n_dup_array = np.zeros(n_hours)
    full_data[:] = -99999
    full_data_after_flagsandlod[:] = -99999

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    converted_time = modules.date_process(date_con, time_con, start_year)
    converted_time = np.round(converted_time, decimals=5)
    raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    var = np.array(var)
    full_data_after_flagsandlod[raw_indices] = var
    raw_st = np.copy(st_big)
    raw_mm = np.copy(mm_big)

    #test and remove duplicate and overlap points
    converted_time, var, mm_big, st_big, n_dup_array = modules.remove_duplicate_points(
        site_ref, converted_time, var, mm_big, st_big, n_dup_array, output_res)
    test = var >= 0
    valid_hours_dup = np.sum(n_dup_array[test])
    n_obs_valid = int(len(var[test]) - valid_hours_dup)
    n_after_duplicate += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    full_data[indices] = var
    big_n_dup_array[indices] = n_dup_array

    key_meta = [lat, lon, alt]

    #convert file res to standard format
    if file_res == 'hr':
        file_res = 'H'
    elif file_res == 'da':
        file_res = 'D'
    elif file_res == 'mo':
        file_res = 'M'

    #get sampling/instrument grids
    raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(
        site_ref, process_group, species, raw_st, raw_mm,
        full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list,
        unknown_mm_refs_list, no2_type)

    #do quality checks
    data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control(
        site_ref, species, file_res, no2_type, grid_dates, full_data,
        big_n_dup_array, valid_hours_dup, raw_st_grid, p_st_grid,
        p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid,
        p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta,
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod,
        n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata,
        inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution,
        inv_badmeasurementmethod, n_after_badmeasurementmethod)
    if data_valid == False:
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na'
        ]
        return c, ['na'], ['na'], [
            'na'
        ], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros(
            1)

    #set processed unit
    p_unit = 'pbbv'

    #get local timezone
    try:
        local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True)
        pytz_obj = pytz.timezone(local_tz_name)
        datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1))
        if datetime_offset < datetime.timedelta(0):
            local_tz = -(24 - int(datetime_offset.seconds / 60 / 60))
        else:
            local_tz = int(datetime_offset.seconds / 60 / 60)
    except:
        local_tz = 'na'
        print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref)
        unknown_local_tz_list.append(site_ref)

    #pack meta
    meta = [
        lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz,
        local_tz, site_name, country, contact
    ]

    #if blank strings in meta then convert to 'na'
    for i in range(len(meta)):
        try:
            if meta[i].strip() == '':
                meta[i] = 'na'
        except:
            pass

    print set(raw_st_grid)
    print set(raw_mm_grid)
    print set(p_st_grid)
    print set(p_mm_grid)
    print meta

    exit_c_list = np.array([
        inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
        inv_badmeasurementmethod
    ])
    n_c_list = np.array([
        n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
        n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
        n_after_badmeasurementmethod
    ])
    print 'exit counts = ', exit_c_list
    print 'n obs counts = ', n_c_list

    unknown_list = [
        unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
    ]

    return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array
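
When a file is daily or monthly, the example above copies each value into every hour it covers and records the copies in an n_dup array, so the replicated hours can later be subtracted back out of the valid-observation counts. Below is a compact sketch of that bookkeeping for the daily case; the variable names are illustrative only.

import numpy as np
import pandas as pd

daily_dates = ['20140101', '20140102']
daily_vals = np.array([30.5, -99999.])

hour_stamps, hour_vals, n_dup = [], [], []
for d, v in zip(daily_dates, daily_vals):
    day = pd.date_range(d, periods=24, freq='H')
    hour_stamps.extend(t.strftime('%Y%m%d%H%M') for t in day)
    #one real value plus 23 replicated hours
    hour_vals.extend([v] * 24)
    n_dup.extend([0] + [1] * 23)

hour_vals = np.array(hour_vals)
n_dup = np.array(n_dup)
#count valid observations without the replicated hours, as the n_after_* counters do above
test = hour_vals >= 0
n_obs_valid = int(len(hour_vals[test]) - np.sum(n_dup[test]))   #1
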
Code Example #6
def site_iter_process(valid_refs, c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    ref = valid_refs[c]
    print 'ref = ', ref, c

    #get site instrument for species
    met_i = file_refs.index(ref)
    file_name = met_refs[met_i]
    site_name = met_sitenames[met_i]
    print site_name
    site_species = list(met_species[met_i])
    print site_species
    site_instruments = list(met_instruments[met_i])
    m_method = site_instruments[site_species.index(species)]

    site_resolutions = []
    data_valid = True

    s_files = insensitive_glob(
        '/work/home/db876/observations/surface/%s/EANET/*%s.csv' %
        (fname_species, file_name))
    site_files = []
    for y in year_array:
        for f in s_files:
            if str(y)[-2:] in f:
                site_files.append(f)

    site_files = modules.natsorted(site_files)

    years = []
    months = []
    days = []
    hours = []

    vals = []
    yyyymmdd = []
    hhmm = []

    n_dup_array = []

    last_year_index = len(site_files)
    for y in year_array:
        got_year = False
        for file in site_files:
            last_file_split = file.split('/')[-1]
            if str(y)[2:] in last_file_split:
                got_year = True
                break
        if got_year == False:
            timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date(
                y, 1, 1)
            ndays_missing = timedelta_diff.days
            continue

        print file

        valid = True
        with open(file, 'rb') as f:
            reader = csv.reader(f, delimiter=',')
            counter = 0

            #get resolution
            for row in reader:
                if counter == 0:
                    all_units = row

                elif counter == 1:
                    file_res = 'H'

                    try:
                        hour_index = row.index('Hour')
                    except:
                        file_res = 'D'
                    try:
                        day_index = row.index('Day')
                    except:
                        file_res = 'M'
                    month_index = row.index('Month')
                    year_index = row.index('Year')

                    try:
                        spec_index = row.index(species.upper())
                        unit = all_units[spec_index]
                    except:
                        valid = False
                        break

                    #make sure each year's units are ppb
                    if unit != 'ppb':
                        print 'Units not ppb!'
                        raise ValueError('Units not ppb: %s' % unit)

                if counter == 2:
                    if file_res == 'H':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = row[day_index]
                        hh = row[hour_index]
                    elif file_res == 'D':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = row[day_index]
                        hh = 1
                    elif file_res == 'M':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = 1
                        hh = 1

                    start_datetime = datetime.datetime(int(yyyy), int(mm),
                                                       int(dd), int(hh))

                if counter == 3:
                    if file_res == 'H':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = row[day_index]
                        hh = row[hour_index]
                    elif file_res == 'D':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = row[day_index]
                        hh = 1
                    elif file_res == 'M':
                        yyyy = row[year_index]
                        mm = row[month_index]
                        dd = 1
                        hh = 1

                    present_datetime = datetime.datetime(
                        int(yyyy), int(mm), int(dd), int(hh))

                    time_delt = present_datetime - start_datetime
                    hour_delt = datetime.timedelta(hours=1)
                    day_delt = datetime.timedelta(hours=24)
                    week_delt = datetime.timedelta(hours=24 * 7)
                    month_delt = datetime.timedelta(hours=24 * 28)

                    print time_delt

                    if (time_delt < day_delt):
                        print 'Hourly Data'
                        file_res = 'H'
                        site_resolutions.append(file_res)

                    elif (time_delt > hour_delt) & (time_delt < week_delt):
                        print 'Daily Data'
                        file_res = 'D'
                        site_resolutions.append(file_res)

                    elif (time_delt > week_delt):
                        print 'Monthly Data'
                        file_res = 'M'
                        site_resolutions.append(file_res)

                counter += 1

        #READ IN DATA
        if valid == True:
            #limit to sites with hourly data files, if required
            if output_res == 'H':
                if file_res != 'H':
                    print 'Not processing as only want hourly files'
                    continue
            if output_res == 'HD':
                if file_res == 'M':
                    print 'Not processing as only want hourly and daily files'
                    continue
            with open(file, 'rb') as f:
                reader = csv.reader(f, delimiter=',')
                counter = 0
                val_count = 0
                for row in reader:

                    if counter >= 2:
                        yyyy = row[year_index]
                        mm = row[month_index]

                        #add to n_obs_all
                        n_all += 1
                        n_after_nometa += 1

                        if file_res == 'H':
                            try:
                                vals = np.append(vals,
                                                 np.float64(row[spec_index]))
                            except:
                                vals = np.append(vals, -99999)

                            current_datetime = present_datetime + relativedelta(
                                hours=val_count)
                            yyyymmdd.append(
                                current_datetime.strftime("%Y%m%d"))
                            hhmm.append(current_datetime.strftime("%H%M"))
                            n_dup_array = np.append(n_dup_array, 0)

                        elif file_res == 'D':
                            try:
                                vals = np.append(
                                    vals, [np.float64(row[spec_index])] * 24)
                            except:
                                vals = np.append(vals, [-99999] * 24)

                            current_datetime = present_datetime + relativedelta(
                                days=val_count)
                            next_datetime = present_datetime + relativedelta(
                                days=val_count + 1)
                            all_datetimes = pd.date_range(current_datetime,
                                                          next_datetime,
                                                          freq='H')[:-1]
                            for d in all_datetimes:
                                yyyymmdd.append(d.strftime("%Y%m%d"))
                                hhmm.append(d.strftime("%H%M"))
                            n_dup_array = np.append(n_dup_array, 0)
                            n_dup_array = np.append(n_dup_array, [1] * 23)

                        elif file_res == 'M':
                            month_days = monthrange(int(yyyy), int(mm))[1]
                            try:
                                vals = np.append(
                                    vals, [np.float64(row[spec_index])] *
                                    (month_days * 24))
                            except:
                                vals = np.append(vals,
                                                 [-99999] * (month_days * 24))

                            current_datetime = present_datetime + relativedelta(
                                months=int(mm) - 1)
                            next_datetime = present_datetime + relativedelta(
                                months=int(mm))
                            all_datetimes = pd.date_range(current_datetime,
                                                          next_datetime,
                                                          freq='H')[:-1]
                            for d in all_datetimes:
                                yyyymmdd.append(d.strftime("%Y%m%d"))
                                hhmm.append(d.strftime("%H%M"))
                            n_dup_array = np.append(n_dup_array, 0)
                            n_dup_array = np.append(n_dup_array, [1] *
                                                    ((month_days * 24) - 1))

                        val_count += 1
                    counter += 1

        else:
            print 'Species is not in file header. Skipping Year'
            timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date(
                y, 1, 1)
            ndays_missing = timedelta_diff.days
            print 'ndays missing = ', ndays_missing

    #if there is no data (e.g. the required time resolution was not available), exit
    if len(vals) == 0:
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_obs_after_anyvaliddata, inv_nokeymeta, n_obs_after_nokeymeta, inv_resolution, n_obs_after_resolution, inv_badmeasurementmethod, n_obs_after_badmeasurementmethod = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na',
            'na'
        ]
        return c, ['na'], ['na'], [
            'na'
        ], False, meta, exit_c_list, n_c_list, unknown_list, 'nothourly', np.zeros(
            0)

    #create max possible grid
    full_data = np.empty(n_hours)
    full_data_after_flagsandlod = np.empty(n_hours)
    big_n_dup_array = np.zeros(n_hours)
    full_data[:] = -99999
    full_data_after_flagsandlod[:] = -99999

    #convert blank values to -99999
    test_inv = vals == ''
    vals[test_inv] = -99999

    #convert number invalids to -99999
    test_inv = vals < 0
    vals[test_inv] = -99999

    #if all site resolutions are the same then take the first file_res
    all_same = all(x == site_resolutions[0] for x in site_resolutions)
    if all_same == True:
        file_res = site_resolutions[0]
    else:
        #otherwise take lowest frequency res as file_res
        if 'M' in site_resolutions:
            file_res = 'M'
        elif 'D' in site_resolutions:
            file_res = 'D'
        else:
            file_res = 'H'

    #get meta
    i_ref = file_refs.index(ref)
    site_ref = ref
    data_tz = np.float32(met_tz[i_ref])
    all_tz = [data_tz]
    lat = np.float32(met_lats[i_ref])
    lon = np.float32(met_lons[i_ref])
    alt = np.float32(met_alts[i_ref])
    raw_class_name = met_class[i_ref]
    country = met_country[i_ref]
    unit = str(unit)
    contact = 'Ayako Aoyagi, Asia Center for Air Pollution Research, [email protected]'

    #adjust dates and times if tz is not equal to 0
    tz = int(data_tz)
    if tz != 0:
        for i in range(len(yyyymmdd)):
            #create datetime
            dt = datetime.datetime(int(yyyymmdd[i][:4]), int(yyyymmdd[i][4:6]),
                                   int(yyyymmdd[i][6:]), int(hhmm[i][:2]),
                                   int(hhmm[i][2:]))
            if tz > 0:
                dt = dt - datetime.timedelta(hours=int(tz))
            elif tz < 0:
                dt = dt + datetime.timedelta(hours=np.abs(int(tz)))
            yyyymmdd[i] = dt.strftime("%Y%m%d")
            hhmm[i] = dt.strftime("%H%M")

    #put vals into full grid
    date_con = np.array(yyyymmdd).astype(int)
    time_con = np.array(hhmm).astype(int)

    #remove data < 1970 and >= 2015
    test_inds = (date_con >= 19700101) & (date_con < 20150101)
    date_con = date_con[test_inds]
    time_con = time_con[test_inds]
    vals = vals[test_inds]
    n_dup_array = n_dup_array[test_inds]

    #set st_big and mm_big
    st_big = ['continuous'] * len(vals)
    mm_big = [m_method] * len(vals)

    #get obs valid
    test = vals >= 0
    valid_hours_dup = np.sum(n_dup_array[test])
    n_obs_valid = int(len(vals[test]) - valid_hours_dup)
    n_after_flagsandlod += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    converted_time = modules.date_process(date_con, time_con, start_year)
    converted_time = np.round(converted_time, decimals=5)
    syn_grid_time = np.arange(0, n_days, 1. / 24)
    syn_grid_time = np.round(syn_grid_time, decimals=5)
    raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    vals = np.array(vals)
    full_data_after_flagsandlod[raw_indices] = vals
    raw_st = np.copy(st_big)
    raw_mm = np.copy(mm_big)

    # test and remove duplicate and overlap points
    converted_time, vals, mm_big, st_big, n_dup_array = modules.remove_duplicate_points(
        site_ref, converted_time, vals, mm_big, st_big, n_dup_array,
        output_res)
    test = vals >= 0
    valid_hours_dup = np.sum(n_dup_array[test])
    n_obs_valid = int(len(vals[test]) - valid_hours_dup)
    n_after_duplicate += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    full_data[indices] = vals
    big_n_dup_array[indices] = n_dup_array

    key_meta = [lat, lon, alt]

    #get sampling/instrument grids
    raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(
        site_ref, process_group, species, raw_st, raw_mm,
        full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list,
        unknown_mm_refs_list, no2_type)

    #do quality checks
    data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control(
        site_ref, species, file_res, no2_type, grid_dates, full_data,
        big_n_dup_array, valid_hours_dup, raw_st_grid, p_st_grid,
        p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid,
        p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta,
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod,
        n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata,
        inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution,
        inv_badmeasurementmethod, n_after_badmeasurementmethod)
    if data_valid == False:
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na'
        ]
        return c, ['na'], ['na'], [
            'na'
        ], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros(
            1)

    #make tz int after checks
    data_tz = np.float32(data_tz)

    #set processed unit
    p_unit = 'pbbv'

    #get local timezone
    try:
        local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True)
        pytz_obj = pytz.timezone(local_tz_name)
        datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1))
        if datetime_offset < datetime.timedelta(0):
            local_tz = -(24 - int(datetime_offset.seconds / 60 / 60))
        else:
            local_tz = int(datetime_offset.seconds / 60 / 60)
    except:
        local_tz = 'na'
        print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref)
        unknown_local_tz_list.append(site_ref)

    #pack meta
    meta = [
        lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz,
        local_tz, site_name, country, contact
    ]

    #if blank strings in meta then convert to 'na'
    for i in range(len(meta)):
        try:
            if meta[i].strip() == '':
                meta[i] = 'na'
        except:
            pass

    print set(raw_st_grid)
    print set(raw_mm_grid)
    print set(p_st_grid)
    print set(p_mm_grid)
    print meta

    exit_c_list = np.array([
        inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
        inv_badmeasurementmethod
    ])
    n_c_list = np.array([
        n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
        n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
        n_after_badmeasurementmethod
    ])
    print 'exit counts = ', exit_c_list
    print 'n obs counts = ', n_c_list

    unknown_list = [
        unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
    ]

    return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array
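
Example #6 cannot trust the EANET header alone, so it infers each file's native resolution from the gap between the first two records: under a day means hourly, under a week means daily, anything longer means monthly. A condensed sketch of that classification with the same thresholds:

import datetime

def classify_resolution(first, second):
    #return 'H', 'D' or 'M' from the spacing of the first two timestamps,
    #using the day/week thresholds of the EANET reader above
    gap = second - first
    if gap < datetime.timedelta(hours=24):
        return 'H'
    elif gap < datetime.timedelta(hours=24 * 7):
        return 'D'
    else:
        return 'M'

classify_resolution(datetime.datetime(2010, 1, 1, 1), datetime.datetime(2010, 1, 2, 1))   #'D'
classify_resolution(datetime.datetime(2010, 1, 1), datetime.datetime(2010, 2, 1))         #'M'
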
Code Example #7
def site_iter_process(valid_refs, c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    #process data for each site at a time
    site_ref = valid_refs[c]
    data_valid = True
    print 'ref = ', site_ref, c

    #get all files for ref
    all_files = glob.glob(
        '/work/home/db876/observations/surface/O3/SEARCH/%s*' % (site_ref))

    file_years = [i[-8:-4] for i in all_files]

    #sort files
    all_files = [x for (y, x) in sorted(zip(file_years, all_files))]

    dates = []
    times = []
    site_vals = []

    print all_files

    for f in all_files:
        print f
        if f[-3:] == 'xls':
            spec_str = species
            flag_str = '%s FL' % (species)
            date_str = 'DATE/TIME'
            all_data = get_data(f)
            all_data = all_data.values()
            headers = all_data[0][2]
            date_ind = headers.index(date_str)
            spec_ind = headers.index(spec_str)
            flag_ind = headers.index(flag_str)

            data_cut = all_data[0][3:]

            for i in range(len(data_cut)):
                row_cut = data_cut[i]
                if len(row_cut) < 30:
                    diff = 30 - len(row_cut)
                    for x in range(diff):
                        row_cut.append('')

                dates.append(row_cut[date_ind].strftime("%Y%m%d"))
                times.append(row_cut[date_ind].strftime("%H%M"))

                try:
                    val = np.float64(row_cut[spec_ind])
                except:
                    val = -99999

                if (row_cut[flag_ind] == 'I') or (row_cut[flag_ind]
                                                  == 'C') or (val < 0):
                    site_vals.append(-99999)
                else:
                    site_vals.append(val)

        elif f[-3:] == 'csv':
            date_str = 'Date/Time[LST]'
            spec_str = 'Average %s[ppb]' % (species)
            flag_str = 'Flags[%s]' % (species)
            mycsv = csv.reader(open(f), delimiter=',')
            start_read = 999999
            row_count = 0
            for row in mycsv:
                try:
                    if row[0] == date_str:
                        date_ind = 0
                        spec_ind = row.index(spec_str)
                        flag_ind = row.index(flag_str)
                        start_read = row_count + 1
                except:
                    pass

                if row_count >= start_read:
                    dates.append(
                        parser.parse(row[date_ind]).strftime("%Y%m%d"))
                    times.append(parser.parse(row[date_ind]).strftime("%H%M"))
                    #dates.append(row[date_ind][6:10]+row[date_ind][0:2]+row[date_ind][3:5])
                    #times.append(row[date_ind][11:13]+row[date_ind][14:])
                    if ('I' in row[flag_ind]) or ('C' in row[flag_ind]) or (
                            row[flag_ind]
                            == 'Null') or (np.float64(row[spec_ind]) < 0):
                        site_vals.append(-99999)
                    else:
                        site_vals.append(np.float64(row[spec_ind]))

                row_count += 1

    site_vals = np.array(site_vals)

    #adjust dates and times if tz is not equal to 0
    data_tz = tz_dict[site_ref]
    if data_tz != 0:
        for i in range(len(dates)):
            #create datetime
            dt = datetime.datetime(int(dates[i][:4]), int(dates[i][4:6]),
                                   int(dates[i][6:]), int(times[i][:2]),
                                   int(times[i][2:]))
            if data_tz > 0:
                dt = dt - datetime.timedelta(hours=int(data_tz))
            elif data_tz < 0:
                dt = dt + datetime.timedelta(hours=np.abs(int(data_tz)))
            dates[i] = dt.strftime("%Y%m%d")
            times[i] = dt.strftime("%H%M")

    #add val to total obs count
    n_all += len(site_vals)
    n_after_nometa += len(site_vals)

    #put vals into full grid
    date_con = np.array(dates).astype(int)
    time_con = np.array(times).astype(int)

    #remove data < 1970 and >= 2015
    test_inds = (date_con >= 19700101) & (date_con < 20150101)
    date_con = date_con[test_inds]
    time_con = time_con[test_inds]
    site_vals = site_vals[test_inds]

    #set st_big as 'continuous'
    st_big = ['continuous'] * len(site_vals)

    #set mm_big
    if species == 'O3':
        mm_big = ['ultraviolet photometry'] * len(site_vals)
    elif species == 'NO':
        mm_big = ['chemiluminescence'] * len(site_vals)
    elif species == 'NO2':
        mm_big = ['chemiluminescence (conversion-photolysis)'] * len(site_vals)
    elif species == 'CO':
        mm_big = ['non-dispersive infrared spectroscopy'] * len(site_vals)

    #get obs valid after flagsandlod
    test = site_vals >= 0
    n_obs_valid = len(site_vals[test])
    n_after_flagsandlod += n_obs_valid

    #create max possible grid
    full_data = np.empty(n_hours)
    full_data_after_flagsandlod = np.empty(n_hours)
    big_n_dup_array = np.zeros(n_hours)
    full_data[:] = -99999
    full_data_after_flagsandlod[:] = -99999

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    converted_time = modules.date_process(date_con, time_con, start_year)
    converted_time = np.round(converted_time, decimals=5)
    syn_grid_time = np.arange(0, n_days, 1. / 24)
    syn_grid_time = np.round(syn_grid_time, decimals=5)
    raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    site_vals = np.array(site_vals)
    full_data_after_flagsandlod[raw_indices] = site_vals
    raw_st = np.copy(st_big)
    raw_mm = np.copy(mm_big)

    #test and remove duplicate and overlap points
    converted_time, site_vals, mm_big, st_big, na = modules.remove_duplicate_points(
        site_ref, converted_time, site_vals, mm_big, st_big, 'blank',
        output_res)
    test = site_vals >= 0
    n_obs_valid = int(len(site_vals[test]))
    print 'n obs valid = ', n_obs_valid
    n_after_duplicate += n_obs_valid

    #find matching times between actual times and grid of times, return big array of indices of matched indices in grid
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    full_data[indices] = site_vals

    #get site meta
    lat = lat_dict[site_ref]
    lon = lon_dict[site_ref]
    alt = alt_dict[site_ref]
    unit = 'ppb'
    raw_class_name = raw_class_dict[site_ref]
    site_name = sitename_dict[site_ref]
    country = 'United States'
    contact = '*****@*****.**'
    all_tz = [data_tz]

    key_meta = [lat, lon, alt]

    #set site file resolution as hourly
    file_res = 'H'

    #get sampling/instrument grids
    raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(
        site_ref, process_group, species, raw_st, raw_mm,
        full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list,
        unknown_mm_refs_list, no2_type)

    #do quality checks
    data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control(
        site_ref, species, file_res, no2_type, grid_dates, full_data,
        big_n_dup_array, 0, raw_st_grid, p_st_grid,
        p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid,
        p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta,
        n_all, inv_nometa, n_after_nometa, n_after_flagsandlod,
        n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata,
        inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution,
        inv_badmeasurementmethod, n_after_badmeasurementmethod)
    if data_valid == False:
        exit_c_list = np.array([
            inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
            inv_badmeasurementmethod
        ])
        n_c_list = np.array([
            n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
            n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
            n_after_badmeasurementmethod
        ])
        unknown_list = [
            unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
        ]
        meta = [
            lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na'
        ]
        return c, ['na'], ['na'], [
            'na'
        ], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros(
            1)

    #set processed unit
    p_unit = 'pbbv'

    #get local timezone
    try:
        local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True)
        pytz_obj = pytz.timezone(local_tz_name)
        datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1))
        if datetime_offset < datetime.timedelta(0):
            local_tz = -(24 - int(datetime_offset.seconds / 60 / 60))
        else:
            local_tz = int(datetime_offset.seconds / 60 / 60)
    except:
        local_tz = 'na'
        print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref)
        unknown_local_tz_list.append(site_ref)

    #pack meta
    meta = [
        lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz,
        local_tz, site_name, country, contact
    ]

    #if blank strings in meta then convert to 'na'
    for i in range(len(meta)):
        try:
            if meta[i].strip() == '':
                meta[i] = 'na'
        except:
            pass

    print set(raw_st_grid)
    print set(raw_mm_grid)
    print set(p_st_grid)
    print set(p_mm_grid)
    print meta

    exit_c_list = np.array([
        inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution,
        inv_badmeasurementmethod
    ])
    n_c_list = np.array([
        n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate,
        n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution,
        n_after_badmeasurementmethod
    ])
    print 'exit counts = ', exit_c_list
    print 'n obs counts = ', n_c_list

    unknown_list = [
        unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list
    ]

    return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array
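
Both SEARCH file formats in example #7 reduce to one screening rule: keep a value only if it parses as a non-negative number and carries no 'I' (invalid) or 'C' (calibration) flag, otherwise store the -99999 fill value. A compact sketch of that screening follows; the flag letters are taken from the code above, and the helper name is illustrative.

import numpy as np

def screen_value(raw, flag):
    #return the float value, or -99999. if it is flagged or negative
    try:
        val = np.float64(raw)
    except (TypeError, ValueError):
        return -99999.
    if ('I' in str(flag)) or ('C' in str(flag)) or (str(flag) == 'Null') or (val < 0):
        return -99999.
    return val

screen_value('41.3', '')    #41.3
screen_value('12.0', 'I')   #-99999.0
screen_value('-3.1', '')    #-99999.0
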
Code Example #8
def site_iter_process(valid_refs,c):
    #set local counts
    inv_nometa = 0
    inv_anyvaliddata = 0
    inv_nokeymeta = 0
    inv_resolution = 0
    inv_badmeasurementmethod = 0
    n_all = 0
    n_after_nometa = 0
    n_after_flagsandlod = 0
    n_after_duplicate = 0
    n_after_anyvaliddata = 0
    n_after_nokeymeta = 0
    n_after_resolution = 0
    n_after_badmeasurementmethod = 0

    #set local unknown lists
    unknown_mm_list = []
    unknown_mm_refs_list = []
    unknown_local_tz_list = []

    data_valid = True
    
    site_ref = valid_refs[c]
    print 'ref = ',site_ref,c
    
    #read in site data from chunk
    site_yyyymmdd = a_site_yyyymmdd[c]
    site_hhmm = a_site_hhmm[c]
    site_vals = a_site_vals[c]
    mm_big = a_mm_big[c]
    site_units = a_site_units[c]
    site_res = a_site_res[c]
    n_dup_arr = a_n_dup_arr[c]
    lat = a_lat[c]
    lon = a_lon[c]
    alt = a_alt[c]
    unit = a_unit[c]
    raw_class_name = a_raw_class_name[c]
    site_name = a_site_name[c]
    no_meta = a_no_meta[c]
    country = 'United States'
    contact = '*****@*****.**'
    
    print '1'
    
    try:
        lat = np.float32(lat)
    except:
        pass
    try:
        lon = np.float32(lon)
    except:
        pass  
    try:
        alt = np.float32(alt)
    except:
        pass
        

#process data for each site at a time
#for site_ref in valid_refs:
    #site_ref = valid_refs[c]
    #site_test = all_refs == site_ref
    #site_yyyymmdd = yyyymmdd[site_test]
    #site_hhmm = hhmm[site_test]
    #site_vals = vals[site_test]
    #mm_big = all_mm[site_test]
    #site_units = all_units[site_test]
    #if species == 'ISOP':
    #    n_dup_arr = n_dup_array[site_test]
    #    site_res = site_resolutions[site_test]
    #else:
    #    n_dup_arr = np.zeros(len(site_vals))
    
    #convert to ppb
    if (species == 'O3') or (species == 'NO') or (species == 'NO2') or (species == 'CO'):
        for i in range(len(site_vals)):
            if site_units[i] == 'Parts per million':
                site_vals[i] = site_vals[i]*1.e3
            elif site_units[i] == 'Parts per billion':
                pass
            else:
                print site_units[i]
                raise ValueError('Unrecognised unit: %s'%(site_units[i]))
        
    # convert to ppb
    if species == 'ISOP':
        for i in range(len(site_vals)):
            #078 is Parts per billion Carbon, Isoprene has 5 Carbons
            if site_units[i] == 'Parts per billion Carbon':
                site_vals[i] = site_vals[i]/5.  
            #008 is Parts per billion
            elif site_units[i] == 'Parts per billion':
                pass
            #101 is Parts per million Carbon
            elif site_units[i] == 'Parts per million Carbon':
                site_vals[i] = (site_vals[i]/5.)*1.e3
            else:
                print site_units[i]
                raise ValueError('Unrecognised unit: %s'%(site_units[i]))
               
    #add val to total obs count
    valid_hours_dup = np.sum(n_dup_arr)
    n_all += len(site_vals) - valid_hours_dup
    
    #get site meta
    #try:
    #    meta_index = meta_refs.index(site_ref)
    #    try:
    #        lat = np.float32(meta_lats[meta_index])
    #    except:
    #        lat = 'na'
    #    try:
    #        lon =  np.float32(meta_lons[meta_index])
    #    except:
    #        lon = 'na'
    #    try:
    #        alt =  np.float32(meta_alts[meta_index])
    #    except:
    #        alt = 'na'
    #except:
    #    pass
    
    #get local timezone
    try:
        local_tz_name = tz_root.tzNameAt(lat,lon,forceTZ=True)
        pytz_obj = pytz.timezone(local_tz_name)
        datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000,1,1))
        if datetime_offset < datetime.timedelta(0):
            local_tz = -(24-int(datetime_offset.seconds/60/60))
        else:
            local_tz = int(datetime_offset.seconds/60/60)
    except:
        local_tz = 'na'
        print 'TIMEZONE NOT KNOWN, SITE IS %s'%(site_ref)
        unknown_local_tz_list.append(site_ref)
    #if species is ISOP set data_tz as local_tz
    if species == 'ISOP':
        data_tz = int(local_tz)
    else:
        data_tz = 0
    
    #exit if there is no metadata for the site
    #for ISOP, also exit if the local timezone could not be determined (data_tz is taken from it)
    
    if (no_meta == 'Yes') or (data_tz == 'na'):
        inv_nometa+=1
        print 'Site Invalid. No Metadata for ref'
        if no2_type == 'MOLYBDENUM':
            n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_obs_after_anyvaliddata,inv_nokeymeta,n_obs_after_nokeymeta,inv_resolution,n_obs_after_resolution,inv_badmeasurementmethod,n_obs_after_badmeasurementmethod = 0,0,0,0,0,0,0,0,0,0,0,0,0
        exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod])
        n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod])
        unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list]
        meta = ['na','na','na','na','na','na','na','na','na','na','na','na']
        exit_r = 'nometa'
        return c,['na'],['na'],['na'],False,meta,exit_c_list,n_c_list,unknown_list,exit_r,np.zeros(1)
    

    valid_hours_dup = np.sum(n_dup_arr)
    n_after_nometa += len(site_vals) - valid_hours_dup
        
    #adjust dates and times if tz is not equal to 0 (only for ISOP)
    #use local tz calc to adjust times to UTC
    if species == 'ISOP':
        tz = int(data_tz)
        if tz != 0:
            for i in range(len(site_yyyymmdd)):
                #create datetime
                dt = datetime.datetime(int(site_yyyymmdd[i][:4]),int(site_yyyymmdd[i][4:6]),int(site_yyyymmdd[i][6:]),int(site_hhmm[i][:2]),int(site_hhmm[i][2:]))
                if tz > 0:
                    dt  = dt - datetime.timedelta(hours = int(tz))
                elif tz < 0:
                    dt  = dt + datetime.timedelta(hours = np.abs(int(tz)))
                site_yyyymmdd[i] = dt.strftime("%Y%m%d")
                site_hhmm[i] = dt.strftime("%H%M")
 
    #put vals into full grid
    date_con = np.array(site_yyyymmdd).astype(int)
    time_con = np.array(site_hhmm).astype(int)
    
    #remove data < 1970 and >= 2015
    test_inds = (date_con >= 19700101) & (date_con < 20150101)
    date_con = date_con[test_inds]
    time_con = time_con[test_inds]
    site_vals = site_vals[test_inds]
    mm_big = mm_big[test_inds]
    n_dup_arr = n_dup_arr[test_inds]
    
    #set st_big as 'continuous'
    st_big = ['continuous']*len(site_vals)
    
    #get obs valid
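    #(n_after_flagsandlod presumably counts obs surviving the quality-flag / limit-of-detection screen, i.e. vals >= 0)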
    test = site_vals >= 0
    valid_hours_dup = np.sum(n_dup_arr[test])
    n_obs_valid = len(site_vals[test]) - valid_hours_dup
    n_after_flagsandlod += n_obs_valid
    
    #create max possible grid
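    #each hourly slot is initialised to -99999, the missing-data sentinel used throughout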
    full_data = np.empty(n_hours)
    full_data_after_flagsandlod = np.empty(n_hours)
    big_n_dup_array = np.zeros(n_hours)
    full_data[:] = -99999
    full_data_after_flagsandlod[:] = -99999
    
    #find where each observation time falls in the synthetic hourly time grid (indices into the grid)
    converted_time = modules.date_process(date_con,time_con,start_year)
    converted_time = np.round(converted_time,decimals=5)
    syn_grid_time = np.arange(0,n_days,1./24)
    syn_grid_time = np.round(syn_grid_time,decimals=5)
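    #rounding both time axes to 5 decimal places avoids floating point mismatches in the searchsorted lookup below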
    raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    site_vals = np.array(site_vals)
    full_data_after_flagsandlod[raw_indices] = site_vals
    raw_st = np.copy(st_big)
    raw_mm = np.copy(mm_big)
    
    #test and remove duplicate and overlap points
    converted_time,site_vals,mm_big,st_big,n_dup_arr = modules.remove_duplicate_points(site_ref,converted_time,site_vals,mm_big,st_big,n_dup_arr,output_res)
    test = site_vals >= 0
    valid_hours_dup = np.sum(n_dup_arr[test])
    n_obs_valid = int(len(site_vals[test]) - valid_hours_dup)
    n_after_duplicate += n_obs_valid
    
    #re-map the de-duplicated observation times onto the synthetic hourly time grid
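    #big_n_dup_array records the per-point duplicate counts (n_dup_arr) at their matched positions in the hourly grid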
    indices = np.searchsorted(syn_grid_time, converted_time, side='left')
    full_data[indices] = site_vals 
    big_n_dup_array[indices] = n_dup_arr
    
    all_tz = [data_tz]
    
    key_meta = [lat,lon,alt]
    
    #set site file resolution 
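    #file_res codes: 'H' (hourly), 'D' (daily), 'M' (monthly)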
    if species != 'ISOP':
        file_res = 'H'
    else:
        #if all site resolutions are the same then take the first as file_res
        all_same = all(x == site_res[0] for x in site_res)
        if all_same == True:
            file_res = site_res[0]
        else:
            #otherwise take the coarsest resolution present as file_res (monthly, then daily, then hourly)
            if 'M' in site_res:
                file_res = 'M'
            elif 'D' in site_res:
                file_res = 'D'
            else:
                file_res = 'H'
    
    #get sampling/instrument grids
    raw_st_grid,p_st_grid,p_st_grid_after_flagsandlod,raw_mm_grid,p_mm_grid,p_mm_grid_after_flagsandlod,unknown_mm_list,unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(site_ref,process_group,species,raw_st,raw_mm,full_data_after_flagsandlod,full_data,raw_indices,unknown_mm_list,unknown_mm_refs_list,no2_type)

    print set(p_mm_grid)

    #do quality checks
    data_valid,full_data,valid_hours_dup,p_st_grid,p_mm_grid,n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_after_anyvaliddata,inv_nokeymeta,n_after_nokeymeta,inv_resolution,n_after_resolution,inv_badmeasurementmethod,n_after_badmeasurementmethod,exit_r = modules.primary_quality_control(site_ref,species,file_res,no2_type,grid_dates,full_data,big_n_dup_array,valid_hours_dup,raw_st_grid,p_st_grid,p_st_grid_after_flagsandlod,raw_mm_grid,p_mm_grid,p_mm_grid_after_flagsandlod,data_resolution,n_obs_valid,key_meta,n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_after_anyvaliddata,inv_nokeymeta,n_after_nokeymeta,inv_resolution,n_after_resolution,inv_badmeasurementmethod,n_after_badmeasurementmethod)
    if data_valid == False:
        exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod])
        n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod])
        unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list]
        meta = [lat,lon,alt,'na','na','na','na','na','na','na','na','na']
        return c,['na'],['na'],['na'],False,meta,exit_c_list,n_c_list,unknown_list,exit_r,np.zeros(1)

    #set processed unit
    p_unit = 'ppbv'

    #pack meta
    meta = [lat,lon,alt,raw_class_name,file_res,unit,p_unit,data_tz,local_tz,site_name,country,contact]
    
    #if blank strings in meta then convert to 'na'
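    #(non-string entries raise AttributeError on .strip() and are left unchanged by the except)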
    for i in range(len(meta)):
        try:
            if meta[i].strip() == '':
                meta[i] = 'na'
        except:
            pass
    
    print set(raw_st_grid)
    print set(raw_mm_grid)
    print set(p_st_grid)
    print set(p_mm_grid)
    print meta
    
    
    exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod])
    n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod])
    print 'exit counts = ', exit_c_list
    print 'n obs counts = ', n_c_list

    unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list]

    return c,full_data,p_st_grid,p_mm_grid,data_valid,meta,exit_c_list,n_c_list,unknown_list,'na',big_n_dup_array