def _stdev_clim_limit(stdev, hard_limit):
    '''
    Work out the anomaly threshold for the stdev-based climatology check.

    KW NEW climatology check that uses the simultaneous climatological stdev
    (of all obs in pentad climatology) to provide a threshold for outlier
    detection. The stdev is clamped to the range 1-4 deg before being
    multiplied so that very small stdevs do not reject tiny anomalies and
    very large stdevs do not let ridiculously large anomalies through. With
    the default multiplier of 4.5 the threshold therefore lies between
    4.5 deg and 18 deg. (1 stdev ~68.2%, 2 stdev ~95.4%, 3 stdev ~99.7%,
    4.5 stdev >99.9% of obs retained, assuming normality.)

    :param stdev: climatological standard deviation for the ob; may be a
                  missing-data value, which qc.value_check detects
    :param hard_limit: multiplier to use in place of the default 4.5, or None
    :return: threshold (deg) to pass to qc.climatology_check
    '''
    if qc.value_check(stdev) != 0:
        # no stdev found - fall back on the old fixed threshold
        return 10.0

    # KW check for HardLimit or set to default of 4.5
    multiplier = 4.5 if hard_limit is None else hard_limit

    # clamp stdev to [1, 4]: min 1 (4.5 deg) and max 4 (18 deg) so we
    # don't cut off too abruptly at either end
    return multiplier * min(max(stdev, 1.0), 4.0)


def base_qc_report(rep, HardLimit):
    '''
    Take a marine report and do some base qc on it.

    :param rep: marine report whose QC flags are set in place (and which is
                also returned for convenience)
    :param HardLimit: either a float value or None - this is a given maximum
                      multiplier for the stdev-based climatology test
                      (defaults to 4.5 when None)
    :return: the report with its base QC flags set
    '''
    # Basic positional QC
    rep.set_qc('POS', 'pos',
               qc.position_check(rep.getvar('LAT'), rep.getvar('LON')))
    rep.set_qc('POS', 'date',
               qc.date_check(rep.getvar('YR'), rep.getvar('MO'),
                             rep.getvar('DY'), rep.getvar('HR')))

    # KW Test for day 1=day, 0=night - only run when both position and date
    # passed QC; otherwise default the flag to 1 (day)
    if rep.get_qc('POS', 'pos') == 0 and rep.get_qc('POS', 'date') == 0:
        rep.set_qc('POS', 'day',
                   qc.day_test(rep.getvar('YR'), rep.getvar('MO'),
                               rep.getvar('DY'), rep.getvar('HR'),
                               rep.getvar('LAT'), rep.getvar('LON')))
    else:
        rep.set_qc('POS', 'day', 1)

    rep.set_qc('POS', 'blklst',
               qc.blacklist(rep.getvar('ID'), rep.getvar('DCK'),
                            rep.getvar('YR'), rep.getvar('LAT'),
                            rep.getvar('LON')))

    # SST base QC
    # KW Could noval = 0 be a value that is present in IMMA but actually a
    # missing data indicator e.g. -99.9 or 99.9?
    rep.set_qc('SST', 'noval', qc.value_check(rep.getvar('SST')))
    rep.set_qc('SST', 'freez', qc.sst_freeze_check(rep.getvar('SST'), 0.0))
    rep.set_qc('SST', 'clim',
               qc.climatology_check(rep.getvar('SST'), rep.getnorm('SST'),
                                    8.0))
    rep.set_qc('SST', 'nonorm', qc.no_normal_check(rep.getnorm('SST')))

    # MAT base QC
    # KW Could noval = 0 be a value that is present in IMMA but actually a
    # missing data indicator e.g. -99.9 or 99.9?
    rep.set_qc('AT', 'noval', qc.value_check(rep.getvar('AT')))
    # KW new clim test uses multiplier*stdev (clamped) as the threshold in
    # place of the old fixed 10 deg - see _stdev_clim_limit
    rep.set_qc('AT', 'clim',
               qc.climatology_check(rep.getvar('AT'), rep.getnorm('AT'),
                                    _stdev_clim_limit(rep.getstdev('AT'),
                                                      HardLimit)))
    rep.set_qc('AT', 'nonorm', qc.no_normal_check(rep.getnorm('AT')))

    # KW Added QC for DPT
    # DPT base QC - same stdev-based clim threshold as AT
    rep.set_qc('DPT', 'noval', qc.value_check(rep.getvar('DPT')))
    rep.set_qc('DPT', 'clim',
               qc.climatology_check(rep.getvar('DPT'), rep.getnorm('DPT'),
                                    _stdev_clim_limit(rep.getstdev('DPT'),
                                                      HardLimit)))
    rep.set_qc('DPT', 'nonorm', qc.no_normal_check(rep.getnorm('DPT')))

    # KW New QC test specifically for humidity: supersaturation check
    # comparing DPT against AT
    rep.set_qc('DPT', 'ssat',
               qc.supersat_check(rep.getvar('DPT'), rep.getvar('AT')))

    return rep
def main(argv): ''' This is the program that runs the base QC on data in the data base (created by Make_DB.py. The checks are the simpler checks, which can be performed on an observation-by-observation basis. ''' print '###############' print 'Running base_qc' print '###############' inputfile = 'configuration.txt' month1 = 1 month2 = 12 try: opts, args = getopt.getopt(argv,"hi:", ["ifile=", "year1=", "year2=", "month1=", "month2="]) except getopt.GetoptError: print 'Usage Make_DB.py -i <configuration_file> '+\ '--year1 <start year> --year2 <end year>'+\ '--month1 <start month> --month2 <end month>' sys.exit(2) inputfile, year1, year2, month1, month2 = qc.get_arguments(opts) print 'Input file is ', inputfile print 'Running from ',year1,' to ',year2 print 'Running from ',month1,' to ',month2 print '' config = qc.get_config(inputfile) data_base_host = config['data_base_host'] data_base_name = config['data_base_name'] print 'Data base host =', data_base_host print 'Data base name =', data_base_name print '' #connect to data base connection = MySQLdb.connect(host=data_base_host, user='******', db=data_base_name) for years,months in qc.year_month_gen(year1, month1, year2, month2): print '\nRunning Base QC for',years,months cursor = connection.cursor() cursor2 = connection.cursor() syr = str(years) '''set up a QC filter and use it to extract obs from the database direct into MarineReport format''' filter = db.Quality_Control_Filter() filter.year = years filter.month = months t0 = time.time() reps = db.get_marine_report_from_db(cursor,years,filter) t1 = time.time() total_time = t1-t0 print "read",total_time '''For each report, do all the basic QC checks then update the QC flags in the data base''' for rep in reps: rep.bad_position = qc.position_check(rep.lat, rep.lon) rep.bad_date = qc.date_check(rep.year, rep.month, rep.day, rep.hour) if rep.bad_position == 0 and rep.bad_date == 0: rep.day_check = qc.day_test(rep.year,rep.month,rep.day,rep.hour,rep.lat,rep.lon) else: 
rep.day_check = 1 rep.no_sst = qc.value_check(rep.sst) rep.sst_below_freezing = qc.sst_freeze_check(rep.sst, 0.0) rep.sst_climatology_fail = qc.climatology_check(rep.sst,rep.sst_norm,8.0) rep.no_sst_normal = qc.no_normal_check(rep.sst_norm) rep.no_mat = qc.value_check(rep.mat) rep.mat_climatology_fail = qc.climatology_check(rep.mat,rep.mat_norm,10.0) rep.no_mat_normal = qc.no_normal_check(rep.mat_norm) rep.blacklist = qc.blacklist(rep.id, rep.dck, rep.year, rep.lat, rep.lon) t15 = time.time() print "qcd",t15-t1 for rep in reps: result = db.update_db_basic_qc_flags(rep,years,cursor2) t2 = time.time() print "added to db",t2-t15 '''Commit the changes then print a summary''' connection.commit() #db.report_qc_counts(cursor,years,months) t3 = time.time() print "commited",t3-t2 connection.close() print "All Done :)"
def main(argv): ''' This program builds the marine data base which will be used to store the subset of ICOADS used in QC and other data processing. The current version reads in IMMA1 data from ICOADS.2.5.1 and the UID is used as the primary key for the data base so that it can be easily matched to individual obs if need be. The first step of the process is to read in the SST and MAT climatologies from file. These are 1degree latitude by 1 degree longitude by 73 pentad fields in NetCDF format. The data are read into numpy arrays. Next a connection is made to the data base, which may or may not already exist. If it does not exist, a database will be created. The program then loops over all years and months and DROPs existing tables for each year if they already exist and then recreates them. It then loops over all months in the year, opens the appropriate IMMA file and reads in the data one observation at a time. ''' print '######################' print 'Running Make_and_qc_DB' print '######################' inputfile = 'configuration.txt' month1 = 1 month2 = 12 try: opts, args = getopt.getopt(argv, "hi:", ["ifile=", "year1=", "year2=", "month1=", "month2="]) except getopt.GetoptError: print 'Usage Make_DB.py -i <configuration_file> '+\ '--year1 <start year> --year2 <end year> '+\ '--month1 <start month> --month2 <end month>' sys.exit(2) inputfile, year1, year2, month1, month2 = qc.get_arguments(opts) print 'Input file is ', inputfile print 'Running from ', year1, ' to ', year2 print '' config = qc.get_config(inputfile) sst_climatology_file = config['SST_climatology'] nmat_climatology_file = config['MAT_climatology'] data_base_host = config['data_base_host'] data_base_name = config['data_base_name'] icoads_dir = config['ICOADS_dir'] bad_id_file = config['IDs_to_exclude'] print 'SST climatology =', sst_climatology_file print 'NMAT climatology =', nmat_climatology_file print 'Data base host =', data_base_host print 'Data base name =', data_base_name print 'ICOADS 
directory =', icoads_dir print 'List of bad IDs =', bad_id_file print '' idfile = open(bad_id_file, 'r') ids_to_exclude = [] for line in idfile: line = line.rstrip() while len(line) < 9: line = line+' ' if line != ' ': ids_to_exclude.append(line) idfile.close() #read in climatology files climatology = Dataset(sst_climatology_file) climsst = climatology.variables['sst'][:] climatology = Dataset(nmat_climatology_file) climnmat = climatology.variables['nmat'][:] print 'Read climatology files' #connect to database connection = MySQLdb.connect(host=data_base_host, user='******', db=data_base_name) cursor = connection.cursor() t00 = time.time() for year in range(year1, year2+1): db.make_tables_for_year(cursor, year) db.make_additional_qc_table_for_year(cursor, year) # db.disable_keys(cursor, year) connection.commit() for month in range(1, 13): t0 = time.time() print year, month syr = str(year) smn = "%02d" % (month,) filename = icoads_dir+'/R2.5.1.'+syr+'.'+smn+'.gz' if year > 2007: filename = icoads_dir+'/R2.5.2.'+syr+'.'+smn+'.gz' icoads_file = gzip.open(filename,"r") rec = IMMA() count = 0 reps = [] while rec.read(icoads_file): if not(rec.data['ID'] in ids_to_exclude): try: rep = qc.imma1_record_to_marine_rep(rec, climsst, climnmat) rep.bad_position = qc.position_check(rep.lat, rep.lon) rep.bad_date = qc.date_check(rep.year, rep.month, rep.day, rep.hour) if rep.bad_position == 0 and rep.bad_date == 0: rep.day_check = qc.day_test(rep.year,rep.month,rep.day,rep.hour,rep.lat,rep.lon) else: rep.day_check = 1 rep.no_sst = qc.value_check(rep.sst) rep.sst_below_freezing = qc.sst_freeze_check(rep.sst, 0.0) rep.sst_climatology_fail = qc.climatology_check(rep.sst,rep.sst_norm,8.0) rep.no_sst_normal = qc.no_normal_check(rep.sst_norm) rep.no_mat = qc.value_check(rep.mat) rep.mat_climatology_fail = qc.climatology_check(rep.mat,rep.mat_norm,10.0) rep.no_mat_normal = qc.no_normal_check(rep.mat_norm) rep.blacklist = qc.blacklist(rep.id, rep.dck, rep.year, rep.lat, rep.lon) inyear = 
rep.year if year == rep.year: reps.append(rep) if len(reps) == 1000: db.add_multiple_marine_reports_to_db(cursor,year,reps) reps = [] # db.add_marine_report_to_db(cursor,inyear,rep) count += 1 except: assert False, "Failed to add records to database" #catch the as yet unadded obs db.add_multiple_marine_reports_to_db(cursor,year,reps) icoads_file.close() t1 = time.time() print count," obs ingested. ",t1-t0 #commit once per month connection.commit() # db.enable_keys(cursor, year) t11 = time.time() print year," done in ",t11-t00 #close the connection to the data base connection.close()