def main(argv): ''' This is the program that runs the base QC on data in the data base (created by Make_DB.py. The checks are the simpler checks, which can be performed on an observation-by-observation basis. ''' print '###############' print 'Running base_qc' print '###############' inputfile = 'configuration.txt' month1 = 1 month2 = 12 try: opts, args = getopt.getopt(argv,"hi:", ["ifile=", "year1=", "year2=", "month1=", "month2="]) except getopt.GetoptError: print 'Usage Make_DB.py -i <configuration_file> '+\ '--year1 <start year> --year2 <end year>'+\ '--month1 <start month> --month2 <end month>' sys.exit(2) inputfile, year1, year2, month1, month2 = qc.get_arguments(opts) print 'Input file is ', inputfile print 'Running from ',year1,' to ',year2 print 'Running from ',month1,' to ',month2 print '' config = qc.get_config(inputfile) data_base_host = config['data_base_host'] data_base_name = config['data_base_name'] print 'Data base host =', data_base_host print 'Data base name =', data_base_name print '' #connect to data base connection = MySQLdb.connect(host=data_base_host, user='******', db=data_base_name) for years,months in qc.year_month_gen(year1, month1, year2, month2): print '\nRunning Base QC for',years,months cursor = connection.cursor() cursor2 = connection.cursor() syr = str(years) '''set up a QC filter and use it to extract obs from the database direct into MarineReport format''' filter = db.Quality_Control_Filter() filter.year = years filter.month = months t0 = time.time() reps = db.get_marine_report_from_db(cursor,years,filter) t1 = time.time() total_time = t1-t0 print "read",total_time '''For each report, do all the basic QC checks then update the QC flags in the data base''' for rep in reps: rep.bad_position = qc.position_check(rep.lat, rep.lon) rep.bad_date = qc.date_check(rep.year, rep.month, rep.day, rep.hour) if rep.bad_position == 0 and rep.bad_date == 0: rep.day_check = qc.day_test(rep.year,rep.month,rep.day,rep.hour,rep.lat,rep.lon) else: 
rep.day_check = 1 rep.no_sst = qc.value_check(rep.sst) rep.sst_below_freezing = qc.sst_freeze_check(rep.sst, 0.0) rep.sst_climatology_fail = qc.climatology_check(rep.sst,rep.sst_norm,8.0) rep.no_sst_normal = qc.no_normal_check(rep.sst_norm) rep.no_mat = qc.value_check(rep.mat) rep.mat_climatology_fail = qc.climatology_check(rep.mat,rep.mat_norm,10.0) rep.no_mat_normal = qc.no_normal_check(rep.mat_norm) rep.blacklist = qc.blacklist(rep.id, rep.dck, rep.year, rep.lat, rep.lon) t15 = time.time() print "qcd",t15-t1 for rep in reps: result = db.update_db_basic_qc_flags(rep,years,cursor2) t2 = time.time() print "added to db",t2-t15 '''Commit the changes then print a summary''' connection.commit() #db.report_qc_counts(cursor,years,months) t3 = time.time() print "commited",t3-t2 connection.close() print "All Done :)"
def main(argv): ''' The buddy check compares observations to other nearby observations. If the observation differs substantially from the neighbour-average, the observation will be rejected. ''' print '###################' print 'Running bayesian_buddy_check' print '###################' inputfile = 'configuration.txt' month1 = 1 month2 = 12 try: opts, args = getopt.getopt(argv, "hi:", ["ifile=", "year1=", "year2=", "month1=", "month2="]) except getopt.GetoptError: print 'Usage Make_DB.py -i <configuration_file> '+\ '--year1 <start year> --year2 <end year> '+\ '--month1 <start month> --month2 <end month>' sys.exit(2) inputfile, year1, year2, month1, month2 = qc.get_arguments(opts) print 'Input file is ', inputfile print 'Running from year ', year1, ' to ', year2 print 'Running from month ', month1, ' to ', month2 print '' config = qc.get_config(inputfile) sst_climatology_file = config['SST_climatology'] nmat_climatology_file = config['MAT_climatology'] data_base_host = config['data_base_host'] data_base_name = config['data_base_name'] icoads_dir = config['ICOADS_dir'] sst_stdev_climatology_file = config['SST_stdev_climatology'] sst_stdev_1_file = config['SST_buddy_one_box_to_buddy_avg'] sst_stdev_2_file = config['SST_buddy_one_ob_to_box_avg'] sst_stdev_3_file = config['SST_buddy_avg_sampling'] print 'SST climatology =', sst_climatology_file print 'NMAT climatology =', nmat_climatology_file print 'Data base host =', data_base_host print 'Data base name =', data_base_name print 'ICOADS directory =', icoads_dir print '' #read in the pentad climatology of standard deviations climatology = Dataset(sst_stdev_climatology_file) sst_pentad_stdev = climatology.variables['sst'][:] climatology = Dataset(sst_stdev_1_file) sst_stdev_1 = climatology.variables['sst'][:] climatology = Dataset(sst_stdev_2_file) sst_stdev_2 = climatology.variables['sst'][:] climatology = Dataset(sst_stdev_3_file) sst_stdev_3 = climatology.variables['sst'][:] connection = 
MySQLdb.connect(host=data_base_host, user='******', db=data_base_name) cursor = connection.cursor() #read cursor2 = connection.cursor() #write for years, months in qc.year_month_gen(year1, month1, year2, month2): #want to get a month either side of the #target month, which may be in different years last_year, last_month = qc.last_month_was(years, months) next_year, next_month = qc.next_month_is(years, months) print years, months first_year = min([last_year, years, next_year]) final_year = max([last_year, years, next_year]) if first_year < 1850: first_year = 1850 if final_year > 2014: final_year = 2014 #first and last julian days are +- approximately one month month_lengths = qc.month_lengths(years) jul1 = qc.jul_day(years, months, 1)-25 jul2 = qc.jul_day(years, months, month_lengths[months-1])+25 for check_variable in ['SST','MAT']: t0 = time.time() reps = [] for yyy in range(first_year, final_year+1): qcfilter = db.Quality_Control_Filter() qcfilter.jul1 = jul1 qcfilter.jul2 = jul2 qcfilter.set_multiple_qc_flags_to_pass(['bad_position', 'bad_date', 'blacklist', 'new_track_check', 'fewsome_check']) if check_variable == 'SST': qcfilter.set_multiple_qc_flags_to_pass(['no_sst', 'sst_below_freezing', 'no_sst_normal', 'sst_climatology_fail']) elif check_variable == 'MAT': qcfilter.set_multiple_qc_flags_to_pass(['no_mat', 'no_mat_normal', 'mat_climatology_fail']) else: print "no such type ", check_variable assert False sql_request = db.build_sql_query(yyy, qcfilter) cursor.execute(sql_request) numrows = cursor.rowcount for i in range(numrows): rows = cursor.fetchone() rep = qc.MarineReport.report_from_array(rows) reps.append(rep) t1 = time.time() print len(reps)," observations read in", t1-t0 #Do the buddy check if check_variable == 'SST': qcs = qc_buddy_check.bayesian_buddy_check(reps, sst_stdev_1, sst_stdev_2, sst_stdev_3, 'SST') elif check_variable == 'MAT': qcs = qc_buddy_check.bayesian_buddy_check(reps, sst_stdev_1, sst_stdev_2, sst_stdev_3, 'MAT') else: print "no 
such type ", check_variable assert False t2 = time.time() print "done check ",t2-t1 #put updated QC flags into data base for rep in reps: if rep.month == months: if check_variable == 'SST': result = db.update_db_qc_single_flag(rep, rep.bayesian_sst_buddy_check, 'extra_qc', 'bayesian_sst_buddy_check', years, cursor2) elif check_variable == 'MAT': result = db.update_db_qc_single_flag(rep, rep.bayesian_mat_buddy_check, 'extra_qc', 'bayesian_mat_buddy_check', years, cursor2) else: print "no such type ", check_variable assert False t3 = time.time() print "Of "+str(len(qcs))+" observations "+\ str(np.sum(qcs))+" failed "+check_variable+\ " buddy check",t3-t2 connection.commit() #Each month db.report_qc_counts(cursor, years, months) connection.close() print "All Done :)"
def main(argv):
    '''
    This program builds the marine data base which will be used to store
    the subset of ICOADS used in QC and other data processing. The current
    version reads in IMMA1 data from ICOADS.2.5.1 and the UID is used as
    the primary key for the data base so that it can be easily matched to
    individual obs if need be.

    # KW added para
    The database is now just a set of ascii files for each year/month.
    Later it may be the SQL database.

    The first step of the process is to read in the SST and MAT
    climatologies from file. These are 1degree latitude by 1 degree
    longitude by 73 pentad fields in NetCDF format. The data are read into
    numpy arrays.

    Next a connection is made to the data base, which may or may not
    already exist. If it does not exist, a database will be created.

    The program then loops over all years and months and DROPs existing
    tables for each year if they already exist and then recreates them. It
    then loops over all months in the year, opens the appropriate IMMA
    file and reads in the data one observation at a time.

    :param argv: command line arguments; -i/--ifile names the configuration
                 file and --year1/--year2/--month1/--month2 set the period.
    '''
    print '########################'
    print 'Running make_and_full_qc'
    print '########################'

    # defaults, overridden from the command line below
    inputfile = 'configuration.txt'
    month1 = 1
    month2 = 1
    year1 = 1880
    year2 = 1880

    try:
        opts, args = getopt.getopt(argv, "hi:",
                                   ["ifile=", "year1=", "year2=",
                                    "month1=", "month2="])
    except getopt.GetoptError:
        # KW changed Make_DB.py to make_and_full_qc.py
        print 'Usage make_and_full_qc.py -i <configuration_file> '+\
        '--year1 <start year> --year2 <end year> '+\
        '--month1 <start month> --month2 <end month>'
        sys.exit(2)

    inputfile, year1, year2, month1, month2 = qc.get_arguments(opts)

    print 'Input file is ', inputfile
    print 'Running from ', year1, ' to ', year2
    print ''

    config = qc.get_config(inputfile)

    # KW 'switch' telling the code whether to run in HadISDH-only mode
    # (True) or full mode (False)
    HadISDHSwitch = config['HadISDHSwitch']

    sst_climatology_file = config['SST_climatology']
    nmat_climatology_file = config['MAT_climatology']
    # KW climatology files for the humidity variables
    at_climatology_file = config['AT_climatology']
    dpt_climatology_file = config['DPT_climatology']
    shu_climatology_file = config['SHU_climatology']
    vap_climatology_file = config['VAP_climatology']
    crh_climatology_file = config['CRH_climatology']
    cwb_climatology_file = config['CWB_climatology']
    dpd_climatology_file = config['DPD_climatology']
    # KW SLP climatology - needed if no SLP ob exists or it fails QC, or if
    # humidity is derived using climatological pressure
    slp_climatology_file = config['SLP_climatology']
    icoads_dir = config['ICOADS_dir']
    # KW the 'recent' ICOADS dir for files 2015+
    recent_icoads_dir = config['RECENT_ICOADS_dir']
    bad_id_file = config['IDs_to_exclude']
    # KW directory to write the QC'd ascii data to - hijacking the SQL
    # data_base_dir for now
    data_base_dir = config['data_base_dir']
    # KW suffix for the output file name noting which iteration we're on
    output_suffix = config['output_suffix']
    # KW note: this reads the OLD SST stdevs - nothing reads the newer
    # OSTIA one yet
    sst_stdev_climatology_file = config['Old_SST_stdev_climatology']
    sst_stdev_1_file = config['SST_buddy_one_box_to_buddy_avg']
    sst_stdev_2_file = config['SST_buddy_one_ob_to_box_avg']
    sst_stdev_3_file = config['SST_buddy_avg_sampling']
    # KW standard deviation files for AT and DPT - for MDSKate_buddy_check
    at_stdev_climatology_file = config['AT_stdev_climatology']
    dpt_stdev_climatology_file = config['DPT_stdev_climatology']

    # KW hardwired limits passed through the config file, or None
    if ('HardLimits' in config):
        HardLimit = np.float(config['HardLimits'])
    else:
        HardLimit = None
    print "This is the provided HardLimit: ",HardLimit
    #pdb.set_trace()

    print 'SST climatology =', sst_climatology_file
    print 'NMAT climatology =', nmat_climatology_file
    # KW humidity climatology files (NOTE: no AT line is printed here)
    print 'DPT climatology =', dpt_climatology_file
    print 'SHU climatology =', shu_climatology_file
    print 'VAP climatology =', vap_climatology_file
    print 'CRH climatology =', crh_climatology_file
    print 'CWB climatology =', cwb_climatology_file
    print 'DPD climatology =', dpd_climatology_file
    print 'SLP climatology =', slp_climatology_file
    print 'ICOADS directory =', icoads_dir
    print 'RECENT ICOADS directory =', recent_icoads_dir
    print 'List of bad IDs =', bad_id_file
    print 'QCd Database directory =', data_base_dir
    print 'QCd File Suffix =', output_suffix
    print ''

    ids_to_exclude = process_bad_id_file(bad_id_file)

    # read in climatology files
    climsst = read_climatology(sst_climatology_file, 'sst')
    climnmat = read_climatology(nmat_climatology_file, 'nmat')
    # KW humidity-variable climatologies
    climat = read_climatology(at_climatology_file, 't2m_clims')
    climdpt = read_climatology(dpt_climatology_file, 'td2m_clims')
    climshu = read_climatology(shu_climatology_file, 'q2m_clims')
    climvap = read_climatology(vap_climatology_file, 'e2m_clims')
    climcrh = read_climatology(crh_climatology_file, 'rh2m_clims')
    climcwb = read_climatology(cwb_climatology_file, 'tw2m_clims')
    climdpd = read_climatology(dpd_climatology_file, 'dpd2m_clims')
    # KW SLP climatology for calculating humidity when no SLP value exists
    climslp = read_climatology(slp_climatology_file, 'p2m_clims')

    # KW note: if this points to OLD_SST_stdev_climatology it is a
    # (73,180,360) array whereas the SST_stdev_climatology file is just
    # (180,360)
    sst_pentad_stdev = read_climatology(sst_stdev_climatology_file, 'sst')
    sst_stdev_1 = read_climatology(sst_stdev_1_file, 'sst')
    sst_stdev_2 = read_climatology(sst_stdev_2_file, 'sst')
    sst_stdev_3 = read_climatology(sst_stdev_3_file, 'sst')
    # KW standard deviations for AT and DPT - for MDSKate_buddy_check
    at_pentad_stdev = read_climatology(at_stdev_climatology_file,
                                       't2m_stdevs')
    dpt_pentad_stdev = read_climatology(dpt_stdev_climatology_file,
                                        'td2m_stdevs')

    print 'Read climatology files'

    tim00 = time.time()

    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        tim0 = time.time()

        print year, month

        # a month either side of the candidate month is also read in (for
        # track/buddy neighbours); these may fall in different years
        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        if last_year < 1850:
            last_year = 1850
            # KW don't understand why last year forced to be 1850 yet
            last_month = 1

        print last_year, last_month, next_year, next_month

        reps = ex.Deck()
        count = 0

        # KW reading three months per candidate month is slow; candidate
        # and following month could be cached between loops, but when many
        # years run in parallel it is OK to read each time
        for readyear, readmonth in qc.year_month_gen(last_year, last_month,
                                                     next_year, next_month):

            print readyear, readmonth

            syr = str(readyear)
            smn = "%02d" % (readmonth)

            # KW THIS BIT IS FOR 3.0.0/1 (the R2.5.x filename logic has
            # been removed); 2015 onwards is IMMA1_R3.0.1 in the 'recent'
            # directory
            filename = icoads_dir+'/IMMA1_R3.0.0_'+syr+'-'+smn+'.gz'
            if (readyear >= 2015):
                filename = recent_icoads_dir+'/IMMA1_R3.0.1_'+syr+'-'+smn+'.gz'

            icoads_file = gzip.open(filename,"r")

            # KW rec holds a whole month of IMMA data split into all
            # available parameters from all available attachments; the
            # rec.read below could be sped up by skipping unused
            # attachments (variables keyed in the .data dict, see IMMA2.py)
            rec = IMMA()

            EOF = False

            while not(EOF):

                # need to wrap the read in an exception catching thingy
                # because there are some IMMA records which contain control
                # characters
                try:
                    result = rec.read(icoads_file)
                    if result == None:
                        EOF = True
                        # mark end-of-file (and unreadable records, below)
                        # with an excluded ID so they are dropped
                        # KW NOTE(review): could this ever clobber a live
                        # record's ID? verify ids_to_exclude[0] is safe
                        rec.data['ID'] = ids_to_exclude[0]
                except:
                    rec.data['ID'] = ids_to_exclude[0]

                if not(rec.data['ID'] in ids_to_exclude):

                    # strip everything out of the IMMA record except what
                    # we need; works for both IMMA and IMMA1 since e.g. C4
                    # (IMMA) and C7 (IMMA1) use the same keys (see
                    # RequiredIMMAColumnsforHadISDH.xlsx for the list)
                    keys = []
                    for key in rec.data:
                        keys.append(key)
                    for key in keys:
                        if not(key in ['YR','MO','DY','HR','LAT','LON',
                                       'DS','VS','II','ID','C1',
                                       'DI','D','WI','W','VI','VV','SLP',
                                       'IT','AT','WBTI','WBT','DPTI','DPT',
                                       'SI','SST',
                                       'DCK','SID','PT','DUPS',
                                       'COR','TOB','TOT','EOT','TOH','EOH',
                                       'SIM','LOV','HOP','HOT','HOB','HOA',
                                       'SMF',
                                       'UID']):
                            if key in rec.data:
                                del rec.data[key]

                    # KW a data[key] entry is only created when the key was
                    # present in the ob, so add blank entries for missing
                    # non-core0 keys to keep the later print happy ('UID'
                    # is absent from RECENT_ICOADS 2015+)
                    for inkey in ['DUPS','COR','TOB','TOT','EOT',
                                  'TOH','EOH','SIM','LOV','HOP','HOT',
                                  'HOB','HOA','SMF','UID']:
                        if not(inkey in keys):
                            rec.data[inkey] = None

                    rep = ex.MarineReport(rec)
                    del rec

                    #************HadISDH ONLY*******************************
                    # KW keep the ob only if it is from a ship (PT 0-5) or
                    # moored platform/buoy (6,8) AND has both AT and DPT
                    # present (qc.value_check returns 0 if present). When
                    # HadISDHSwitch is False all obs are processed.
                    # KW Oct 2016: platforms (and PT=14 coastal/island
                    # stations) deliberately excluded - height info too
                    # variable (<10 to >200m)
                    if (not (HadISDHSwitch)) | ((rep.data['PT'] in [0,1,2,3,4,5,6,8]) &
                                                (qc.value_check(rep.getvar('AT')) == 0) &
                                                (qc.value_check(rep.getvar('DPT')) == 0)):
                        #*******************************************************
                        # add blank containers for the humidity variables
                        # calculated below
                        rep.setvar(['SHU','VAP','CRH','CWB','DPD'])

                        # climatological SLP, used to calculate humidity
                        # values when no good quality SLP ob exists
                        rep_slp_clim = get_clim(rep, climslp)
                        rep.add_climate_variable('SLP', rep_slp_clim)

                        # calculate humidity variables now so that really
                        # silly values can be kicked out (they cause longer
                        # output line lengths); values come back None if
                        # there is no climslp or if RH is < 0 or > 150
                        rep.calcvar(['SHU','VAP','CRH','CWB','DPD'])

                        # reject silly values: no RH means AT or DPT was
                        # missing; RH in (0,150]; AT and DPT in [-80,65];
                        # SHU > 0. Note this also drops obs with no
                        # climatology available. Each rejection deletes rep
                        # and makes a fresh rec because the end of the
                        # WHILE body is skipped.
                        if (rep.getvar('CRH') == None):
                            del rep
                            rec = IMMA()
                            continue
                        if ((rep.getvar('CRH') <= 0.0) | (rep.getvar('CRH') > 150.0)):
                            del rep
                            rec = IMMA()
                            continue
                        if ((rep.getvar('AT') < -80.) | (rep.getvar('AT') > 65.)):
                            del rep
                            rec = IMMA()
                            continue
                        if ((rep.getvar('DPT') < -80.) | (rep.getvar('DPT') > 65.)):
                            del rep
                            rec = IMMA()
                            continue
                        if (rep.getvar('SHU') <= 0.0):
                            del rep
                            rec = IMMA()
                            continue

                        # attach climatologies for all variables (for the
                        # outlier test and anomaly creation); obs whose
                        # AT/DPT clim or stdev is missing (None) are dropped
                        rep_sst_clim = get_clim(rep, climsst)
                        rep.add_climate_variable('SST', rep_sst_clim)

                        # KW reads the ERA (or OBS+ERA) clim for AT, not
                        # NMAT, plus the climatological stdev for the
                        # outlier test
                        rep_mat_clim = get_clim(rep, climat)
                        rep_mat_stdev = get_clim(rep, at_pentad_stdev)
                        if ((rep_mat_clim == None) | (rep_mat_stdev == None)):
                            del rep
                            rec = IMMA()
                            continue
                        else:
                            rep.add_climate_variable('AT', rep_mat_clim)
                            rep.add_stdev_variable('AT', rep_mat_stdev)

                        rep_dpt_clim = get_clim(rep, climdpt)
                        rep_dpt_stdev = get_clim(rep, dpt_pentad_stdev)
                        if ((rep_dpt_clim == None) | (rep_dpt_stdev == None)):
                            del rep
                            rec = IMMA()
                            continue
                        else:
                            rep.add_climate_variable('DPT', rep_dpt_clim)
                            rep.add_stdev_variable('DPT', rep_dpt_stdev)

                        # KW if any derived-humidity clim is missing then AT
                        # or DPT should already have stopped the loop, but
                        # check anyway
                        rep_shu_clim = get_clim(rep, climshu)
                        if (rep_shu_clim == None) :
                            del rep
                            rec = IMMA()
                            continue
                        else:
                            rep.add_climate_variable('SHU', rep_shu_clim)

                        rep_vap_clim = get_clim(rep, climvap)
                        if (rep_vap_clim == None) :
                            del rep
                            rec = IMMA()
                            continue
                        else:
                            rep.add_climate_variable('VAP', rep_vap_clim)

                        rep_crh_clim = get_clim(rep, climcrh)
                        if (rep_crh_clim == None) :
                            del rep
                            rec = IMMA()
                            continue
                        else:
                            rep.add_climate_variable('CRH', rep_crh_clim)

                        rep_cwb_clim = get_clim(rep, climcwb)
                        if (rep_cwb_clim == None) :
                            del rep
                            rec = IMMA()
                            continue
                        else:
                            rep.add_climate_variable('CWB', rep_cwb_clim)

                        rep_dpd_clim = get_clim(rep, climdpd)
                        if (rep_dpd_clim == None) :
                            del rep
                            rec = IMMA()
                            continue
                        else:
                            rep.add_climate_variable('DPD', rep_dpd_clim)

                        # Deck 701 has a whole bunch of otherwise good obs
                        # with missing Hours. Set to 0000UTC and
                        # recalculate the ob time
                        if (rep.getvar('DCK') == 701 and rep.getvar('YR') < 1860 and rep.getvar('HR') == None):
                            rep.data['HR'] = 0
                            rep.calculate_dt()

                        # KW HardLimit now passed to base_qc_report
                        rep = base_qc_report(rep,HardLimit)

                        reps.append(rep)
                        count += 1

                    rec = IMMA()

            icoads_file.close()

        tim1 = time.time()
        print count, " obs read and base QC ", tim1-tim0

        # filter the obs into passes and fails of basic positional QC
        # KW note this track-checks the months either side too, whose
        # results are redone/ignored on their own candidate pass - scope to
        # save effort by checking only the candidate month
        reps.sort()
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)
        passes, reps = filt.split_reports(reps)
        passes.sort()

        tim2 = time.time()
        print "obs filtered and sorted in ", tim2-tim1, len(reps)+len(passes)

        # KW ONLY obs passing base QC (date, pos, blacklist) are carried
        # forward (the old set_qc calls on the fails were removed); clear
        # and refill reps with the passes below
        del reps
        reps = ex.Deck()

        # track check the passes one ship at a time
        for one_ship in passes.get_one_ship_at_a_time():
            one_ship.track_check()
            # KW no repeated-value check needed for SST. For AT and DPT
            # this also flags tracks (>20 obs) where >=50% of values end in
            # .0 (ATround/DPTround); for DPT it also flags persistent
            # streaks of 100% RH (AT == DPT) as repsat
            one_ship.find_repeated_values(threshold=0.7, intype='AT')
            one_ship.find_repeated_values(threshold=0.7, intype='DPT')

            for rep in one_ship.rep_feed():
                rep.reset_ext()
                reps.append(rep)

        del passes

        reps.sort()

        tim3 = time.time()
        print "obs track checked in ", tim3-tim2, len(reps)

        # KW the SST bayesian/mds buddy check block (filter on trk/noval/
        # freez/clim/nonorm, buddy check the passes, merge back) is
        # commented out for now to save time on debug
        tim4 = time.time()
        print "obs SST buddy checked in ", tim4-tim3, len(reps)

        # NMAT (AT) buddy check. mdsKATE_buddy_check only checks the
        # candidate month/year and uses the actual pentad stdev for that
        # time of year rather than the average. date/pos/blklst/noval
        # filters already applied above; 'day' not filtered because all obs
        # are wanted for AT and DPT. Bayesian buddy check not applied (not
        # usable for DPT, experimental).
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'trk', 0)
        filt.add_qc_filter('AT', 'clim', 0)
        filt.add_qc_filter('AT', 'nonorm', 0)

        # 'reps' are the obs that failed one of the filter tests above
        passes, reps = filt.split_reports(reps)

        passes.mdsKATE_buddy_check('AT', at_pentad_stdev, year, month, HardLimit)

        # fails are flagged 8 (not buddy checked) - no point spending time
        # buddy checking them
        reps.set_qc('AT', 'bud', 8)

        for i in range(0, len(passes)):
            rep = passes.pop(0)
            reps.append(rep)

        del passes

        reps.sort()

        tim5 = time.time()
        print "obs MAT buddy checked in ", tim5-tim4, len(reps)

        # DPT buddy check - same scheme as AT; 'nonorm' not filtered
        # because ERA (or ERA+obs) always supplies a norm; bayesian check
        # not applied (unset flags default to 9)
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'trk', 0)
        filt.add_qc_filter('DPT', 'clim', 0)

        passes, reps = filt.split_reports(reps)

        passes.mdsKATE_buddy_check('DPT', dpt_pentad_stdev, year, month, HardLimit)

        reps.set_qc('DPT', 'bud', 8)  # KW set as 8 for now

        for i in range(0, len(passes)):
            rep = passes.pop(0)
            reps.append(rep)

        del passes

        reps.sort()

        tim6 = time.time()
        print "obs DPT buddy checked in ", tim6-tim5, len(reps)

        syr = str(year)
        smn = "%02d" % (month)

        # KW write to data_base_dir (not icoads_dir) so the original data
        # are untouched; only candidate-month obs are written out
        outfile = open(data_base_dir+'/new_suite_'+syr+smn+'_'+output_suffix+'.txt', 'w')
        for rep in reps.reps:
            if rep.data['YR'] == year and rep.data['MO'] == month:
                outfile.write(rep.print_report())
        outfile.close()

        del reps

        tim11 = time.time()
        print year, " so far in ", tim11-tim00
def main(argv): ''' This program builds the marine data base which will be used to store the subset of ICOADS used in QC and other data processing. The current version reads in IMMA1 data from ICOADS.2.5.1 and the UID is used as the primary key for the data base so that it can be easily matched to individual obs if need be. The first step of the process is to read in the SST and MAT climatologies from file. These are 1degree latitude by 1 degree longitude by 73 pentad fields in NetCDF format. The data are read into numpy arrays. Next a connection is made to the data base, which may or may not already exist. If it does not exist, a database will be created. The program then loops over all years and months and DROPs existing tables for each year if they already exist and then recreates them. It then loops over all months in the year, opens the appropriate IMMA file and reads in the data one observation at a time. ''' print '######################' print 'Running Make_and_qc_DB' print '######################' inputfile = 'configuration.txt' month1 = 1 month2 = 12 try: opts, args = getopt.getopt(argv, "hi:", ["ifile=", "year1=", "year2=", "month1=", "month2="]) except getopt.GetoptError: print 'Usage Make_DB.py -i <configuration_file> '+\ '--year1 <start year> --year2 <end year> '+\ '--month1 <start month> --month2 <end month>' sys.exit(2) inputfile, year1, year2, month1, month2 = qc.get_arguments(opts) print 'Input file is ', inputfile print 'Running from ', year1, ' to ', year2 print '' config = qc.get_config(inputfile) sst_climatology_file = config['SST_climatology'] nmat_climatology_file = config['MAT_climatology'] data_base_host = config['data_base_host'] data_base_name = config['data_base_name'] icoads_dir = config['ICOADS_dir'] bad_id_file = config['IDs_to_exclude'] print 'SST climatology =', sst_climatology_file print 'NMAT climatology =', nmat_climatology_file print 'Data base host =', data_base_host print 'Data base name =', data_base_name print 'ICOADS 
directory =', icoads_dir print 'List of bad IDs =', bad_id_file print '' idfile = open(bad_id_file, 'r') ids_to_exclude = [] for line in idfile: line = line.rstrip() while len(line) < 9: line = line+' ' if line != ' ': ids_to_exclude.append(line) idfile.close() #read in climatology files climatology = Dataset(sst_climatology_file) climsst = climatology.variables['sst'][:] climatology = Dataset(nmat_climatology_file) climnmat = climatology.variables['nmat'][:] print 'Read climatology files' #connect to database connection = MySQLdb.connect(host=data_base_host, user='******', db=data_base_name) cursor = connection.cursor() t00 = time.time() for year in range(year1, year2+1): db.make_tables_for_year(cursor, year) db.make_additional_qc_table_for_year(cursor, year) # db.disable_keys(cursor, year) connection.commit() for month in range(1, 13): t0 = time.time() print year, month syr = str(year) smn = "%02d" % (month,) filename = icoads_dir+'/R2.5.1.'+syr+'.'+smn+'.gz' if year > 2007: filename = icoads_dir+'/R2.5.2.'+syr+'.'+smn+'.gz' icoads_file = gzip.open(filename,"r") rec = IMMA() count = 0 reps = [] while rec.read(icoads_file): if not(rec.data['ID'] in ids_to_exclude): try: rep = qc.imma1_record_to_marine_rep(rec, climsst, climnmat) rep.bad_position = qc.position_check(rep.lat, rep.lon) rep.bad_date = qc.date_check(rep.year, rep.month, rep.day, rep.hour) if rep.bad_position == 0 and rep.bad_date == 0: rep.day_check = qc.day_test(rep.year,rep.month,rep.day,rep.hour,rep.lat,rep.lon) else: rep.day_check = 1 rep.no_sst = qc.value_check(rep.sst) rep.sst_below_freezing = qc.sst_freeze_check(rep.sst, 0.0) rep.sst_climatology_fail = qc.climatology_check(rep.sst,rep.sst_norm,8.0) rep.no_sst_normal = qc.no_normal_check(rep.sst_norm) rep.no_mat = qc.value_check(rep.mat) rep.mat_climatology_fail = qc.climatology_check(rep.mat,rep.mat_norm,10.0) rep.no_mat_normal = qc.no_normal_check(rep.mat_norm) rep.blacklist = qc.blacklist(rep.id, rep.dck, rep.year, rep.lat, rep.lon) inyear = 
rep.year if year == rep.year: reps.append(rep) if len(reps) == 1000: db.add_multiple_marine_reports_to_db(cursor,year,reps) reps = [] # db.add_marine_report_to_db(cursor,inyear,rep) count += 1 except: assert False, "Failed to add records to database" #catch the as yet unadded obs db.add_multiple_marine_reports_to_db(cursor,year,reps) icoads_file.close() t1 = time.time() print count," obs ingested. ",t1-t0 #commit once per month connection.commit() # db.enable_keys(cursor, year) t11 = time.time() print year," done in ",t11-t00 #close the connection to the data base connection.close()