def record_parameters(profile, bgStdLevels, bgevStdLevels, origLevels, ptLevels, bgLevels):
    # pack the parameter arrays into the enbackground table
    # for consumption by the buddy check
    bgstdlevels = main.pack_array(bgStdLevels)
    bgevstdlevels = main.pack_array(bgevStdLevels)
    origlevels = main.pack_array(origLevels)
    ptlevels = main.pack_array(ptLevels)
    bglevels = main.pack_array(bgLevels)

    query = "REPLACE INTO enbackground VALUES(?,?,?,?,?,?);"
    main.dbinteract(query, [profile.uid(), bgstdlevels, bgevstdlevels, origlevels, ptlevels, bglevels])
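
# Illustrative sketch only (not part of the source): the same REPLACE-with-placeholders
# pattern that record_parameters relies on, shown with the sqlite3 module directly.
# The table and column names here are hypothetical, and numpy.save stands in for the
# project's own main.pack_array serializer.
import io
import sqlite3
import numpy as np

def to_blob(arr):
    # serialize a numpy array to bytes so it can be stored in a BLOB column
    buf = io.BytesIO()
    np.save(buf, arr)
    return sqlite3.Binary(buf.getvalue())

conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE demo_background (uid INTEGER PRIMARY KEY, bglevels BLOB);")
# REPLACE overwrites any cached arrays already stored for this uid
conn.execute("REPLACE INTO demo_background VALUES(?,?);", [1, to_blob(np.array([1.5, 1.4, 1.2]))])
blob = conn.execute("SELECT bglevels FROM demo_background WHERE uid = 1;").fetchone()[0]
print(np.load(io.BytesIO(blob)))  # the cached array round-trips intact
conn.close()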
def process_row(uid, logdir):
    '''run all tests on the indicated database row'''

    # reroute stdout, stderr to separate files for each profile to preserve logs
    sys.stdout = open(logdir + "/" + str(uid) + ".stdout", "w")
    sys.stderr = open(logdir + "/" + str(uid) + ".stderr", "w")

    # extract profile
    profile = main.get_profile_from_db(uid)

    # mask out error codes in temperature data
    main.catchFlags(profile)

    # run tests
    for itest, test in enumerate(testNames):
        try:
            result = run(test, [profile], parameterStore)[0]
        except:
            print test, 'exception', sys.exc_info()
            result = np.zeros(1, dtype=bool)

        try:
            query = "UPDATE " + sys.argv[1] + " SET " + test + "=? WHERE uid=" + str(profile.uid()) + ";"
            main.dbinteract(query, [main.pack_array(result)])
        except:
            print 'db exception', sys.exc_info()
def test(p, parameters):
    """
    Runs the quality control check on profile p and returns a numpy array
    of quality control decisions with False where the data value has
    passed the check and True where it failed.
    """

    cruise = p.cruise()
    uid = p.uid()

    # don't bother if cruise == 0 or None, or if timestamp is corrupt
    if (cruise in [0, None]) or (None in [p.year(), p.month(), p.day(), p.time()]):
        return np.zeros(1, dtype=bool)

    # don't bother if this has already been analyzed
    command = 'SELECT en_track_check FROM ' + parameters["table"] + ' WHERE uid = ' + str(uid) + ';'
    en_track_result = main.dbinteract(command)
    if en_track_result[0][0] is not None:
        en_track_result = main.unpack_row(en_track_result[0])[0]
        result = np.zeros(1, dtype=bool)
        result[0] = np.any(en_track_result)
        return result

    # some detector types cannot be assessed by this test; do not raise flag.
    if p.probe_type() in [None]:
        return np.zeros(1, dtype=bool)

    # fetch all profiles on track, sorted chronologically, earliest first (None sorted as highest)
    command = 'SELECT uid, year, month, day, time, lat, long, probe FROM ' + parameters["table"] + ' WHERE cruise = ' + str(cruise) + ' and year is not null and month is not null and day is not null and time is not null ORDER BY year, month, day, time, uid ASC;'
    track_rows = main.dbinteract(command)

    # start all as passing by default:
    EN_track_results = {}
    for i in range(len(track_rows)):
        EN_track_results[track_rows[i][0]] = np.zeros(1, dtype=bool)

    # copy the list of headers;
    # remove entries as they are flagged.
    passed_rows = copy.deepcopy(track_rows)
    rejects = findOutlier(passed_rows, EN_track_results)

    while rejects != []:
        passed_index = [x for x in range(len(passed_rows)) if x not in rejects]
        passed_rows = [passed_rows[index] for index in passed_index]
        rejects = findOutlier(passed_rows, EN_track_results)

    # if more than half got rejected, reject everyone
    if len(passed_rows) < len(track_rows) / 2:
        for i in range(len(track_rows)):
            EN_track_results[track_rows[i][0]][0] = True

    # write all to db
    result = []
    for i in range(len(track_rows)):
        result.append((main.pack_array(EN_track_results[track_rows[i][0]]), track_rows[i][0]))
    query = "UPDATE " + sys.argv[1] + " SET en_track_check=? WHERE uid=?"
    main.interact_many(query, result)

    return EN_track_results[uid]
def process_row(uid, logdir):
    '''run all tests on the indicated database row'''

    # reroute stdout, stderr to separate files for each profile to preserve logs
    sys.stdout = open(logdir + "/" + str(uid) + ".stdout", "w")
    sys.stderr = open(logdir + "/" + str(uid) + ".stderr", "w")

    # extract profile
    profile = main.get_profile_from_db(uid)

    # mask out error codes in temperature data
    main.catchFlags(profile)

    # run tests
    for itest, test in enumerate(testNames):
        try:
            result = run(test, [profile], parameterStore)[0]
        except:
            print test, 'exception', sys.exc_info()
            result = np.zeros(1, dtype=bool)

        try:
            query = "UPDATE " + sys.argv[1] + " SET " + test + "=? WHERE uid=" + str(profile.uid()) + ";"
            main.dbinteract(query, [main.pack_array(result)])
        except:
            print 'db exception', sys.exc_info()
def test(p, parameters):
    """
    Runs the quality control check on profile p and returns a numpy array
    of quality control decisions with False where the data value has
    passed the check and True where it failed.
    """

    country = p.primary_header['Country code']
    cruise = p.cruise()
    originator_cruise = p.originator_cruise()
    uid = p.uid()

    # don't bother if this has already been analyzed
    command = 'SELECT en_track_check FROM ' + parameters["table"] + ' WHERE uid = ' + str(uid) + ';'
    en_track_result = main.dbinteract(command)
    if en_track_result[0][0] is not None:
        en_track_result = main.unpack_row(en_track_result[0])[0]
        result = np.zeros(1, dtype=bool)
        result[0] = np.any(en_track_result)
        return result

    # make sure this profile makes sense in the track check
    if not assess_usability(p):
        return np.zeros(1, dtype=bool)

    # fetch all profiles on track, sorted chronologically, earliest first (None sorted as highest), then by uid
    command = 'SELECT uid, year, month, day, time, lat, long, probe, raw FROM ' + parameters["table"] + ' WHERE cruise = ' + str(cruise) + ' and country = "' + str(country) + '" and ocruise = "' + str(originator_cruise) + '" and year is not null and month is not null and day is not null and time is not null ORDER BY year, month, day, time, uid ASC;'
    track_rows = main.dbinteract(command)

    # avoid inappropriate profiles
    track_rows = [tr for tr in track_rows if assess_usability_raw(tr[8][1:-1])]

    # start all as passing by default
    EN_track_results = {}
    for i in range(len(track_rows)):
        EN_track_results[track_rows[i][0]] = np.zeros(1, dtype=bool)

    # copy the list of headers;
    # remove entries as they are flagged.
    passed_rows = copy.deepcopy(track_rows)
    rejects = findOutlier(passed_rows, EN_track_results)

    while rejects != []:
        passed_index = [x for x in range(len(passed_rows)) if x not in rejects]
        passed_rows = [passed_rows[index] for index in passed_index]
        rejects = findOutlier(passed_rows, EN_track_results)

    # if more than half got rejected, reject everyone
    if len(passed_rows) < len(track_rows) / 2:
        for i in range(len(track_rows)):
            EN_track_results[track_rows[i][0]][0] = True

    # write all to db
    result = []
    for i in range(len(track_rows)):
        result.append((main.pack_array(EN_track_results[track_rows[i][0]]), track_rows[i][0]))
    query = "UPDATE " + sys.argv[1] + " SET en_track_check=? WHERE uid=?"
    main.interact_many(query, result)

    return EN_track_results[uid]
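
# Illustrative, self-contained sketch (not from the source) of the iterative rejection
# loop used above: flag outliers, drop them from the working list, re-run the detector
# until nothing new is flagged, then reject the whole track if more than half of it was
# removed. The toy find_outliers below simply flags values far from the median and stands
# in for findOutlier; the track data are made up.
import numpy as np

def find_outliers(rows):
    # indices of points more than 3 median-absolute-deviations from the track median
    values = np.array([r[1] for r in rows])
    med = np.median(values)
    mad = np.median(np.abs(values - med)) or 1.0
    return [i for i, v in enumerate(values) if abs(v - med) > 3 * mad]

track = list(enumerate([5.1, 4.9, 5.0, 30.0, 5.2, 4.8]))   # (uid, implied speed) pairs
flags = {uid: False for uid, _ in track}                    # start all as passing

passed = list(track)
rejects = find_outliers(passed)
while rejects != []:
    for idx in rejects:
        flags[passed[idx][0]] = True                        # record rejection against its uid
    passed = [row for i, row in enumerate(passed) if i not in rejects]
    rejects = find_outliers(passed)

# if more than half of the track was rejected, reject everything
if len(passed) < len(track) / 2:
    flags = {uid: True for uid, _ in track}

print(flags)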
def builddb(infile, check_originator_flag_type=True, months_to_use=range(1, 13), outfile='iquod.db', dbtable='iquod'):

    conn = sqlite3.connect(outfile, isolation_level=None)
    cur = conn.cursor()

    # Identify tests
    testNames = main.importQC('qctests')
    testNames.sort()

    # set up our table
    query = "CREATE TABLE IF NOT EXISTS " + dbtable + """(
                raw text,
                truth BLOB,
                uid integer PRIMARY KEY,
                year integer,
                month integer,
                day integer,
                time real,
                lat real,
                long real,
                country text,
                cruise integer,
                ocruise text,
                probe integer,
                training integer,
                flagged integer,
                """
    for i in range(len(testNames)):
        query += testNames[i].lower() + ' BLOB'
        if i < len(testNames) - 1:
            query += ','
        else:
            query += ');'

    cur.execute(query)

    # populate table from wod-ascii data
    fid = open(infile)
    uids = []
    good = 0
    bad = 0

    while True:
        # extract profile as wodpy object and raw text
        start = fid.tell()
        profile = wod.WodProfile(fid)
        end = fid.tell()
        fid.seek(start)
        raw = fid.read(end - start)
        fid.seek(end)

        # set up dictionary for populating query string
        p = profile.npdict()
        p['raw'] = "'" + raw + "'"

        # check for duplicate profiles in raw data
        if p['uid'] in uids:
            if profile.is_last_profile_in_file(fid) == True:
                break
            else:
                continue
        uids.append(p['uid'])

        # skip pathological profiles
        isgood = assessProfile(profile, check_originator_flag_type, months_to_use)
        if not isgood and profile.is_last_profile_in_file(fid) == True:
            break
        elif not isgood:
            continue

        # encode temperature error codes into truth array
        truth = encodeTruth(profile)
        p['truth'] = main.pack_array(truth)

        # extract country code
        country = profile.primary_header['Country code']

        # originator cruise
        orig_cruise = profile.originator_cruise()

        # keep tabs on how many good and how many bad profiles have been added to db
        # nowire == index of first wire break level
        wireqc = qctests.CSIRO_wire_break.test(profile, {})
        try:
            nowire = list(wireqc).index(True)
        except:
            nowire = len(truth)
        # flag only counts if it's before the wire break:
        flagged = dbutils.summarize_truth(truth[0:nowire])
        if flagged:
            bad += 1
        else:
            good += 1

        query = "INSERT INTO " + dbtable + " (raw, truth, uid, year, month, day, time, lat, long, country, cruise, ocruise, probe, flagged) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?);"
        values = (p['raw'], p['truth'], p['uid'], p['year'], p['month'], p['day'], p['time'], p['latitude'], p['longitude'], country, p['cruise'], orig_cruise, p['probe_type'], int(flagged))
        main.dbinteract(query, values, targetdb=outfile)

        if profile.is_last_profile_in_file(fid) == True:
            break

    conn.commit()
    print('number of clean profiles written:', good)
    print('number of flagged profiles written:', bad)
    print('total number of profiles written:', good + bad)
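
# Hedged usage sketch: the WOD-ASCII file name below is illustrative, not from the source.
# This builds the default table from profiles observed January through June only; builddb
# then prints how many clean versus flagged profiles were written.
builddb('wod_ascii_example.dat',
        check_originator_flag_type=True,
        months_to_use=range(1, 7),
        outfile='iquod.db',
        dbtable='iquod')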
def db_to_df(table,
             filter_on_wire_break_test=False,
             filter_on_tests={},
             n_to_extract=numpy.iinfo(numpy.int32).max,
             applyparse=True,
             targetdb='iquod.db'):
    '''
    Reads the table from targetdb into a pandas dataframe.
    If filter_on_wire_break_test is True, the results from that test are used to exclude
    levels below a wire break from the test results and the wire break test is not returned.
    filter_on_tests is a generalised form of filter_on_wire_break_test: it is a dictionary
    mapping an action ('Remove above reject', 'Remove below reject', 'Remove profile',
    'Remove rejected levels') to a list of test names; levels or whole profiles failing
    those tests are excluded accordingly ('Optional' and 'At least one from group' entries
    are ignored).
    Set n_to_extract to limit the number of rows extracted to the specified number.
    '''

    # what tests are available
    testNames = main.importQC('qctests')
    testNames.sort()

    # connect to database
    conn = sqlite3.connect(targetdb, isolation_level=None)
    cur = conn.cursor()

    # extract matrix of test results and true flags into a dataframe
    query = 'SELECT uid, truth'
    for test in testNames:
        query += ', ' + test.lower()
    query += ' FROM ' + table
    query += ' WHERE uid IN (SELECT uid FROM ' + table + ' ORDER BY RANDOM() LIMIT ' + str(n_to_extract) + ')'
    cur.execute(query)
    rawresults = cur.fetchall()

    # process the results in chunks of <sub> rows;
    # ichunk indexes the chunk, while i is reused below as a row index within each chunk
    sub = 1000
    df_final = None
    for ichunk in range(math.ceil(len(rawresults) / sub)):
        df = pandas.DataFrame(rawresults[ichunk * sub:(ichunk + 1) * sub]).astype('bytes')
        df.columns = ['uid', 'Truth'] + testNames
        df = df.astype({'uid': 'int'})

        if filter_on_wire_break_test:
            nlevels = get_n_levels_before_fail(df['CSIRO_wire_break'])
            del df['CSIRO_wire_break'] # No use for this now.
            testNames = df.columns[2:].values.tolist()
            for i in range(len(df.index)):
                for j in range(1, len(df.columns)):
                    qc = unpack_qc(df.iloc[i, j])
                    # Some QC tests may return only one value so check for this.
                    if len(qc) > 1:
                        qc = qc[:nlevels[i]]
                    df.iat[i, j] = main.pack_array(qc)

        todrop = set()
        for action in filter_on_tests:
            # Check if the action is relevant.
            if action == 'Optional' or action == 'At least one from group':
                continue

            # Initialise variables.
            nlevels = -1
            outcomes = False
            qcresults = []

            for testname in filter_on_tests[action]:
                for i in range(0, len(df.index)):
                    if action == 'Remove above reject':
                        nlevels = get_reversed_n_levels_before_fail([df[testname][i]])[0]
                    elif action == 'Remove below reject':
                        nlevels = get_n_levels_before_fail([df[testname][i]])[0]
                    elif action == 'Remove profile':
                        outcomes = check_for_fail([df[testname][i]])[0]
                    elif action == 'Remove rejected levels':
                        qcresults = unpack_qc_results([df[testname][i]])[0]
                    else:
                        raise NameError('Unrecognised action: ' + action)

                    if (((action == 'Remove above reject' or action == 'Remove below reject') and nlevels == 0) or
                            (action == 'Remove profile' and outcomes == True) or
                            (action == 'Remove rejected levels' and numpy.count_nonzero(qcresults == False) == 0)):
                        # Completely remove a profile if it has no valid levels or if it
                        # has a fail and the action is to remove.
                        todrop.add(i)
                    elif (action != 'Remove profile'):
                        for j in range(1, len(df.columns)):
                            # Retain only the levels that passed testname.
                            # Some QC tests may return only one value so check for this.
                            qc = unpack_qc(df.iloc[i, j])
                            if len(qc) > 1:
                                if action == 'Remove above reject':
                                    qc = qc[nlevels:]
                                elif action == 'Remove below reject':
                                    qc = qc[:nlevels]
                                elif action == 'Remove rejected levels':
                                    qc = qc[qcresults == False]
                            df.iat[i, j] = main.pack_array(qc)

                del df[testname] # No need to keep this any longer.

        df.reset_index(inplace=True, drop=True)
        todrop = list(todrop)
        if len(todrop) > 0:
            df.drop(todrop, inplace=True)
            df.reset_index(inplace=True, drop=True)
        testNames = df.columns[2:].values.tolist()

        if applyparse:
            df[['Truth']] = df[['Truth']].apply(parse_truth)
            df[testNames] = df[testNames].apply(parse)

        # accumulate the processed chunks
        if ichunk == 0:
            df_final = df
        else:
            df_final = pandas.concat([df_final, df])

    return df_final.reset_index(drop=True)
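
# Hedged usage sketch (the table, test and database names are illustrative): sample up to
# 1000 random rows, drop any profile that failed EN_background_check, and trim levels below
# the first EN_spike_and_step_check rejection before parsing the results.
filters = {'Remove profile': ['EN_background_check'],
           'Remove below reject': ['EN_spike_and_step_check']}
df = db_to_df('iquod', filter_on_tests=filters, n_to_extract=1000, targetdb='iquod.db')
print(df.columns.tolist())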
def run_qc(p, suspect):

    # check for pre-registered suspect tabulation, if that's what we want:
    if suspect:
        query = 'SELECT suspect FROM enspikeandstep WHERE uid = ' + str(p.uid()) + ';'
        susp = main.dbinteract(query)
        if len(susp) > 0:
            return main.unpack_row(susp[0])[0]

    # Define tolerances used.
    tolD = np.array([0, 200, 300, 500, 600])
    tolDTrop = np.array([0, 300, 400, 500, 600])
    tolT = np.array([5.0, 5.0, 2.5, 2.0, 1.5])

    # Define an array to hold results.
    qc = np.zeros(p.n_levels(), dtype=bool)

    # Get depth and temperature values from the profile.
    z = p.z()
    t = p.t()

    # Find which levels have data.
    isTemperature = (t.mask == False)
    isDepth = (z.mask == False)
    isData = isTemperature & isDepth

    # Array to hold temperature differences between levels and gradients.
    dt, gt = composeDT(t, z, p.n_levels())

    # Spikes and steps detection.
    for i in range(1, p.n_levels()):
        if i >= 2:
            if (isData[i - 2] and isData[i - 1] and isData[i]) == False:
                continue
            if z[i] - z[i - 2] >= 5.0:
                wt1 = (z[i - 1] - z[i - 2]) / (z[i] - z[i - 2])
            else:
                wt1 = 0.5
        else:
            if (isData[i - 1] and isData[i]) == False:
                continue
            wt1 = 0.5

        dTTol = determineDepthTolerance(z[i - 1], np.abs(p.latitude()))
        gTTol = 0.05

        # Check for low temperatures in the Tropics.
        # This might be more appropriate to appear in a separate EN regional
        # range check but is included here for now for consistency with the
        # original code.
        if (np.abs(p.latitude()) < 20.0 and z[i - 1] < 1000.0 and t[i - 1] < 1.0):
            dt[i] = np.ma.masked
            if suspect == True:
                qc[i - 1] = True
            continue

        qc, dt = conditionA(dt, dTTol, qc, wt1, i, suspect)
        qc, dt = conditionB(dt, dTTol, gTTol, qc, gt, i, suspect)
        qc = conditionC(dt, dTTol, z, qc, t, i, suspect)
    # End of loop over levels.

    # Step or 0.0 at the bottom of a profile.
    if isData[-1] and dt.mask[-1] == False:
        dTTol = determineDepthTolerance(z[-1], np.abs(p.latitude()))
        if np.abs(dt[-1]) > dTTol:
            if suspect == True:
                qc[-1] = True
    if isTemperature[-1]:
        if t[-1] == 0.0:
            if suspect == True:
                qc[-1] = True

    # If 4 levels or more than half the profile is rejected then reject all.
    if suspect == False:
        nRejects = np.count_nonzero(qc)
        if nRejects >= 4 or nRejects > p.n_levels() / 2:
            qc[:] = True

    # register suspects, if computed, to db
    if suspect:
        query = "REPLACE INTO enspikeandstep VALUES(?,?);"
        main.dbinteract(query, [p.uid(), main.pack_array(qc)])

    return qc
def db_to_df(table,
             filter_on_wire_break_test=False,
             filter_on_tests={},
             n_to_extract=numpy.iinfo(numpy.int32).max):
    '''
    Reads the table from iquod.db into a pandas dataframe.
    If filter_on_wire_break_test is True, the results from that test are used to exclude
    levels below a wire break from the test results and the wire break test is not returned.
    filter_on_tests is a generalised form of filter_on_wire_break_test: it is a dictionary
    mapping an action ('Remove above reject', 'Remove below reject', 'Remove profile',
    'Remove rejected levels') to a list of test names; levels or whole profiles failing
    those tests are excluded accordingly ('Optional' and 'At least one from group' entries
    are ignored).
    Set n_to_extract to limit the number of rows extracted to the specified number.
    '''

    # what tests are available
    testNames = main.importQC('qctests')
    testNames.sort()

    # connect to database
    conn = sqlite3.connect('iquod.db', isolation_level=None)
    cur = conn.cursor()

    # extract matrix of test results and true flags into a dataframe
    query = 'SELECT uid, truth'
    for test in testNames:
        query += ', ' + test.lower()
    query += ' FROM ' + table
    query += ' WHERE uid IN (SELECT uid FROM ' + table + ' ORDER BY RANDOM() LIMIT ' + str(n_to_extract) + ')'
    cur.execute(query)
    rawresults = cur.fetchall()
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = ['uid', 'Truth'] + testNames

    if filter_on_wire_break_test:
        nlevels = get_n_levels_before_fail(df['CSIRO_wire_break'])
        del df['CSIRO_wire_break'] # No use for this now.
        testNames = df.columns[2:].values.tolist()
        for i in range(len(df.index)):
            for j in range(1, len(df.columns)):
                qc = unpack_qc(df.iloc[i, j])
                # Some QC tests may return only one value so check for this.
                if len(qc) > 1:
                    qc = qc[:nlevels[i]]
                df.iat[i, j] = main.pack_array(qc)

    todrop = set()
    for action in filter_on_tests:
        # Check if the action is relevant.
        if action == 'Optional' or action == 'At least one from group':
            continue

        # Initialise variables.
        nlevels = -1
        outcomes = False
        qcresults = []

        for testname in filter_on_tests[action]:
            for i in range(0, len(df.index)):
                if action == 'Remove above reject':
                    nlevels = get_reversed_n_levels_before_fail([df[testname][i]])[0]
                elif action == 'Remove below reject':
                    nlevels = get_n_levels_before_fail([df[testname][i]])[0]
                elif action == 'Remove profile':
                    outcomes = check_for_fail([df[testname][i]])[0]
                elif action == 'Remove rejected levels':
                    qcresults = unpack_qc_results([df[testname][i]])[0]
                else:
                    raise NameError('Unrecognised action: ' + action)

                if (((action == 'Remove above reject' or action == 'Remove below reject') and nlevels == 0) or
                        (action == 'Remove profile' and outcomes == True) or
                        (action == 'Remove rejected levels' and numpy.count_nonzero(qcresults == False) == 0)):
                    # Completely remove a profile if it has no valid levels or if it
                    # has a fail and the action is to remove.
                    todrop.add(i)
                elif (action != 'Remove profile'):
                    for j in range(1, len(df.columns)):
                        # Retain only the levels that passed testname.
                        # Some QC tests may return only one value so check for this.
                        qc = unpack_qc(df.iloc[i, j])
                        if len(qc) > 1:
                            if action == 'Remove above reject':
                                qc = qc[nlevels:]
                            elif action == 'Remove below reject':
                                qc = qc[:nlevels]
                            elif action == 'Remove rejected levels':
                                qc = qc[qcresults == False]
                        df.iat[i, j] = main.pack_array(qc)

            del df[testname] # No need to keep this any longer.

    todrop = list(todrop)
    if len(todrop) > 0:
        df.drop(todrop, inplace=True)
        df.reset_index(inplace=True, drop=True)
    testNames = df.columns[2:].values.tolist()

    df[['Truth']] = df[['Truth']].apply(parse_truth)
    df[testNames] = df[testNames].apply(parse)

    return df
def run_qc(p, suspect, parameters):

    # check for pre-registered suspect tabulation, if that's what we want:
    if suspect:
        query = 'SELECT suspect FROM enspikeandstep WHERE uid = ' + str(p.uid()) + ';'
        susp = main.dbinteract(query, targetdb=parameters["db"])
        if len(susp) > 0:
            return main.unpack_row(susp[0])[0]

    # Define tolerances used.
    tolD = np.array([0, 200, 300, 500, 600])
    tolDTrop = np.array([0, 300, 400, 500, 600])
    tolT = np.array([5.0, 5.0, 2.5, 2.0, 1.5])

    # Define an array to hold results.
    qc = np.zeros(p.n_levels(), dtype=bool)

    # Get depth and temperature values from the profile.
    z = p.z()
    t = p.t()

    # Find which levels have data.
    isTemperature = (t.mask == False)
    isDepth = (z.mask == False)
    isData = isTemperature & isDepth

    # Array to hold temperature differences between levels and gradients.
    dt, gt = composeDT(t, z, p.n_levels())

    # Spikes and steps detection.
    for i in range(1, p.n_levels()):
        if i >= 2:
            if (isData[i - 2] and isData[i - 1] and isData[i]) == False:
                continue
            if z[i] - z[i - 2] >= 5.0:
                wt1 = (z[i - 1] - z[i - 2]) / (z[i] - z[i - 2])
            else:
                wt1 = 0.5
        else:
            if (isData[i - 1] and isData[i]) == False:
                continue
            wt1 = 0.5

        dTTol = determineDepthTolerance(z[i - 1], np.abs(p.latitude()))
        gTTol = 0.05

        # Check for low temperatures in the Tropics.
        # This might be more appropriate to appear in a separate EN regional
        # range check but is included here for now for consistency with the
        # original code.
        if (np.abs(p.latitude()) < 20.0 and z[i - 1] < 1000.0 and t[i - 1] < 1.0):
            dt[i] = np.ma.masked
            if suspect == True:
                qc[i - 1] = True
            continue

        qc, dt = conditionA(dt, dTTol, qc, wt1, i, suspect)
        qc, dt = conditionB(dt, dTTol, gTTol, qc, gt, i, suspect)
        qc = conditionC(dt, dTTol, z, qc, t, i, suspect)
    # End of loop over levels.

    # Step or 0.0 at the bottom of a profile.
    if isData[-1] and dt.mask[-1] == False:
        dTTol = determineDepthTolerance(z[-1], np.abs(p.latitude()))
        if np.abs(dt[-1]) > dTTol:
            if suspect == True:
                qc[-1] = True
    if isTemperature[-1]:
        if t[-1] == 0.0:
            if suspect == True:
                qc[-1] = True

    # If 4 levels or more than half the profile is rejected then reject all.
    if suspect == False:
        nRejects = np.count_nonzero(qc)
        if nRejects >= 4 or nRejects > p.n_levels() / 2:
            qc[:] = True

    # register suspects, if computed, to db
    if suspect:
        query = "REPLACE INTO enspikeandstep VALUES(?,?);"
        main.dbinteract(query, [p.uid(), main.pack_array(qc)], targetdb=parameters["db"])

    return qc
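
# Hedged usage sketch: the profile object and parameters dict below are hypothetical.
# A first pass with suspect=True caches the per-level suspect flags in enspikeandstep;
# a later pass with suspect=False returns the final rejections, rejecting the whole
# profile when 4 or more levels, or more than half of them, are flagged.
suspects = run_qc(profile, True, {"db": "iquod.db"})
rejections = run_qc(profile, False, {"db": "iquod.db"})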
            if profile.is_last_profile_in_file(fid) == True:
                break
            else:
                continue
        uids.append(p['uid'])

        # skip pathological profiles
        isgood = assessProfile(profile)
        if not isgood and profile.is_last_profile_in_file(fid) == True:
            break
        elif not isgood:
            continue

        # encode temperature error codes into truth array
        truth = encodeTruth(profile)
        p['truth'] = main.pack_array(truth)

        # keep tabs on how many good and how many bad profiles have been added to db
        # nowire == index of first wire break level
        wireqc = qctests.CSIRO_wire_break.test(profile, {})
        try:
            nowire = list(wireqc).index(True)
        except:
            nowire = len(truth)
        # flag only counts if it's before the wire break:
        flagged = dbutils.summarize_truth(truth[0:nowire])
        if flagged:
            bad += 1
        else:
            good += 1