def plot_roc(targetdb='iquod.db'):
    '''
    Build a dataframe of truth, raw text and qc results for every row with
    training=0, attach an ROC assessment to each row, then apply plotRow to
    every row.

    targetdb: filename of the sqlite database to read.
    The table name is read from sys.argv[1].
    figdir and its FP, FN, TP and TN subdirectories are created first;
    os.makedirs raises if any of them already exists.
    '''

    # names of the available qc tests
    testNames = main.importQC('qctests')

    # open the database
    cursor = sqlite3.connect(targetdb, isolation_level=None).cursor()

    # pull truth, raw text and one column per test for the non-training rows
    testColumns = ','.join(testNames)
    query = 'SELECT truth, raw, ' + testColumns + ' FROM ' + sys.argv[1] + ' WHERE training=0;'
    fetched = cursor.execute(query).fetchall()
    df = pandas.DataFrame(fetched).astype('str')
    df.columns = ['Truth', 'raw'] + testNames

    # keep the per-level truth in 'leveltruth' before 'Truth' is parsed in place
    perLevel = df[['Truth']].apply(unpack_truth).values.tolist()
    df = df.assign(leveltruth=pandas.Series(perLevel))
    df[['Truth']] = df[['Truth']].apply(dbutils.parse_truth)
    for name in testNames:
        df[[name]] = df[[name]].apply(dbutils.parse)

    # classify each row with the generated ROC function
    df['roc'] = df.apply(generateROC(), axis=1)

    # figure directories: the root first, then one per outcome
    for suffix in ('', '/FP', '/FN', '/TP', '/TN'):
        os.makedirs(figdir + suffix)

    df.apply(plotRow, axis=1)
def setUp(self):
    '''
    Load every profile listed via datafiles.json into self.profiles.

    The file names come from main.readInput('datafiles.json') and the
    per-profile file name / file position from main.extractProfiles.
    Every qc test named by main.importQC('qctests') is also imported.
    '''

    filenames = main.readInput('datafiles.json')
    profiles = main.extractProfiles(filenames)

    # identify and import tests
    # NOTE(review): exec on a string built from importQC output -- confirm
    # those names can only ever be module names found in qctests.
    testNames = main.importQC('qctests')
    testNames.sort()
    for testName in testNames:
        exec('from qctests import ' + testName)

    self.profiles = []
    currentFile = ''
    f = None
    try:
        for pinfo in profiles:
            # Load the profile data, switching file only when the name changes.
            if pinfo.file_name != currentFile:
                if f is not None:
                    f.close()
                currentFile = pinfo.file_name
                f = open(currentFile)
            # only seek when the handle is not already at the profile start
            if f.tell() != pinfo.file_position:
                f.seek(pinfo.file_position)
            self.profiles.append(wod.WodProfile(f))
    finally:
        # the last file opened used to be left open
        if f is not None:
            f.close()
def setUp(self):
    '''
    Read the profiles described by datafiles.json and store them, in order,
    in self.profiles.

    main.readInput supplies the data file names and main.extractProfiles the
    file name and file position of each profile. All qc tests reported by
    main.importQC('qctests') are imported as well.
    '''

    filenames = main.readInput('datafiles.json')
    profiles = main.extractProfiles(filenames)

    # identify and import tests
    # NOTE(review): exec runs a string assembled from importQC output --
    # confirm those names are always plain module names.
    testNames = main.importQC('qctests')
    testNames.sort()
    for testName in testNames:
        exec('from qctests import ' + testName)

    self.profiles = []
    currentFile = ''
    f = None
    try:
        for pinfo in profiles:
            # Load the profile data; reopen only when the source file changes.
            if pinfo.file_name != currentFile:
                if f is not None:
                    f.close()
                currentFile = pinfo.file_name
                f = open(currentFile)
            # skip the seek when the previous read left us at the right place
            if f.tell() != pinfo.file_position:
                f.seek(pinfo.file_position)
            self.profiles.append(wod.WodProfile(f))
    finally:
        # previously the final file handle was never closed
        if f is not None:
            f.close()
def plot_roc(targetdb='iquod.db'):
    '''
    Build a dataframe of truth, raw text and qc results for every row with
    training=0, add an ROC assessment per row, then apply plotRow to each row.

    targetdb: filename of the sqlite database to read; defaults to the
        previously hard-coded 'iquod.db'.
    The table name is read from sys.argv[1].
    figdir and its FP, FN, TP and TN subdirectories are created with
    os.makedirs, which raises if they already exist.
    '''

    # get qc tests
    testNames = main.importQC('qctests')

    # connect to database, and release it as soon as the rows are in memory
    conn = sqlite3.connect(targetdb, isolation_level=None)
    try:
        cur = conn.cursor()

        # extract matrix of test results and true flags into a dataframe
        query = 'SELECT truth, raw, ' + ','.join(testNames) + ' FROM ' + sys.argv[1] + ' WHERE training=0;'
        cur.execute(query)
        rawresults = cur.fetchall()
    finally:
        conn.close()
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = ['Truth', 'raw'] + testNames

    # unpack truth and qc data; 'leveltruth' is taken before 'Truth' is parsed
    truth = df[['Truth']].apply(unpack_truth).values.tolist()
    df = df.assign(leveltruth=pandas.Series(truth))
    df[['Truth']] = df[['Truth']].apply(dbutils.parse_truth)
    for t in testNames:
        df[[t]] = df[[t]].apply(dbutils.parse)

    # prepare ROC function
    assessROC = generateROC()
    df['roc'] = df.apply(assessROC, axis=1)

    # set up dirs for figures
    os.makedirs(figdir)
    os.makedirs(figdir + '/FP')
    os.makedirs(figdir + '/FN')
    os.makedirs(figdir + '/TP')
    os.makedirs(figdir + '/TN')

    df.apply(plotRow, axis=1)
def importQC_test(self):
    '''
    main.importQC must hand back a list, and each name in that list must
    have a corresponding qctests/<name>.py file.
    '''

    found = main.importQC("qctests")
    assert isinstance(found, list), 'importQC did not return a list'

    for name in found:
        modulePath = 'qctests/' + name + '.py'
        assert os.path.isfile(modulePath), 'test ' + name + ' is not found.'
def importQC_test(self):
    '''
    check that main.importQC gives a list of test names, each of which
    is backed by a file qctests/<name>.py
    '''

    names = main.importQC("qctests")
    isList = isinstance(names, list)
    assert isList, 'importQC did not return a list'

    for candidate in names:
        exists = os.path.isfile('qctests/' + candidate + '.py')
        assert exists, 'test ' + candidate + ' is not found.'
def query2df(meta, filter, tablename, database='iquod.db'):
    '''
    meta: list (or tuple) of strings of metadata to extract
    filter: string describing WHERE filter for SQL query, such as:
        'uid==1234'
        'cruise!=99 and month==10'
        etc
        a falsy value means no WHERE clause.
    tablename: sql table to extract from
    database: filename of database file

    return a dataframe with columns for every QC test plus specified metadata.
    also parses out truth if requested in the metadata list

    NOTE(review): meta, filter and tablename are concatenated into the SQL
    text unescaped, so they must come from trusted code.
    '''

    # a copy, so that tuples are accepted and the caller's list is untouched
    meta = list(meta)

    # get qc tests
    testNames = main.importQC('qctests')

    # joining everything at once avoids a dangling comma when one side is empty
    columns = meta + testNames
    query = 'SELECT ' + ','.join(columns) + ' FROM ' + tablename
    if filter:
        query += ' WHERE ' + filter

    # connect to database; closed again once the rows have been fetched
    conn = sqlite3.connect(database, isolation_level=None)
    try:
        cur = conn.cursor()
        cur.execute(query)
        rawresults = cur.fetchall()
    finally:
        conn.close()

    # extract matrix of test results into a dataframe
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = columns
    for t in testNames:
        df[[t]] = df[[t]].apply(dbutils.parse)

    # deal with truth data if present
    # column 'leveltruth' will persist per-level truth,
    # while 'truth' summarizes by or'ing all levels together
    if 'truth' in meta:
        def unpack_truth(results):
            return results.apply(dbutils.unpack_qc)

        truth = df[['truth']].apply(unpack_truth).values.tolist()
        df = df.assign(leveltruth=pandas.Series(truth))
        df[['truth']] = df[['truth']].apply(dbutils.parse_truth)

    return df
def dump_row(uid, table, database='iquod.db'):
    '''
    print all database keys and values for uid

    uid: value matched against the uid column
    table: sql table to read from
    database: filename of database file

    Each column of the first matching row is printed as 'name : value'.
    qc test columns are run through dbutils.parse and the truth column
    through dbutils.parse_truth before printing.
    '''

    # extract row and column names, then release the connection
    conn = sqlite3.connect(database, isolation_level=None)
    try:
        cur = conn.cursor()
        query = 'SELECT * FROM ' + table + ' WHERE uid=' + str(uid)
        cur.execute(query)
        rawresults = cur.fetchall()
        columns = [description[0] for description in cur.description]
    finally:
        conn.close()

    # parse row
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = columns
    testNames = [t.lower() for t in main.importQC('qctests')]
    for t in testNames:
        df[[t]] = df[[t]].apply(dbutils.parse)
    df[['truth']] = df[['truth']].apply(dbutils.parse_truth)

    # .ix was removed from pandas; the frame has a default index so the
    # first row is position 0
    row = df.iloc[0]
    for col in list(df):
        # a single pre-joined string prints identically on python 2 and 3
        print(' '.join([str(col), ':', str(row[col])]))
def plot_uid_pathology(uid, table, database='iquod.db'):
    '''
    Fetch the row for uid from table and hand it to plotPathology.

    uid: value matched against the uid column
    table: sql table to read from
    database: filename of database file

    qc test columns are parsed with dbutils.parse; per-level truth is kept
    in a 'leveltruth' column and 'truth' is replaced by dbutils.parse_truth
    of itself. plotPathology is called with the first row and '.'.
    '''

    # extract row and column names, then release the connection
    conn = sqlite3.connect(database, isolation_level=None)
    try:
        cur = conn.cursor()
        query = 'SELECT * FROM ' + table + ' WHERE uid=' + str(uid)
        cur.execute(query)
        rawresults = cur.fetchall()
        columns = [description[0] for description in cur.description]
    finally:
        conn.close()

    # parse row
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = columns
    testNames = [t.lower() for t in main.importQC('qctests')]
    for t in testNames:
        df[[t]] = df[[t]].apply(dbutils.parse)

    def unpack_truth(results):
        return results.apply(dbutils.unpack_qc)

    # per-level truth has to be captured before 'truth' is parsed in place
    truth = df[['truth']].apply(unpack_truth).values.tolist()
    df = df.assign(leveltruth=pandas.Series(truth))
    df[['truth']] = df[['truth']].apply(dbutils.parse_truth)

    # .ix was removed from pandas; with the default index row 0 is position 0
    plotPathology(df.iloc[0], '.')
query = "UPDATE " + sys.argv[ 1] + " SET " + test + "=? WHERE uid=" + str( profile.uid()) + ";" main.dbinteract(query, [main.pack_array(result)]) except: print 'db exception', sys.exc_info() ######################################## # main ######################################## if len(sys.argv) > 2: # Identify and import tests testNames = main.importQC('qctests') testNames.sort() print('{} quality control checks have been found'.format(len(testNames))) testNames = main.checkQCTestRequirements(testNames) print('{} quality control checks are able to be run:'.format( len(testNames))) for testName in testNames: print(' {}'.format(testName)) # set up a directory for logging logdir = "autoqc-logs-" + str(calendar.timegm(time.gmtime())) os.makedirs(logdir) # Parallel processing. print('\nPlease wait while QC is performed\n')
def builddb(infile, check_originator_flag_type=True, months_to_use=range(1, 13), outfile='iquod.db', dbtable='iquod'):
    '''
    Create table dbtable in sqlite database outfile (if absent) and fill it
    from the wod-ascii file infile.

    infile: path of the wod-ascii file to read.
    check_originator_flag_type, months_to_use: passed unchanged to
        assessProfile, which decides whether a profile is kept.
    outfile: filename of the sqlite database to write.
    dbtable: name of the table to create and populate.

    The table has fixed metadata columns plus one BLOB column per qc test.
    Profiles whose uid was already seen, and profiles rejected by
    assessProfile, are skipped. The numbers of clean and flagged profiles
    written are printed at the end.
    '''

    conn = sqlite3.connect(outfile, isolation_level=None)
    try:
        cur = conn.cursor()

        # Identify tests
        testNames = main.importQC('qctests')
        testNames.sort()

        # set up our table
        query = "CREATE TABLE IF NOT EXISTS " + dbtable + """(
                    raw text,
                    truth BLOB,
                    uid integer PRIMARY KEY,
                    year integer,
                    month integer,
                    day integer,
                    time real,
                    lat real,
                    long real,
                    country text,
                    cruise integer,
                    ocruise text,
                    probe integer,
                    training integer,
                    flagged integer,
                    """
        query += ','.join(name.lower() + ' BLOB' for name in testNames) + ');'
        cur.execute(query)

        # populate table from wod-ascii data
        uids = set()  # set rather than list: membership is tested per profile
        good = 0
        bad = 0
        with open(infile) as fid:
            while True:
                # extract profile as wodpy object and raw text
                start = fid.tell()
                profile = wod.WodProfile(fid)
                end = fid.tell()
                fid.seek(start)
                raw = fid.read(end - start)
                fid.seek(end)

                # set up dictionary for populating query string
                p = profile.npdict()
                p['raw'] = "'" + raw + "'"

                # check for duplicate profiles in raw data
                if p['uid'] in uids:
                    if profile.is_last_profile_in_file(fid):
                        break
                    continue
                uids.add(p['uid'])

                # skip pathological profiles
                isgood = assessProfile(profile, check_originator_flag_type, months_to_use)
                if not isgood:
                    if profile.is_last_profile_in_file(fid):
                        break
                    continue

                # encode temperature error codes into truth array
                truth = encodeTruth(profile)
                p['truth'] = main.pack_array(truth)

                # extract country code
                country = profile.primary_header['Country code']

                # originator cruise
                orig_cruise = profile.originator_cruise()

                # keep tabs on how many good and how many bad profiles have been added to db
                # nowire == index of first wire break level
                wireqc = qctests.CSIRO_wire_break.test(profile, {})
                try:
                    nowire = list(wireqc).index(True)
                except ValueError:
                    # no level is marked True: every level counts
                    nowire = len(truth)
                # flag only counts if its before the wire break:
                flagged = dbutils.summarize_truth(truth[0:nowire])
                if flagged:
                    bad += 1
                else:
                    good += 1

                query = "INSERT INTO " + dbtable + " (raw, truth, uid, year, month, day, time, lat, long, country, cruise, ocruise, probe, flagged) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?);"
                values = (p['raw'], p['truth'], p['uid'], p['year'], p['month'], p['day'], p['time'], p['latitude'], p['longitude'], country, p['cruise'], orig_cruise, p['probe_type'], int(flagged))
                main.dbinteract(query, values, targetdb=outfile)

                if profile.is_last_profile_in_file(fid):
                    break

        conn.commit()
    finally:
        conn.close()

    print('number of clean profiles written:', good)
    print('number of flagged profiles written:', bad)
    print('total number of profiles written:', good + bad)
def db_to_df(table,
             filter_on_wire_break_test=False,
             filter_on_tests={},
             n_to_extract=numpy.iinfo(numpy.int32).max,
             applyparse=True,
             targetdb='iquod.db'):
    '''
    Reads the table from targetdb into a pandas dataframe with columns
    'uid', 'Truth' and one column per remaining qc test.

    If filter_on_wire_break_test is True, the results from CSIRO_wire_break
    are used to truncate every other column's levels at the wire break, and
    the wire break column is not returned.

    filter_on_tests is a generalised form of filter_on_wire_break_test. It is
    a dict mapping an action name to a list of test names; each named test
    column is removed after use. Actions:
      'Optional', 'At least one from group': ignored here.
      'Remove above reject': keeps qc[nlevels:], nlevels coming from
          get_reversed_n_levels_before_fail; rows with nlevels == 0 are dropped.
      'Remove below reject': keeps qc[:nlevels], nlevels coming from
          get_n_levels_before_fail; rows with nlevels == 0 are dropped.
      'Remove profile': rows for which check_for_fail is True are dropped.
      'Remove rejected levels': keeps the levels where the unpacked result
          is False; rows with no such level are dropped.
    Any other action raises NameError.

    Set n_to_extract to limit the number of rows extracted to the specified
    number (rows are picked with ORDER BY RANDOM()).

    If applyparse is True, 'Truth' is passed through parse_truth and the test
    columns through parse.

    Rows are processed in chunks of 1000. If the query returns no rows, an
    empty dataframe carrying the unfiltered column names is returned.
    '''

    # what tests are available; this list names the raw columns of EVERY
    # chunk, so it must not be reassigned while chunks are being filtered
    allTestNames = main.importQC('qctests')
    allTestNames.sort()

    # connect to database
    conn = sqlite3.connect(targetdb, isolation_level=None)
    try:
        cur = conn.cursor()

        # extract matrix of test results and true flags
        query = 'SELECT uid, truth'
        for test in allTestNames:
            query += ', ' + test.lower()
        query += ' FROM ' + table
        query += ' WHERE uid IN (SELECT uid FROM ' + table + ' ORDER BY RANDOM() LIMIT ' + str(n_to_extract) + ')'
        cur.execute(query)
        rawresults = cur.fetchall()
    finally:
        conn.close()

    sub = 1000
    frames = []
    for start in range(0, len(rawresults), sub):
        df = pandas.DataFrame(rawresults[start:start + sub]).astype('bytes')
        df.columns = ['uid', 'Truth'] + allTestNames
        df = df.astype({'uid': 'int'})

        if filter_on_wire_break_test:
            nlevels = get_n_levels_before_fail(df['CSIRO_wire_break'])
            del df['CSIRO_wire_break']  # No use for this now.
            for irow in range(len(df.index)):
                for jcol in range(1, len(df.columns)):
                    qc = unpack_qc(df.iloc[irow, jcol])
                    # Some QC tests may return only one value so check for this.
                    if len(qc) > 1:
                        qc = qc[:nlevels[irow]]
                    df.iat[irow, jcol] = main.pack_array(qc)

        todrop = set()
        for action in filter_on_tests:
            # Check if the action is relevant.
            if action == 'Optional' or action == 'At least one from group':
                continue

            # Initialise variables.
            nlevels = -1
            outcomes = False
            qcresults = []
            for testname in filter_on_tests[action]:
                for irow in range(0, len(df.index)):
                    if action == 'Remove above reject':
                        nlevels = get_reversed_n_levels_before_fail([df[testname][irow]])[0]
                    elif action == 'Remove below reject':
                        nlevels = get_n_levels_before_fail([df[testname][irow]])[0]
                    elif action == 'Remove profile':
                        outcomes = check_for_fail([df[testname][irow]])[0]
                    elif action == 'Remove rejected levels':
                        qcresults = unpack_qc_results([df[testname][irow]])[0]
                    else:
                        raise NameError('Unrecognised action: ' + action)

                    if (((action == 'Remove above reject' or action == 'Remove below reject') and nlevels == 0) or
                            (action == 'Remove profile' and outcomes == True) or
                            (action == 'Remove rejected levels' and numpy.count_nonzero(qcresults == False) == 0)):
                        # Completely remove a profile if it has no valid levels or if it
                        # has a fail and the action is to remove.
                        todrop.add(irow)
                    elif (action != 'Remove profile'):
                        for jcol in range(1, len(df.columns)):
                            # Retain only the levels that passed testname.
                            # Some QC tests may return only one value so check for this.
                            qc = unpack_qc(df.iloc[irow, jcol])
                            if len(qc) > 1:
                                if action == 'Remove above reject':
                                    qc = qc[nlevels:]
                                elif action == 'Remove below reject':
                                    qc = qc[:nlevels]
                                elif action == 'Remove rejected levels':
                                    qc = qc[qcresults == False]
                            df.iat[irow, jcol] = main.pack_array(qc)

                del df[testname]  # No need to keep this any longer.
                df.reset_index(inplace=True, drop=True)

        if todrop:
            df.drop(list(todrop), inplace=True)
        df.reset_index(inplace=True, drop=True)

        # test columns that survived the filters, for this chunk only
        chunkTests = df.columns[2:].values.tolist()
        if applyparse:
            df[['Truth']] = df[['Truth']].apply(parse_truth)
            df[chunkTests] = df[chunkTests].apply(parse)

        # collecting chunks in a list keeps the result independent of any
        # loop counter
        frames.append(df)

    if not frames:
        return pandas.DataFrame(columns=['uid', 'Truth'] + allTestNames)
    return pandas.concat(frames).reset_index(drop=True)
try: query = "UPDATE " + sys.argv[1] + " SET " + test + "=? WHERE uid=" + str(profile.uid()) + ";" main.dbinteract(query, [main.pack_array(result)]) except: print 'db exception', sys.exc_info() ######################################## # main ######################################## if len(sys.argv)>2: # Identify and import tests testNames = main.importQC('qctests') testNames.sort() print('{} quality control checks have been found'.format(len(testNames))) testNames = main.checkQCTestRequirements(testNames) print('{} quality control checks are able to be run:'.format(len(testNames))) for testName in testNames: print(' {}'.format(testName)) # set up a directory for logging logdir = "autoqc-logs-" + str(calendar.timegm(time.gmtime())) os.makedirs(logdir) # Parallel processing. print('\nPlease wait while QC is performed\n') # set up global parmaeter store
# Adds one BLOB column per qc test to the 'quota' table of iquod.db.
# The first command line argument is passed to main.importQC to obtain
# the test names.
import util.post as post
import util.main as main
import sys, sqlite3

tests = main.importQC(sys.argv[1])
tablename = 'quota'


def add_test(testname, tablename, database='iquod.db'):
    '''
    Add a BLOB column called testname to table tablename.

    testname: name of the new column
    tablename: sql table to alter
    database: filename of database file

    sqlite raises if the column cannot be added. testname and tablename are
    placed in the statement unescaped, so they must be trusted identifiers.
    '''

    conn = sqlite3.connect(database, isolation_level=None)
    try:
        cur = conn.cursor()
        query = 'ALTER TABLE ' + tablename + ' ADD ' + testname + ' BLOB;'
        cur.execute(query)
    finally:
        # one connection is opened per call, so release it every time
        conn.close()


for test in tests:
    add_test(test.lower(), tablename)
def db_to_df(table,
             filter_on_wire_break_test=False,
             filter_on_tests={},
             n_to_extract=numpy.iinfo(numpy.int32).max,
             targetdb='iquod.db'):
    '''
    Reads the table from targetdb (iquod.db unless told otherwise) into a
    pandas dataframe with columns 'uid', 'Truth' and one column per
    remaining qc test.

    If filter_on_wire_break_test is True, the results from CSIRO_wire_break
    are used to truncate every other column's levels at the wire break, and
    the wire break column is not returned.

    filter_on_tests is a generalised form of filter_on_wire_break_test. It is
    a dict mapping an action name to a list of test names; each named test
    column is removed after use. Actions:
      'Optional', 'At least one from group': ignored here.
      'Remove above reject': keeps qc[nlevels:], nlevels coming from
          get_reversed_n_levels_before_fail; rows with nlevels == 0 are dropped.
      'Remove below reject': keeps qc[:nlevels], nlevels coming from
          get_n_levels_before_fail; rows with nlevels == 0 are dropped.
      'Remove profile': rows for which check_for_fail is True are dropped.
      'Remove rejected levels': keeps the levels where the unpacked result
          is False; rows with no such level are dropped.
    Any other action raises NameError.

    Set n_to_extract to limit the number of rows extracted to the specified
    number (rows are picked with ORDER BY RANDOM()).

    'Truth' is passed through parse_truth and the test columns through parse
    before the dataframe is returned.
    '''

    # what tests are available
    testNames = main.importQC('qctests')
    testNames.sort()

    # connect to database; closed again once the rows have been fetched
    conn = sqlite3.connect(targetdb, isolation_level=None)
    try:
        cur = conn.cursor()

        # extract matrix of test results and true flags into a dataframe
        query = 'SELECT uid, truth'
        for test in testNames:
            query += ', ' + test.lower()
        query += ' FROM ' + table
        query += ' WHERE uid IN (SELECT uid FROM ' + table + ' ORDER BY RANDOM() LIMIT ' + str(n_to_extract) + ')'
        cur.execute(query)
        rawresults = cur.fetchall()
    finally:
        conn.close()

    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = ['uid', 'Truth'] + testNames

    if filter_on_wire_break_test:
        nlevels = get_n_levels_before_fail(df['CSIRO_wire_break'])
        del df['CSIRO_wire_break']  # No use for this now.
        testNames = df.columns[2:].values.tolist()
        for i in range(len(df.index)):
            for j in range(1, len(df.columns)):
                qc = unpack_qc(df.iloc[i, j])
                # Some QC tests may return only one value so check for this.
                if len(qc) > 1:
                    qc = qc[:nlevels[i]]
                df.iat[i, j] = main.pack_array(qc)

    todrop = set()
    for action in filter_on_tests:
        # Check if the action is relevant.
        if action == 'Optional' or action == 'At least one from group':
            continue

        # Initialise variables.
        nlevels = -1
        outcomes = False
        qcresults = []
        for testname in filter_on_tests[action]:
            for i in range(0, len(df.index)):
                if action == 'Remove above reject':
                    nlevels = get_reversed_n_levels_before_fail([df[testname][i]])[0]
                elif action == 'Remove below reject':
                    nlevels = get_n_levels_before_fail([df[testname][i]])[0]
                elif action == 'Remove profile':
                    outcomes = check_for_fail([df[testname][i]])[0]
                elif action == 'Remove rejected levels':
                    qcresults = unpack_qc_results([df[testname][i]])[0]
                else:
                    raise NameError('Unrecognised action: ' + action)

                if (((action == 'Remove above reject' or action == 'Remove below reject') and nlevels == 0) or
                        (action == 'Remove profile' and outcomes == True) or
                        (action == 'Remove rejected levels' and numpy.count_nonzero(qcresults == False) == 0)):
                    # Completely remove a profile if it has no valid levels or if it
                    # has a fail and the action is to remove.
                    todrop.add(i)
                elif (action != 'Remove profile'):
                    for j in range(1, len(df.columns)):
                        # Retain only the levels that passed testname.
                        # Some QC tests may return only one value so check for this.
                        qc = unpack_qc(df.iloc[i, j])
                        if len(qc) > 1:
                            if action == 'Remove above reject':
                                qc = qc[nlevels:]
                            elif action == 'Remove below reject':
                                qc = qc[:nlevels]
                            elif action == 'Remove rejected levels':
                                qc = qc[qcresults == False]
                        df.iat[i, j] = main.pack_array(qc)

            del df[testname]  # No need to keep this any longer.

    todrop = list(todrop)
    if len(todrop) > 0:
        df.drop(todrop, inplace=True)
    df.reset_index(inplace=True, drop=True)

    # the filters may have removed columns, so re-read the surviving tests
    testNames = df.columns[2:].values.tolist()
    df[['Truth']] = df[['Truth']].apply(parse_truth)
    df[testNames] = df[testNames].apply(parse)

    return df