Ejemplo n.º 1
0
def plot_roc(targetdb='iquod.db'):
    '''
    Assess every row with training=0 using the ROC assessor and apply plotRow
    to each of them.

    targetdb: filename of the sqlite database to read from.

    The table name is read from sys.argv[1]. unpack_truth, generateROC,
    plotRow and figdir are module-level names defined outside this function.
    figdir and its FP, FN, TP and TN subdirectories are created when missing.
    '''

    # get qc tests
    testNames = main.importQC('qctests')

    # extract matrix of test results and true flags into a dataframe;
    # the connection is released as soon as the rows are in memory,
    # even when the query fails
    query = 'SELECT truth, raw, ' + ','.join(testNames) + ' FROM ' + sys.argv[1] + ' WHERE training=0;'
    conn = sqlite3.connect(targetdb, isolation_level=None)
    try:
        cur = conn.cursor()
        cur.execute(query)
        rawresults = cur.fetchall()
    finally:
        conn.close()
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = ['Truth', 'raw'] + testNames

    # unpack truth and qc data
    # 'leveltruth' keeps the unpacked truth, 'Truth' is replaced by its parsed form
    truth = df[['Truth']].apply(unpack_truth).values.tolist()
    df = df.assign(leveltruth=pandas.Series(truth))
    df[['Truth']] = df[['Truth']].apply(dbutils.parse_truth)
    for t in testNames:
        df[[t]] = df[[t]].apply(dbutils.parse)

    # prepare ROC function
    assessROC = generateROC()
    df['roc'] = df.apply(assessROC, axis=1)

    # set up dirs for figures; os.makedirs raises when the directory is
    # already there, so only create what is missing (lets the script be re-run)
    for subdir in ('', '/FP', '/FN', '/TP', '/TN'):
        path = figdir + subdir
        if not os.path.isdir(path):
            os.makedirs(path)

    df.apply(plotRow, axis=1)
Ejemplo n.º 2
0
    def setUp(self):
        '''
        Read the profile list described by datafiles.json and load every
        profile into self.profiles as a wod.WodProfile.
        '''
        filenames = main.readInput('datafiles.json')
        profiles = main.extractProfiles(filenames)

        # identify and import tests
        testNames = main.importQC('qctests')
        testNames.sort()
        for testName in testNames:
            # NOTE(review): on Python 3 a name bound by exec() inside a
            # function is not available as a local afterwards - confirm these
            # imports are still needed here.
            exec('from qctests import ' + testName)

        # Set up any keyword arguments needed by tests.
        kwargs = {'profiles': profiles}

        # NOTE(review): kwargs (above) and the six names testResults ... delete
        # are not read again in the visible part of this method - presumably
        # used further down or left over; confirm before removing.
        testResults = []
        testVerbose = []
        trueResults = []
        trueVerbose = []
        firstProfile = True
        delete = []
        currentFile = ''
        self.profiles = []
        for iprofile, pinfo in enumerate(profiles):
            # Load the profile data.
            # The open handle is reused while consecutive profiles come from
            # the same file; it is swapped only when the file name changes.
            if pinfo.file_name != currentFile:
                if currentFile != '': f.close()
                currentFile = pinfo.file_name
                f = open(currentFile)
            # Move to the recorded position of this profile before parsing it.
            if f.tell() != pinfo.file_position: f.seek(pinfo.file_position)
            self.profiles.append(wod.WodProfile(f))
            # NOTE(review): the handle of the last file is not closed in the
            # visible code - confirm whether it is closed further down.
Ejemplo n.º 3
0
   def setUp(self):
        '''
        Read the profile list described by datafiles.json and load every
        profile into self.profiles as a wod.WodProfile.
        '''
        filenames = main.readInput('datafiles.json')
        profiles = main.extractProfiles(filenames)

        # identify and import tests
        testNames = main.importQC('qctests')
        testNames.sort()
        for testName in testNames:
          # NOTE(review): on Python 3 a name bound by exec() inside a
          # function is not available as a local afterwards - confirm these
          # imports are still needed here.
          exec('from qctests import ' + testName)

        # Set up any keyword arguments needed by tests.
        kwargs = {'profiles' : profiles}

        # NOTE(review): kwargs (above) and the six names testResults ... delete
        # are not read again in the visible part of this method - presumably
        # used further down or left over; confirm before removing.
        testResults  = []
        testVerbose  = []
        trueResults  = []
        trueVerbose  = []
        firstProfile = True
        delete       = []
        currentFile  = ''
        self.profiles = []
        for iprofile, pinfo in enumerate(profiles):
          # Load the profile data.
          # The open handle is reused while consecutive profiles come from
          # the same file; it is swapped only when the file name changes.
          if pinfo.file_name != currentFile:
            if currentFile != '': f.close()
            currentFile = pinfo.file_name
            f = open(currentFile)
          # Move to the recorded position of this profile before parsing it.
          if f.tell() != pinfo.file_position: f.seek(pinfo.file_position)
          self.profiles.append(wod.WodProfile(f))
          # NOTE(review): the handle of the last file is not closed in the
          # visible code - confirm whether it is closed further down.
Ejemplo n.º 4
0
def plot_roc(targetdb='iquod.db'):
    '''
    Assess every row with training=0 using the ROC assessor and apply plotRow
    to each of them.

    targetdb: filename of the sqlite database to read from; defaults to the
        previously hard-coded 'iquod.db'.

    The table name is read from sys.argv[1]. unpack_truth, generateROC,
    plotRow and figdir are module-level names defined outside this function.
    figdir and its FP, FN, TP and TN subdirectories are created when missing.
    '''

    # get qc tests
    testNames = main.importQC('qctests')

    # extract matrix of test results and true flags into a dataframe;
    # the connection is released as soon as the rows are in memory,
    # even when the query fails
    query = 'SELECT truth, raw, ' + ','.join(testNames) + ' FROM ' + sys.argv[1] + ' WHERE training=0;'
    conn = sqlite3.connect(targetdb, isolation_level=None)
    try:
        cur = conn.cursor()
        cur.execute(query)
        rawresults = cur.fetchall()
    finally:
        conn.close()
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = ['Truth', 'raw'] + testNames

    # unpack truth and qc data
    # 'leveltruth' keeps the unpacked truth, 'Truth' is replaced by its parsed form
    truth = df[['Truth']].apply(unpack_truth).values.tolist()
    df = df.assign(leveltruth=pandas.Series(truth))
    df[['Truth']] = df[['Truth']].apply(dbutils.parse_truth)
    for t in testNames:
        df[[t]] = df[[t]].apply(dbutils.parse)

    # prepare ROC function
    assessROC = generateROC()
    df['roc'] = df.apply(assessROC, axis=1)

    # set up dirs for figures; os.makedirs raises when the directory is
    # already there, so only create what is missing (lets the script be re-run)
    for subdir in ('', '/FP', '/FN', '/TP', '/TN'):
        path = figdir + subdir
        if not os.path.isdir(path):
            os.makedirs(path)

    df.apply(plotRow, axis=1)
Ejemplo n.º 5
0
    def importQC_test(self):
        '''
        main.importQC must hand back a list, and every name in that list must
        have a matching .py file under qctests/
        '''

        found = main.importQC("qctests")
        assert isinstance(found, list), 'importQC did not return a list'

        for name in found:
            module_file = 'qctests/' + name + '.py'
            assert os.path.isfile(module_file), 'test ' + name + ' is not found.'
Ejemplo n.º 6
0
    def importQC_test(self):
        '''
        check that main.importQC gives a list whose entries all correspond to
        an existing qctests/<name>.py file
        '''

        qc_names = main.importQC("qctests")
        assert isinstance(qc_names, list), 'importQC did not return a list'

        for qc_name in qc_names:
            exists = os.path.isfile('qctests/' + qc_name + '.py')
            assert exists, 'test ' + qc_name + ' is not found.'
Ejemplo n.º 7
0
def query2df(meta, filter, tablename, database='iquod.db'):
    '''
    meta: sequence of strings of metadata to extract (list, tuple, ...)
    filter: string describing WHERE filter for SQL query, such as:
        'uid==1234'
        'cruise!=99 and month==10' etc
        an empty string or None selects every row
    tablename: sql table to extract from
    database: filename of database file

    return a dataframe with columns for every QC test plus specified metadata.
    also parses out truth if requested in the metadata list

    meta, filter and tablename are pasted into the SQL text unescaped, so they
    must come from trusted code.
    '''

    # work on a list copy so that tuples and other sequences are accepted
    meta = list(meta)

    # get qc tests
    testNames = main.importQC('qctests')
    columns = meta + testNames

    # joining the complete column list avoids a dangling comma when either
    # part is empty
    query = 'SELECT ' + ','.join(columns) + ' FROM ' + tablename
    if filter:
        query += ' WHERE ' + filter

    # extract matrix of test results into a dataframe; the connection is
    # released as soon as the rows are in memory, even when the query fails
    conn = sqlite3.connect(database, isolation_level=None)
    try:
        cur = conn.cursor()
        cur.execute(query)
        rawresults = cur.fetchall()
    finally:
        conn.close()
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = columns
    for t in testNames:
        df[[t]] = df[[t]].apply(dbutils.parse)

    # deal with truth data if present
    # column 'leveltruth' will persist per-level truth,
    # while 'truth' summarizes by or'ing all levels together
    if 'truth' in meta:

        def unpack_truth(results):
            return results.apply(dbutils.unpack_qc)

        truth = df[['truth']].apply(unpack_truth).values.tolist()
        df = df.assign(leveltruth=pandas.Series(truth))
        df[['truth']] = df[['truth']].apply(dbutils.parse_truth)

    return df
Ejemplo n.º 8
0
def dump_row(uid, table, database='iquod.db'):
    '''
    print all database keys and values for uid

    uid: value matched against the uid column
    table: sql table to read from
    database: filename of database file

    Each column is printed on its own line as 'name : value'.
    Raises ValueError when no row matches uid.
    '''

    # extract row; column names are read before the connection is closed
    query = 'SELECT * FROM ' + table + ' WHERE uid=' + str(uid)
    conn = sqlite3.connect(database, isolation_level=None)
    try:
        cur = conn.cursor()
        cur.execute(query)
        rawresults = cur.fetchall()
        colnames = [description[0] for description in cur.description]
    finally:
        conn.close()
    if not rawresults:
        # previously this surfaced as a pandas length-mismatch ValueError
        raise ValueError('no row with uid ' + str(uid) + ' in table ' + table)

    # parse row
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = colnames
    testNames = [t.lower() for t in main.importQC('qctests')]
    for t in testNames:
        df[[t]] = df[[t]].apply(dbutils.parse)
    df[['truth']] = df[['truth']].apply(dbutils.parse_truth)

    # .ix no longer exists in current pandas; the frame has a default integer
    # index so the first row is position 0
    row = df.iloc[0]
    for col in list(df):
        # single-argument print gives the same output on Python 2 and 3
        print('%s : %s' % (col, row[col]))
Ejemplo n.º 9
0
def plot_uid_pathology(uid, table, database='iquod.db'):
    '''
    Load the row for uid, parse its QC and truth columns and hand it to
    plotPathology together with '.' as second argument.

    uid: value matched against the uid column
    table: sql table to read from
    database: filename of database file

    Raises ValueError when no row matches uid.
    '''

    # extract row; column names are read before the connection is closed
    query = 'SELECT * FROM ' + table + ' WHERE uid=' + str(uid)
    conn = sqlite3.connect(database, isolation_level=None)
    try:
        cur = conn.cursor()
        cur.execute(query)
        rawresults = cur.fetchall()
        colnames = [description[0] for description in cur.description]
    finally:
        conn.close()
    if not rawresults:
        # previously this surfaced as a pandas length-mismatch ValueError
        raise ValueError('no row with uid ' + str(uid) + ' in table ' + table)

    # parse row
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = colnames
    testNames = [t.lower() for t in main.importQC('qctests')]
    for t in testNames:
        df[[t]] = df[[t]].apply(dbutils.parse)

    def unpack_truth(results):
        return results.apply(dbutils.unpack_qc)

    # 'leveltruth' keeps the unpacked truth, 'truth' is replaced by its parsed form
    truth = df[['truth']].apply(unpack_truth).values.tolist()
    df = df.assign(leveltruth=pandas.Series(truth))
    df[['truth']] = df[['truth']].apply(dbutils.parse_truth)

    # .ix no longer exists in current pandas; the frame has a default integer
    # index so the first row is position 0
    plotPathology(df.iloc[0], '.')
Ejemplo n.º 10
0
            query = "UPDATE " + sys.argv[
                1] + " SET " + test + "=? WHERE uid=" + str(
                    profile.uid()) + ";"
            main.dbinteract(query, [main.pack_array(result)])
        except:
            print 'db exception', sys.exc_info()


########################################
# main
########################################

# Only run when at least two command-line arguments follow the script name.
if len(sys.argv) > 2:

    # Identify and import tests
    testNames = main.importQC('qctests')
    testNames.sort()
    print('{} quality control checks have been found'.format(len(testNames)))
    # From here on testNames is whatever checkQCTestRequirements returns
    # (presumably the subset whose requirements are met - confirm in main).
    testNames = main.checkQCTestRequirements(testNames)
    print('{} quality control checks are able to be run:'.format(
        len(testNames)))
    for testName in testNames:
        print('  {}'.format(testName))

    # set up a directory for logging
    # the directory name ends in the current time as whole seconds since the epoch
    logdir = "autoqc-logs-" + str(calendar.timegm(time.gmtime()))
    os.makedirs(logdir)

    # Parallel processing.
    print('\nPlease wait while QC is performed\n')
Ejemplo n.º 11
0
def builddb(infile,
            check_originator_flag_type=True,
            months_to_use=range(1, 13),
            outfile='iquod.db',
            dbtable='iquod'):
    '''
    Create table dbtable in the sqlite file outfile (if it does not exist yet)
    and fill it with the profiles read from the wod-ascii file infile.

    infile: filename of the wod-ascii data to read
    check_originator_flag_type, months_to_use: passed through unchanged to
        assessProfile, which decides whether a profile is kept
    outfile: filename of the sqlite database to write
    dbtable: name of the table to create and populate

    The table gets a fixed set of metadata columns plus one BLOB column per
    QC test found by main.importQC. A uid is stored at most once; later
    profiles with an already seen uid are skipped. Rows are written through
    main.dbinteract. The number of clean, flagged and total profiles written
    is printed at the end.
    '''

    # Identify tests
    testNames = main.importQC('qctests')
    testNames.sort()

    # set up our table
    query = "CREATE TABLE IF NOT EXISTS " + dbtable + """(
                raw text,
                truth BLOB,
                uid integer PRIMARY KEY,
                year integer,
                month integer,
                day integer,
                time real,
                lat real,
                long real,
                country text,
                cruise integer,
                ocruise text,
                probe integer,
                training integer,
                flagged integer,
                """
    query += ','.join(name.lower() + ' BLOB' for name in testNames) + ');'

    # isolation_level=None puts the connection in autocommit mode, so the
    # table exists as soon as execute() returns and the connection can be
    # released before the inserts, which go through main.dbinteract
    conn = sqlite3.connect(outfile, isolation_level=None)
    try:
        conn.cursor().execute(query)
    finally:
        conn.close()

    insert = "INSERT INTO " + dbtable + " (raw, truth, uid, year, month, day, time, lat, long, country, cruise, ocruise, probe, flagged) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?);"

    # populate table from wod-ascii data
    seen_uids = set()  # set membership keeps the duplicate check cheap
    good = 0
    bad = 0

    with open(infile) as fid:
        is_last = False
        while not is_last:
            # extract profile as wodpy object and raw text
            start = fid.tell()
            profile = wod.WodProfile(fid)
            end = fid.tell()
            fid.seek(start)
            raw = fid.read(end - start)
            fid.seek(end)
            # decided once per profile; every 'continue' below re-tests the
            # loop condition, so the loop ends after the last profile whether
            # or not that profile is written
            is_last = profile.is_last_profile_in_file(fid)

            # set up dictionary for populating query string
            p = profile.npdict()
            p['raw'] = "'" + raw + "'"

            # check for duplicate profiles in raw data
            if p['uid'] in seen_uids:
                continue
            seen_uids.add(p['uid'])

            # skip pathological profiles
            if not assessProfile(profile, check_originator_flag_type,
                                 months_to_use):
                continue

            # encode temperature error codes into truth array
            truth = encodeTruth(profile)
            p['truth'] = main.pack_array(truth)

            # extract country code
            country = profile.primary_header['Country code']

            # originator cruise
            orig_cruise = profile.originator_cruise()

            # keep tabs on how many good and how many bad profiles have been added to db
            # nowire == index of first wire break level
            wireqc = qctests.CSIRO_wire_break.test(profile, {})
            try:
                nowire = list(wireqc).index(True)
            except ValueError:
                # no level was flagged by the wire break test
                nowire = len(truth)
            # flag only counts if its before the wire break:
            flagged = dbutils.summarize_truth(truth[0:nowire])
            if flagged:
                bad += 1
            else:
                good += 1

            values = (p['raw'], p['truth'], p['uid'], p['year'], p['month'],
                      p['day'], p['time'], p['latitude'], p['longitude'],
                      country, p['cruise'], orig_cruise, p['probe_type'],
                      int(flagged))
            main.dbinteract(insert, values, targetdb=outfile)

    print('number of clean profiles written:', good)
    print('number of flagged profiles written:', bad)
    print('total number of profiles written:', good + bad)
Ejemplo n.º 12
0
def db_to_df(table,
             filter_on_wire_break_test=False,
             filter_on_tests={},
             n_to_extract=numpy.iinfo(numpy.int32).max,
             applyparse=True,
             targetdb='iquod.db'):
    '''
    Reads the table from targetdb into a pandas dataframe.
    If filter_on_wire_break_test is True, the results from that test are used to exclude
         levels below a wire break from the test results and the wire break test is not returned.
    filter_on_tests is a generalised form of filter_on_wire_break_test; it maps an action name to the
         list of test names the action is applied to. Recognised actions are 'Remove above reject',
         'Remove below reject', 'Remove rejected levels' and 'Remove profile'; 'Optional' and
         'At least one from group' are ignored; any other action raises NameError. The column of every
         test used for filtering is removed from the result. The dictionary is only read, never modified.
    Set n_to_extract to limit the number of rows extracted to the specified number
         (rows are picked with ORDER BY RANDOM()).
    If applyparse is True, the Truth column is passed through parse_truth and the test columns
         through parse; otherwise the packed values are returned.

    Returns a dataframe with columns uid, Truth and one column per remaining test, indexed from 0.
    An empty dataframe with the unfiltered column set is returned when the query yields no rows.
    '''

    # what tests are available
    allTests = main.importQC('qctests')
    allTests.sort()
    allColumns = ['uid', 'Truth'] + allTests

    # extract matrix of test results and true flags
    query = 'SELECT uid, truth'
    for test in allTests:
        query += ', ' + test.lower()
    query += ' FROM ' + table
    query += ' WHERE uid IN (SELECT uid FROM ' + table + ' ORDER BY RANDOM() LIMIT ' + str(
        n_to_extract) + ')'

    # the connection is released as soon as the rows are in memory,
    # even when the query fails
    conn = sqlite3.connect(targetdb, isolation_level=None)
    try:
        cur = conn.cursor()
        cur.execute(query)
        rawresults = cur.fetchall()
    finally:
        conn.close()

    # The rows are processed in slices of 'sub' rows. Every slice starts from
    # the full column list: columns removed while filtering one slice must
    # not shorten the list used to label the next one.
    sub = 1000
    chunks = []
    for start in range(0, len(rawresults), sub):
        df = pandas.DataFrame(rawresults[start:start + sub]).astype('bytes')
        df.columns = list(allColumns)
        df = df.astype({'uid': 'int'})
        if filter_on_wire_break_test:
            nlevels = get_n_levels_before_fail(df['CSIRO_wire_break'])
            del df['CSIRO_wire_break']  # No use for this now.
            for irow in range(len(df.index)):
                for jcol in range(1, len(df.columns)):
                    qc = unpack_qc(df.iloc[irow, jcol])
                    # Some QC tests may return only one value so check for this.
                    if len(qc) > 1:
                        qc = qc[:nlevels[irow]]
                    df.iat[irow, jcol] = main.pack_array(qc)

        todrop = set()
        for action in filter_on_tests:
            # Check if the action is relevant.
            if action == 'Optional' or action == 'At least one from group':
                continue

            # Initialise variables.
            nlevels = -1
            outcomes = False
            qcresults = []
            for testname in filter_on_tests[action]:
                for irow in range(0, len(df.index)):
                    if action == 'Remove above reject':
                        nlevels = get_reversed_n_levels_before_fail(
                            [df[testname][irow]])[0]
                    elif action == 'Remove below reject':
                        nlevels = get_n_levels_before_fail(
                            [df[testname][irow]])[0]
                    elif action == 'Remove profile':
                        outcomes = check_for_fail([df[testname][irow]])[0]
                    elif action == 'Remove rejected levels':
                        qcresults = unpack_qc_results([df[testname][irow]])[0]
                    else:
                        raise NameError('Unrecognised action: ' + action)

                    # 'qcresults == False' is an element-wise comparison and
                    # must not be rewritten as 'not qcresults'.
                    if (((action == 'Remove above reject'
                          or action == 'Remove below reject') and nlevels == 0)
                            or
                        (action == 'Remove profile' and outcomes == True) or
                        (action == 'Remove rejected levels'
                         and numpy.count_nonzero(qcresults == False) == 0)):
                        # Completely remove a profile if it has no valid levels or if it
                        # has a fail and the action is to remove.
                        todrop.add(irow)
                    elif (action != 'Remove profile'):
                        for jcol in range(1, len(df.columns)):
                            # Retain only the levels that passed testname.
                            # Some QC tests may return only one value so check for this.
                            qc = unpack_qc(df.iloc[irow, jcol])
                            if len(qc) > 1:
                                if action == 'Remove above reject':
                                    qc = qc[nlevels:]
                                elif action == 'Remove below reject':
                                    qc = qc[:nlevels]
                                elif action == 'Remove rejected levels':
                                    qc = qc[qcresults == False]
                                df.iat[irow, jcol] = main.pack_array(qc)

                del df[testname]  # No need to keep this any longer.
                df.reset_index(inplace=True, drop=True)

        todrop = list(todrop)
        if len(todrop) > 0:
            df.drop(todrop, inplace=True)
        df.reset_index(inplace=True, drop=True)
        # tests still present in this slice after filtering
        remainingTests = df.columns[2:].values.tolist()
        if applyparse:
            df[['Truth']] = df[['Truth']].apply(parse_truth)
            df[remainingTests] = df[remainingTests].apply(parse)

        chunks.append(df)

    if not chunks:
        # nothing was selected
        return pandas.DataFrame(columns=allColumns)

    return pandas.concat(chunks).reset_index(drop=True)
Ejemplo n.º 13
0
    try:
      query = "UPDATE " + sys.argv[1] + " SET " + test + "=? WHERE uid=" + str(profile.uid()) + ";"
      main.dbinteract(query, [main.pack_array(result)])
    except:
      print 'db exception', sys.exc_info()


########################################
# main
########################################

# Only run when at least two command-line arguments follow the script name.
if len(sys.argv)>2:

  # Identify and import tests
  testNames = main.importQC('qctests')
  testNames.sort()
  print('{} quality control checks have been found'.format(len(testNames)))
  # From here on testNames is whatever checkQCTestRequirements returns
  # (presumably the subset whose requirements are met - confirm in main).
  testNames = main.checkQCTestRequirements(testNames)
  print('{} quality control checks are able to be run:'.format(len(testNames)))
  for testName in testNames:
    print('  {}'.format(testName))

  # set up a directory for logging
  # the directory name ends in the current time as whole seconds since the epoch
  logdir = "autoqc-logs-" + str(calendar.timegm(time.gmtime()))
  os.makedirs(logdir)

  # Parallel processing.
  print('\nPlease wait while QC is performed\n')

  # set up global parameter store
Ejemplo n.º 14
0
import util.post as post
import util.main as main
import sys, sqlite3

# QC test names reported by main.importQC for the first command-line argument
# (presumably the name of the directory holding the tests - confirm in main).
tests = main.importQC(sys.argv[1])
# table that is extended below
tablename = 'quota'


def add_test(testname, tablename, database='iquod.db'):
    '''
    Add a BLOB column called testname to table tablename.

    testname: name of the new column
    tablename: sql table to alter
    database: filename of database file

    testname and tablename are pasted into the SQL text unescaped.
    sqlite3.OperationalError raised by the ALTER TABLE statement (for
    instance when the column already exists) is passed on to the caller.
    '''
    query = 'ALTER TABLE ' + tablename + ' ADD ' + testname + ' BLOB;'
    conn = sqlite3.connect(database, isolation_level=None)
    try:
        conn.cursor().execute(query)
    finally:
        # release the database file even when the statement fails
        conn.close()


# add one column per QC test, named after the lower-cased test name
for test in tests:
    add_test(test.lower(), tablename)
Ejemplo n.º 15
0
def db_to_df(table,
             filter_on_wire_break_test=False,
             filter_on_tests={},
             n_to_extract=numpy.iinfo(numpy.int32).max,
             targetdb='iquod.db'):

    '''
    Reads the table from targetdb (default iquod.db) into a pandas dataframe.
    If filter_on_wire_break_test is True, the results from that test are used to exclude
         levels below a wire break from the test results and the wire break test is not returned.
    filter_on_tests is a generalised form of filter_on_wire_break_test; it maps an action name to the
         list of test names the action is applied to. Recognised actions are 'Remove above reject',
         'Remove below reject', 'Remove rejected levels' and 'Remove profile'; 'Optional' and
         'At least one from group' are ignored; any other action raises NameError. The column of every
         test used for filtering is removed from the result. The dictionary is only read, never modified.
    Set n_to_extract to limit the number of rows extracted to the specified number
         (rows are picked with ORDER BY RANDOM()).

    Returns a dataframe with columns uid, Truth (passed through parse_truth) and one column per
    remaining test (passed through parse), indexed from 0.
    '''

    # what tests are available
    testNames = main.importQC('qctests')
    testNames.sort()

    # extract matrix of test results and true flags into a dataframe
    query = 'SELECT uid, truth'
    for test in testNames:
        query += ', ' + test.lower()
    query += ' FROM ' + table
    query += ' WHERE uid IN (SELECT uid FROM ' + table + ' ORDER BY RANDOM() LIMIT ' + str(n_to_extract) + ')'

    # the connection is released as soon as the rows are in memory,
    # even when the query fails
    conn = sqlite3.connect(targetdb, isolation_level=None)
    try:
        cur = conn.cursor()
        cur.execute(query)
        rawresults = cur.fetchall()
    finally:
        conn.close()
    df = pandas.DataFrame(rawresults).astype('str')
    df.columns = ['uid', 'Truth'] + testNames

    if filter_on_wire_break_test:
        nlevels = get_n_levels_before_fail(df['CSIRO_wire_break'])
        del df['CSIRO_wire_break'] # No use for this now.
        for irow in range(len(df.index)):
            for jcol in range(1, len(df.columns)):
                qc = unpack_qc(df.iloc[irow, jcol])
                # Some QC tests may return only one value so check for this.
                if len(qc) > 1:
                    qc = qc[:nlevels[irow]]
                df.iat[irow, jcol] = main.pack_array(qc)

    todrop = set()
    for action in filter_on_tests:
        # Check if the action is relevant.
        if action == 'Optional' or action == 'At least one from group': continue

        # Initialise variables.
        nlevels   = -1
        outcomes  = False
        qcresults = []
        for testname in filter_on_tests[action]:
            for irow in range(0, len(df.index)):
                if action == 'Remove above reject':
                    nlevels = get_reversed_n_levels_before_fail([df[testname][irow]])[0]
                elif action == 'Remove below reject':
                    nlevels = get_n_levels_before_fail([df[testname][irow]])[0]
                elif action == 'Remove profile':
                    outcomes = check_for_fail([df[testname][irow]])[0]
                elif action == 'Remove rejected levels':
                    qcresults = unpack_qc_results([df[testname][irow]])[0]
                else:
                    raise NameError('Unrecognised action: ' + action)

                # 'qcresults == False' is an element-wise comparison and
                # must not be rewritten as 'not qcresults'.
                if (((action == 'Remove above reject' or action == 'Remove below reject') and nlevels == 0) or
                    (action == 'Remove profile' and outcomes == True) or
                    (action == 'Remove rejected levels' and numpy.count_nonzero(qcresults == False) == 0)):
                    # Completely remove a profile if it has no valid levels or if it
                    # has a fail and the action is to remove.
                    todrop.add(irow)
                elif (action != 'Remove profile'):
                    for jcol in range(1, len(df.columns)):
                        # Retain only the levels that passed testname.
                        # Some QC tests may return only one value so check for this.
                        qc = unpack_qc(df.iloc[irow, jcol])
                        if len(qc) > 1:
                            if action == 'Remove above reject':
                                qc = qc[nlevels:]
                            elif action == 'Remove below reject':
                                qc = qc[:nlevels]
                            elif action == 'Remove rejected levels':
                                qc = qc[qcresults == False]
                            df.iat[irow, jcol] = main.pack_array(qc)

            del df[testname] # No need to keep this any longer.

    todrop = list(todrop)
    if len(todrop) > 0:
        df.drop(todrop, inplace=True)
    df.reset_index(inplace=True, drop=True)
    # tests still present after filtering
    testNames = df.columns[2:].values.tolist()
    df[['Truth']] = df[['Truth']].apply(parse_truth)
    df[testNames] = df[testNames].apply(parse)

    return df