import datetime

import pandas as pd
from wodpy import wod


def read_file_to_dataframe(filename):
    """Reads a WOD file (filename) and returns the data as a dataframe.
    Data includes the columns Temperature, Salinity, Depth, Year, Month, Day,
    Longitude, Latitude, Datetime."""

    file = open(filename)

    # empty list for gathering profiles.
    list_data = []

    # loop through profiles
    profile = wod.WodProfile(file)
    while not profile.is_last_profile_in_file(file):
        year = profile.year()
        lat = profile.latitude()
        lon = profile.longitude()
        s = profile.s()
        d = profile.z()
        t = profile.t()
        month = profile.month()
        day = profile.day()
        date = datetime.datetime(year, month, day)
        tmp = {
            'Year': year,
            'Month': month,
            'Day': day,
            'Longitude': lon,
            'Latitude': lat,
            'Salinity': s,
            'Temperature': t,
            'Depth': d,
            'Datetime': date
        }
        list_data.append(tmp)
        profile = wod.WodProfile(file)
    # again for last profile
    year = profile.year()
    lat = profile.latitude()
    lon = profile.longitude()
    s = profile.s()
    d = profile.z()
    t = profile.t()
    month = profile.month()
    day = profile.day()
    date = datetime.datetime(year, month, day)
    tmp = {
        'Year': year,
        'Month': month,
        'Day': day,
        'Longitude': lon,
        'Latitude': lat,
        'Salinity': s,
        'Temperature': t,
        'Depth': d,
        'Datetime': date
    }
    list_data.append(tmp)
    file.close()

    # convert to data frame
    data = pd.DataFrame(list_data)

    return data
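A minimal usage sketch for the function above; 'ocldb_example.dat' is a hypothetical WOD ASCII filename, not one used elsewhere in these examples:

df = read_file_to_dataframe('ocldb_example.dat')  # hypothetical WOD ASCII file
print(df[['Datetime', 'Latitude', 'Longitude', 'Temperature']].head())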
Example #2
    def setUp(self):

        # WOD13 format data
        classic = open("tests/testData/classic.dat")
        # example from pp 124 of http://data.nodc.noaa.gov/woa/WOD/DOC/wodreadme.pdf
        self.classic1 = wod.WodProfile(classic)
        self.classic1_df = self.classic1.df()
        self.classic1_dict = self.classic1.npdict()
        self.classic1_head = self.classic1.header()
        # example with missing salinity information
        self.classic2 = wod.WodProfile(classic)

        # IQuOD 0.1 format data
        # short example (unpacked by hand to validate)
        iquod = open("tests/testData/iquod.dat")
        self.iquod1 = wod.WodProfile(iquod)
        self.iquod1_df = self.iquod1.df()
        self.iquod1_dict = self.iquod1.npdict()
        self.iquod1_head = self.iquod1.header()    
        # example with some metadata
        self.iquod2 = wod.WodProfile(iquod)
        self.iquod2_df = self.iquod2.df()
        self.iquod2_dict = self.iquod2.npdict()

        # data with some interesting pathologies
        path = open("tests/testData/pathological.dat")
        self.path1 = wod.WodProfile(path)
        self.path1_df = self.path1.df()
        self.path1_dict = self.path1.npdict()
        self.path1_head = self.path1.header()
    
        return
Example #3
    def setUp(self):
        filenames = main.readInput('datafiles.json')
        profiles = main.extractProfiles(filenames)

        # identify and import tests
        testNames = main.importQC('qctests')
        testNames.sort()
        for testName in testNames:
            exec('from qctests import ' + testName)

        # Set up any keyword arguments needed by tests.
        kwargs = {'profiles': profiles}

        testResults = []
        testVerbose = []
        trueResults = []
        trueVerbose = []
        firstProfile = True
        delete = []
        currentFile = ''
        self.profiles = []
        for iprofile, pinfo in enumerate(profiles):
            # Load the profile data.
            if pinfo.file_name != currentFile:
                if currentFile != '': f.close()
                currentFile = pinfo.file_name
                f = open(currentFile)
            if f.tell() != pinfo.file_position: f.seek(pinfo.file_position)
            self.profiles.append(wod.WodProfile(f))
Example #4
def extractProfiles(filenames):
  '''
  Read all profiles from the files and store in a list. Only the profile
  descriptions are read, not the profile data, in order to avoid using
  too much memory.
  '''
  profiles = []
  for filename in filenames:
      with open(filename) as f:
          profiles.append(wod.WodProfile(f, load_profile_data=False))
          while not profiles[-1].is_last_profile_in_file(f):
              profiles.append(wod.WodProfile(f, load_profile_data=False))

  # assert all elements of profiles are WodProfiles
  for i in profiles:
    assert isinstance(i, wod.WodProfile), str(i) + ' is not a WodProfile'

  return profiles
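A brief usage sketch for extractProfiles; the two filenames are hypothetical WOD ASCII files:

filenames = ['ocldb_a.dat', 'ocldb_b.dat']  # hypothetical WOD files
stubs = extractProfiles(filenames)
print(len(stubs), 'profile headers found')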
Example #5
    def setUp(self):

        # create an artificial profile to trigger the temperature flag
        # sets first temperature to 99.9; otherwise identical to data/example.dat
        file = open("tests/testData/example.dat")

        self.demoProfile = wod.WodProfile(file)
        self.dataframe = self.demoProfile.df()
        self.dictionary = self.demoProfile.npdict()
        self.head = self.demoProfile.header()
        return
Example #6
def text2wod(raw):
  '''
  given the raw text of a wod ascii profile, return a wodpy object representing the same.
  '''
  
  fProfile = tempfile.TemporaryFile(mode='w+')  # text mode, so the raw ascii string can be written directly
  fProfile.write(raw) # a file-like object containing only the profile from the queried row
  fProfile.seek(0)
  profile = wod.WodProfile(fProfile)
  fProfile.close()
 
  return profile
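A usage sketch for text2wod: the raw text of a single profile is first sliced out of a hypothetical WOD ASCII file using the same tell/seek pattern seen in the later examples:

fid = open('ocldb_example.dat')  # hypothetical WOD file
start = fid.tell()
wod.WodProfile(fid)              # advance past the first profile
end = fid.tell()
fid.seek(start)
raw = fid.read(end - start)      # ascii text of the first profile only
profile = text2wod(raw)
print(profile.latitude(), profile.longitude())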
Example #7
def profileData(pinfo, currentFile, f):
  '''
  takes a profile info stub as returned by extractProfiles and extracts the whole profile
  from file f.
  '''

  if pinfo.file_name != currentFile:
    if currentFile != '': f.close()
    currentFile = pinfo.file_name
    f = open(currentFile)
  if f.tell() != pinfo.file_position: f.seek(pinfo.file_position)
  return wod.WodProfile(f), currentFile, f
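A sketch of how profileData might be used with stubs from extractProfiles; it assumes, as Example #3 does, that each stub carries file_name and file_position attributes, and the filename is hypothetical:

stubs = extractProfiles(['ocldb_a.dat'])  # hypothetical WOD file
currentFile, f = '', None
for pinfo in stubs:
    profile, currentFile, f = profileData(pinfo, currentFile, f)
    print(profile.year(), profile.latitude(), profile.longitude())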
Example #8
def get_profiles(ffile, N=-1):
    """Extracts profiles from a WOD file; stops after N profiles if N >= 0."""

    fid = open(ffile)
    pfs = []
    counter = 0
    while True:
        pf = wod.WodProfile(fid)
        pfs.append(pf)  # append before the end-of-file check so the last profile is kept
        counter += 1
        if pf.is_last_profile_in_file(fid): break
        if N >= 0 and counter == N: break
    fid.close()
    return pfs
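A usage sketch; the filename is hypothetical:

pfs = get_profiles('ocldb_example.dat', N=10)  # hypothetical WOD file, first 10 profiles
print(len(pfs), 'profiles read')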
Example #9
def builddb(infile,
            check_originator_flag_type=True,
            months_to_use=range(1, 13),
            outfile='iquod.db',
            dbtable='iquod'):

    conn = sqlite3.connect(outfile, isolation_level=None)
    cur = conn.cursor()

    # Identify tests
    testNames = main.importQC('qctests')
    testNames.sort()

    # set up our table
    query = "CREATE TABLE IF NOT EXISTS " + dbtable + """(
                raw text,
                truth BLOB,
                uid integer PRIMARY KEY,
                year integer,
                month integer,
                day integer,
                time real,
                lat real,
                long real,
                country text,
                cruise integer,
                ocruise text,
                probe integer,
                training integer,
                flagged integer,
                """
    for i in range(len(testNames)):
        query += testNames[i].lower() + ' BLOB'
        if i < len(testNames) - 1:
            query += ','
        else:
            query += ');'

    cur.execute(query)

    # populate table from wod-ascii data
    fid = open(infile)
    uids = []
    good = 0
    bad = 0

    while True:
        # extract profile as wodpy object and raw text
        start = fid.tell()
        profile = wod.WodProfile(fid)
        end = fid.tell()
        fid.seek(start)
        raw = fid.read(end - start)
        fid.seek(end)
        # set up dictionary for populating query string
        p = profile.npdict()
        p['raw'] = "'" + raw + "'"

        # check for duplicate profiles in raw data
        if p['uid'] in uids:
            if profile.is_last_profile_in_file(fid):
                break
            else:
                continue
        uids.append(p['uid'])

        # skip pathological profiles
        isgood = assessProfile(profile, check_originator_flag_type,
                               months_to_use)
        if not isgood and profile.is_last_profile_in_file(fid):
            break
        elif not isgood:
            continue

        # encode temperature error codes into truth array
        truth = encodeTruth(profile)
        p['truth'] = main.pack_array(truth)

        # extract country code
        country = profile.primary_header['Country code']

        # originator cruise
        orig_cruise = profile.originator_cruise()

        # keep tabs on how many good and how many bad profiles have been added to db
        # nowire == index of first wire break level
        wireqc = qctests.CSIRO_wire_break.test(profile, {})
        try:
            nowire = list(wireqc).index(True)
        except ValueError:
            nowire = len(truth)
        # flag only counts if it's before the wire break:
        flagged = dbutils.summarize_truth(truth[0:nowire])
        if flagged:
            bad += 1
        else:
            good += 1

        query = "INSERT INTO " + dbtable + " (raw, truth, uid, year, month, day, time, lat, long, country, cruise, ocruise, probe, flagged) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?);"
        values = (p['raw'], p['truth'], p['uid'], p['year'], p['month'],
                  p['day'], p['time'], p['latitude'], p['longitude'], country,
                  p['cruise'], orig_cruise, p['probe_type'], int(flagged))
        main.dbinteract(query, values, targetdb=outfile)
        if profile.is_last_profile_in_file(fid):
            break

    conn.commit()
    print('number of clean profiles written:', good)
    print('number of flagged profiles written:', bad)
    print('total number of profiles written:', good + bad)
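A call sketch for builddb; the input filename is hypothetical and the default output database and table names from the signature above are kept:

builddb('ocldb_example.dat')  # hypothetical WOD ASCII input; writes to iquod.db, table 'iquod'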
Example #10
#filename = '../data/quota_subset.dat'
filename = '../../AutoQC_raw/quota/test/chunk.dat'
n = 30

fid = open(filename)
fid.read()
fileSize = fid.tell()
chunkSize = int(math.ceil(fileSize / n))

fileNo = 0
start = 0
end = 0

target = open('split-' + str(fileNo) + '.dat', 'w')
fid.seek(0)
while not (fid.read(1) == ''):
    # write next chunk to open target
    fid.seek(end)
    start = fid.tell()
    profile = wod.WodProfile(fid)
    end = fid.tell()
    fid.seek(start)
    extract = fid.read(end - start)
    target.write(extract)

    # wrap the file and start a new one once we've crossed the max size
    if target.tell() > chunkSize:
        target.close()
        fileNo += 1
        target = open('split-' + str(fileNo) + '.dat', 'w')
Example #11
def main():
    # print('This executes the wod_prof_db package\n')

    parser = argparse.ArgumentParser(
        description="setup WOD profile lookup database")
    parser.add_argument(
        "source_dir",
        type=str,
        help=
        "full path to directory containing source data (e.g. download folder)")
    parser.add_argument("dest_dir",
                        type=str,
                        nargs='?',
                        help="directory path where output array will reside")
    parser.add_argument("wild_card",
                        type=str,
                        nargs='?',
                        help="wild card string to narrow input files")
    args = parser.parse_args()

    cur_dir = os.getcwd()  # current working directory

    print("source dir is " + args.source_dir)
    source_dir = args.source_dir  # dir of source data (wod files)

    if args.dest_dir:
        print("dest dir is " + args.dest_dir)
        dest_dir = args.dest_dir
    else:
        print("creating profile_pool dir in current dir\n")
        dest_dir = cur_dir + "/profile_db/"  # where to put database

    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)
        print("creating destination directory")

    # use glob to form a list of input files:
    if args.wild_card:
        prof_files = glob.glob(source_dir + '/ocldb' + args.wild_card)
        print(prof_files)
    else:
        prof_files = glob.glob(source_dir + '/ocldb*')
        print(prof_files)
    # prof_files.sort(key=lambda x: [int(x.split('-')[2])])  # no need for sort

    # prepare the look-up table; it is filled as a list and converted to a
    # structured numpy array at the end (a list is slower and may use more
    # memory, but is simple to append to)
    dbase = []  # dbase is the list of profiles that contains profile info

    # loop over input files, retrieve the necessary info and store it in dbase
    print("\nputting together database: list filling loop\n")
    for dafile in prof_files:
        print("\nWorking on file: " + dafile + "\n")
        fid = open(dafile)
        profile = wod.WodProfile(fid)
        prof_data, prof_ok = get_prof_data(profile)
        if prof_ok:
            dbase.append(prof_data)
        last_prof = profile.is_last_profile_in_file(fid)
        while not last_prof:
            profile = wod.WodProfile(fid)
            prof_data, prof_ok = get_prof_data(profile)
            if prof_ok:
                dbase.append(prof_data)
            last_prof = profile.is_last_profile_in_file(fid)
    dbase = np.array(dbase,
                     dtype=[("probe_type", '|S21'), ('nlevs', 'int32'),
                            ('year', 'int32'), ('month', 'int32'),
                            ('day', 'int32'), ('date', 'O'),
                            ('lat', 'float32'), ('lon', 'float32'),
                            ('pmin', 'float32'), ('pmax', 'float32'),
                            ('dpm', 'float32'), ('dzm', 'float32'),
                            ("ps_qc", 'int32'), ("pt_qc", 'int32'),
                            ('pres', 'O'), ('sal', 'O'), ('temp', 'O'),
                            ('z', 'O'), ('usal', 'O'), ('utemp', 'O'),
                            ('uz', 'O')])
    np.savez_compressed(dest_dir + "cal_wod_profile_info_database",
                        dbase=dbase)