def test_read_btlnbr_as_string(self):
    """Bottle numbers must be read verbatim as strings, never as numbers."""
    rows = 'SIO1,33.24\n01,32.10\n'
    with closing(StringIO()) as buf:
        buf.write(rows)
        buf.flush()
        buf.seek(0)
        dfile = DataFile()
        for param in ('BTLNBR', 'CTDSAL'):
            dfile[param] = Column(param)
        exchange.read_data(dfile, buf, ['BTLNBR', 'CTDSAL'])
        # 'SIO1' is alphanumeric and '01' must keep its leading zero.
        self.assertEqual(dfile['BTLNBR'].values, ['SIO1', '01'])
        self.assertEqual(
            dfile['CTDSAL'].values, [Decimal('33.24'), Decimal('32.10')])
def read(self, fileobj):
    """How to read LDEO ASEP files from an NODC accession.

    fileobj - a gzipped tar archive of an NODC accession. Only members
    under the accession's '0-data' directory are considered; PDFs,
    rosette ('_ros.') files and upcasts ('_ctd_U.') are skipped.
    """
    # NOTE(review): these two local helpers appear unused in this body;
    # presumably leftovers from a zip-based reader -- confirm before removal.
    def is_fname_ok(fname):
        if '.csv' not in fname:
            return False
        if fname.find('/') > -1:
            raise ValueError(
                u'CTD Exchange Zip files should not contain directories.')
        return True

    def reader(dfile, fileobj, retain_order, header_only):
        ctdex.read(dfile, fileobj, retain_order, header_only)
        dfile.globals['_FILENAME'] = fileobj.name

    dfiles = []
    datapath = None
    datadirname = '0-data'
    with tarfile_open(mode='r:gz', fileobj=fileobj) as fff:
        for member in fff.getmembers():
            # Lazily discover the data directory prefix from the first
            # member whose path mentions '0-data'.
            if datapath is None:
                if datadirname in member.name:
                    datapath = member.name.split(
                        datadirname)[0] + datadirname + '/'
                    log.info(
                        'NODC accession data path: {0}'.format(datapath))
                else:
                    continue
            if not member.name.startswith(datapath):
                continue
            bname = os.path.basename(member.name)
            if bname.endswith('pdf'):
                continue
            if '_ros.' in bname:
                continue
            # don't want upcasts
            if '_ctd_U.' in bname:
                continue
            dfile = DataFile()
            ggg = fff.extractfile(member)
            if ggg is None:
                log.error(u'Unable to extract file {0!r}'.format(member))
            else:
                ldeo_asep.read(dfile, ggg)
                dfiles.append(dfile)
    # Order casts by station number using lexicographic-aware comparison.
    self.files = sorted(
        dfiles, key=lambda dfile: lexico(dfile.globals['STNNBR']))
def guess_ftype_dftype_format(fileobj, file_type=None, file_name=None):
    """Return a tuple of guessed file type, Datafile or DatafileCollection,
    and the format module.
    """
    from libcchdo.model.datafile import (
        DataFile, SummaryFile, DataFileCollection)
    file_type = guess_file_type_from_file(fileobj, file_type, file_name)
    # Archives and zips hold multiple files; summary files use their own
    # model; everything else maps to a plain DataFile.
    if 'zip' in file_type or file_type.startswith('archive'):
        model = DataFileCollection()
    elif file_type.startswith('sum'):
        model = SummaryFile()
    else:
        model = DataFile()
    return (file_type, model, guess_format_module(fileobj, file_type))
def test_diff_decplaces(self):
    """Derivative is still different when decimal places are different."""
    def build(oxy_str):
        # One-row file with a fixed pressure and the given CTDOXY text.
        dfile = DataFile()
        dfile.create_columns(['CTDPRS', 'CTDOXY'])
        dfile['CTDPRS'].append(_decimal('1'))
        dfile['CTDOXY'].append(_decimal(oxy_str))
        return dfile

    dfo = build('0.140')
    dfd = build('0.14')
    p_different, p_not_in_orig, p_not_in_deriv, p_common = \
        different_columns(dfo, dfd, ['CTDPRS'])
    self.assertEqual(p_different, ['CTDOXY'])
    dfile = merge_datafiles(dfo, dfd, ['CTDPRS'], ['CTDOXY'])
    # Merged value carries the derivative's decimal places.
    self.assertEqual(decimal_to_str(dfile['CTDOXY'][0]), '0.14')
def read(self, handle):
    """How to read CTD WOCE EGEE files from a Zip.

    Each member of the zip archive is parsed as a WOCE EGEE CTD file and
    appended to this collection. The zip is always closed, and a failure
    reading any member logs the member name and aborts the whole read.
    """
    # Renamed locals: the originals shadowed the builtins ``zip`` and
    # ``file``.
    zfile = Zip.ZeroCommentZipFile(handle, 'r')
    try:
        for fname in zfile.namelist():
            tempstream = StringIO(zfile.read(fname))
            try:
                ctdfile = DataFile()
                try:
                    woce_egee.read(ctdfile, tempstream)
                except Exception:
                    log.info('Failed to read file %s in %s' % (fname, handle))
                    print_exc()
                    # Bare raise preserves the original traceback
                    # (``raise e`` discarded it).
                    raise
                self.append(ctdfile)
            finally:
                # Previously the buffer leaked when the reader raised.
                tempstream.close()
    finally:
        zfile.close()
def sbe_asc_to_ctd_exchange(args):
    """Convert Sea-Bird ASC file(s) to CTD Exchange.

    A single input file is written as Exchange CSV (suffix "_ct1.csv");
    multiple inputs are bundled into a zip (suffix "_ct1.zip"). Output
    goes to stdout unless args.output names a base path.
    """
    # Removed dead locals from the original: an unused DataFile() and an
    # unused alias of args.files.
    expo = args.expo if args.expo else ''
    output = args.output if args.output else sys.stdout
    if len(args.files) == 1:
        if output is not sys.stdout:
            output = output + "_ct1.csv"
        _single_file(asc, args.files, output, expo=expo)
    elif len(args.files) > 1:
        if output is not sys.stdout:
            output = output + '_ct1.zip'
        _multi_file(asc, args.files, output, expo=expo)
def test_read_unknown_parameter_fillvalue(self):
    """Reading data for a parameter with unknown format should still
    check for out of band.
    """
    data = '-999,9,1,012\n11,2,-999,123\n'
    with closing(StringIO()) as buf:
        buf.name = 'testfile'
        buf.write(data)
        buf.flush()
        buf.seek(0)
        dfile = DataFile()
        for param in ('CTDPRS', 'UNKPARAM', 'BTLNBR'):
            dfile[param] = Column(param)
        exchange.read_data(
            dfile, buf, ['CTDPRS', 'CTDPRS_FLAG_W', 'UNKPARAM', 'BTLNBR'])
        # -999 is out-of-band even for the unknown parameter; bottle
        # numbers keep their leading zeros.
        self.assertEqual(None, dfile['CTDPRS'].values[0])
        self.assertEqual('012', dfile['BTLNBR'].values[0])
        self.assertEqual('123', dfile['BTLNBR'].values[1])
        self.assertEqual(None, dfile['UNKPARAM'].values[1])
def test_merge_datafiles_no_column(self):
    """Error to merge columns in neither datafile."""
    def build(pressures, nitrates):
        dfile = DataFile()
        dfile.create_columns(['CTDPRS', 'NITRAT'])
        for val, flag in pressures:
            dfile['CTDPRS'].append(val, flag)
        for val, flag in nitrates:
            dfile['NITRAT'].append(val, flag)
        return dfile

    df0 = build([(1, 2), (2, 2)], [(10, 2), (11, 2)])
    df1 = build([(1, 2), (2, 2)], [(20, 3), (21, 4)])
    # CTDSAL exists in neither file, so the merge must refuse.
    with self.assertRaisesRegexp(
            ValueError, 'No columns selected to merge are different.'):
        merge_datafiles(df0, df1, ['CTDPRS'], ['CTDSAL'])
    lines = [
        "Instructed to merge parameters that are not in either datafile: "
        "['CTDSAL']",
    ]
    self.assertTrue(self.ensure_lines(lines))
def test_merge_datafiles_flags(self):
    """It should be possible to only merge flag "columns".

    This includes updating and adding flags.

    If adding flags and the original column does not exist, warn and fail.
    """
    df0 = DataFile()
    df0.create_columns(['CTDPRS', 'NITRAT', 'FLUOR'])
    # Origin: three rows; FLUOR carries no flags at all.
    for prs, nit, fluor in [(1, 10, 100), (2, 11, 101), (3, 12, 102)]:
        df0['CTDPRS'].append(prs, 2)
        df0['NITRAT'].append(nit, 2)
        df0['FLUOR'].append(fluor)

    df1 = DataFile()
    df1.create_columns(['CTDPRS', 'NITRAT', 'FLUOR'])
    # Derivative: last pressure (4) does not match origin's (3).
    deriv_rows = [
        ((1, 2), (20, 3), (200, 2)),
        ((2, 2), (21, 4), (201, 3)),
        ((4, 2), (22, 4), (202, 3)),
    ]
    for (prs, prs_f), (nit, nit_f), (flr, flr_f) in deriv_rows:
        df1['CTDPRS'].append(prs, prs_f)
        df1['NITRAT'].append(nit, nit_f)
        df1['FLUOR'].append(flr, flr_f)

    mdf = merge_datafiles(
        df0, df1, ['CTDPRS'], ['NITRAT_FLAG_W', 'FLUOR_FLAG_W'])
    # Data values stay from the origin; only flags are merged. The
    # unmatched third row keeps its origin flag (NITRAT) or gets the
    # missing flag 9 (FLUOR, which had none).
    self.assertEqual(mdf['NITRAT'].values, [10, 11, 12])
    self.assertEqual(mdf['NITRAT'].flags_woce, [3, 4, 2])
    self.assertEqual(mdf['FLUOR'].values, [100, 101, 102])
    self.assertEqual(mdf['FLUOR'].flags_woce, [2, 3, 9])
def _read_oliver_sun(dfc, fileobj, cfg):
    """Read HRP2 format from Oliver Sun.

    dfc - DataFileCollection to append one DataFile per cast to
    fileobj - matlab .mat file readable by loadmat
    cfg - dict with 'expocode' and 'global_mapping' entries
    """
    mat = loadmat(fileobj)
    # Assumes the .mat file has a single top-level variable holding the
    # cast array (Python 2: keys() is a list).
    filekey = mat.keys()[0]
    casts = mat[filekey][0]
    for cast in casts:
        dfile = DataFile()
        dfc.append(dfile)
        dfile.globals['EXPOCODE'] = cfg['expocode']
        # TODO
        dfile.globals['DEPTH'] = 0
        # Walk the record fields: (1, 1)-shaped items are scalars mapped
        # into globals; everything else is a data column.
        for key, item in zip(cast.dtype.names, cast):
            if item.shape == (1, 1):
                key = cfg['global_mapping'].get(key, None)
                if key:
                    dfile.globals[key] = item[0, 0]
            else:
                try:
                    dfile[key] = Column(key)
                    dfile[key].values = list(item.flatten())
                    # Act as if all files had QC and assign it to
                    # OceanSITES 1. Assuming that someone has already gone
                    # through level 0 data and we are receiving level 1 or
                    # higher.
                    dfile[key].flags_woce = [2] * len(dfile[key].values)
                except KeyError:
                    pass
        # Default the station number when the .mat globals did not
        # provide one.
        try:
            dfile.globals['STNNBR']
        except KeyError:
            dfile.globals['STNNBR'] = '999'
        woce.fuse_datetime(dfile)
def read(self, handle, metadata=None):
    """How to read a Bottle Bermuda Atlantic Time-Series Study file.

    This function reads bats_bottle.txt.

    Arguments:
    self - (special case, see NOTE) dictionary
    metadata - (optional) BATS cruise metadata to be used to find port dates

    NOTE: The result for this method is a special case. The bottle file
    format contains the entire BATS holdings while the internal data
    format splits data up by cruises. Because cruises for timeseries are
    split by file for cruise, the end result is a dictionary with
    cruise_ids as keys to DatafileCollections (cruises) containing
    Datafiles (casts).
    """
    sections = _read_header_sections(self, handle)
    _read_variables(self, handle)
    parameters = _get_variables(self, handle, sections)

    # Add DON for note in Variables list stating DON is reported for TON
    # prior to BATS 121
    parameters.append(['DON', None, 'umol/kg'])

    # Columns that are parsed by hand from fixed positions of each row.
    manual_parameters = [
        ['BTLNBR', ''],
        ['_DATETIME', ''],
        ['LATITUDE', ''],
        ['LONGITUDE', ''],
        ['_ACTUAL_DEPTH', 'METERS'],
    ]
    columns = [x[0] for x in manual_parameters]
    units = [x[1] for x in manual_parameters]

    # s = index of the first automatically-read parameter (the one right
    # after 'Depth' in the Variables list).
    s = None
    for i, (var, d, u) in enumerate(parameters):
        if var == 'Depth':
            s = i + 1
            continue
        # Only want to add parameters after Depth. The others were done
        # manually.
        if s is None:
            continue
        try:
            var = bats_to_param[var]
        except KeyError:
            pass
        columns.append(var)
        units.append(u)

    # Template cast file copied for each new cast found in the data rows.
    template_df = DataFile()
    template_df.create_columns(columns, units)
    template_df.check_and_replace_parameters(convert=False)

    for sec, lines in sections.items():
        if sec == 'Variables list':
            continue
        if sec != 'Comments':
            continue
        template_df.globals['_{0}'.format(sec)] = '\n'.join(lines)

    df = None
    params_auto = parameters[s:]
    dfi = 0
    for i, l in enumerate(handle):
        parts = l.split()

        id = parts[0]
        (cruise_type, type_id, cruise_num, cruise_id, cast_type,
         cast_id, nisk_id) = _parse_bats_id(id)
        ship = _ship_from_cruise_num(cruise_num)
        if not ship:
            ship = 'R/V Atlantic Explorer'

        # A change in cruise/station/cast means a new cast begins here.
        if (df is None or df.globals['_OS_ID'] != cruise_id or
                df.globals['STNNBR'] != cruise_type or
                df.globals['CASTNO'] != cast_id):
            if df is not None:
                # Done reading one cast. Finalize it.
                log.info(u'finalizing cast {0} {1} {2}'.format(
                    df.globals['_OS_ID'], df.globals['STNNBR'],
                    df.globals['CASTNO']))
                try:
                    meta = metadata[cruise_id]
                    port_date = meta['dates'][0]
                except (TypeError, KeyError):
                    port_date = None
                if not port_date:
                    # Fall back on the cast's earliest sample time.
                    port_date = min(df['_DATETIME'])
                df.globals['EXPOCODE'] = create_expocode(
                    ship_code(ship, raise_on_unknown=False), port_date)
                log.info(df.globals['EXPOCODE'])
                df.globals['DEPTH'] = max(df['_ACTUAL_DEPTH'])
                collapse_globals(df, ['_DATETIME', 'LATITUDE', 'LONGITUDE'])
                # Normalize all the parameter column lengths. There may be
                # columns that did not get data written to them so make
                # sure they are just as long as the rest
                length = len(df)
                for c in df.columns.values():
                    c.set_length(length)
                try:
                    dfc = self[df.globals['_OS_ID']]
                except KeyError:
                    dfc = self[df.globals['_OS_ID']] = DataFileCollection()
                dfc.files.append(df)
                dfi = 0

            # Create a new cast
            df = copy(template_df)
            df.globals['SECT_ID'] = BATS_SECT_ID
            df.globals['_SHIP'] = ship
            df.globals['_OS_ID'] = cruise_id
            df.globals['STNNBR'] = cruise_type
            df.globals['CASTNO'] = cast_id

        df['BTLNBR'].set(dfi, nisk_id)
        dt_ascii = datetime.strptime(parts[1] + parts[3], '%Y%m%d%H%M')
        dt_deci = bats_time_to_dt(parts[2])
        #if dt_ascii != dt_deci:
        #    log.warn(
        #        u'Dates differ on data row {0}: {5} {1!r}={2} '
        #        '{3!r}={4}'.format(i, parts[1] + parts[3], dt_ascii,
        #                           parts[2], dt_deci, dt_deci - dt_ascii))
        df['_DATETIME'].set(dfi, dt_ascii)
        df['LATITUDE'].set(dfi, Decimal(parts[4]))
        df['LONGITUDE'].set(dfi, Decimal(correct_longitude(parts[5])))
        df['_ACTUAL_DEPTH'].set_check_range(dfi, Decimal(parts[6]))

        # Remaining row fields line up with the automatic parameter list.
        parts_auto = parts[s:]
        for p, v in zip(params_auto, parts_auto):
            param = p[0]
            try:
                param = bats_to_param[param]
            except KeyError:
                pass
            if cruise_num < 121 and param == 'TON':
                param = 'DON'
            # -9 family values are BATS fill values for missing data.
            if (equal_with_epsilon(v, -9) or
                    equal_with_epsilon(v, -9.9) or
                    equal_with_epsilon(v, -9.99)):
                df[param].set_check_range(dfi, None)
            # TODO determine whether -10 is just bad formatting for -9.9
            elif equal_with_epsilon(v, -10):
                #log.warn(u'Possible missing data value {0}'.format(v))
                df[param].set_check_range(dfi, None)
            elif v == 0:
                log.warn(u'Data under detection limit, set flag to '
                         'WOCE water sample questionable measurement')
                df[param].set_check_range(dfi, None, flag=3)
            else:
                df[param].set_check_range(dfi, Decimal(v))
        dfi += 1

        # Since this is a super long file that contains multiple cruises
        # and casts, as the file is processed it is split apart into a
        # list of DataFileCollection(s) containing DataFile objects for
        # each casts
        if i % 100 == 0:
            log.info(u'processed {0} lines'.format(i))
def test_read(self):
    # Smoke test: parsing self.input as CTD WOCE must not raise.
    self.file = DataFile()
    self.bufr = StringIO(self.input)
    ctdwoce.read(self.file, self.bufr)
    self.bufr.close()
def read(self, fileobj, is_fname_ok, reader, *args, **kwargs):
    """Generic zip file reader for zip files with multiple datafiles
    inside.

    is_fname_ok - predicate selecting which archive members to read
    reader - callable(dfile, member, *args, **kwargs) that parses one member
    """
    for member in generate_files(fileobj, is_fname_ok):
        datafile = DataFile()
        reader(datafile, member, *args, **kwargs)
        self.append(datafile)
def australian_navy_ctd(args):
    """Download and convert Australian Navy CTD data.

    Crawls the RAN THREDDS catalog, converts each profile dataset to a
    DataFile, groups profiles by EXPOCODE into DataFileCollections, and
    writes them as zipped CTD Exchange to args.output.
    """
    from pydap.client import open_url
    from libcchdo.thredds import crawl
    from libcchdo.formats.ctd.zip import exchange as ctdzipex
    from libcchdo.formats.zip import write as zwrite

    dfcs = []

    cf_param_to_cchdo_param = {
        'sea_water_pressure': 'CTDPRS',
        'sea_water_temperature': 'CTDTMP',
        'sea_water_practical_salinity': 'CTDSAL',
    }
    ignored_qc_flags = [
        'time_qc_flag',
        'position_qc_flag',
    ]
    # Map foreign QC conventions onto WOCE flag values.
    qc_conventions = {
        'Proposed IODE qc scheme March 2012': {
            1: 2,   # good
            2: 5,   # not_evaluated_or_unknown
            3: 3,   # suspect
            4: 4,   # bad
            9: 9,   # missing
        },
    }

    dfc = DataFileCollection()
    catalog = "http://www.metoc.gov.au/thredds/catalog/RAN_CTD_DATA/catalog.xml"
    for url in crawl(catalog):
        df = DataFile()
        log.info(u'Reading %s', url)
        dset = open_url(url)
        vars = dset.keys()
        for vname in vars:
            var = dset[vname]
            attrs = var.attributes
            if 'standard_name' in attrs:
                std_name = attrs['standard_name']
                if std_name == 'time':
                    # Time is stored as days since 1950-01-01.
                    df.globals['_DATETIME'] = \
                        datetime(1950, 1, 1) + timedelta(var[:])
                elif std_name == 'latitude':
                    df.globals['LATITUDE'] = var[:]
                elif std_name == 'longitude':
                    df.globals['LONGITUDE'] = var[:]
                elif std_name in cf_param_to_cchdo_param:
                    cparam = cf_param_to_cchdo_param[std_name]
                    if '_FillValue' in attrs:
                        fill_value = attrs['_FillValue']
                        values = []
                        for x in var[:]:
                            if equal_with_epsilon(x, fill_value):
                                values.append(None)
                            else:
                                values.append(x)
                    else:
                        values = var[:]
                    try:
                        df[cparam].values = values
                    except KeyError:
                        df[cparam] = Column(cparam)
                        df[cparam].values = values
                elif 'status_flag' in std_name:
                    flagged_param = std_name.replace(
                        'status_flag', '').strip()
                    cparam = cf_param_to_cchdo_param[flagged_param]
                    qc_convention = attrs['quality_control_convention']
                    if qc_convention in qc_conventions:
                        qc_map = qc_conventions[qc_convention]
                        df[cparam].flags_woce = [qc_map[x] for x in var[:]]
                else:
                    log.debug('unhandled standard_name %s', std_name)
            elif ('long_name' in attrs and
                    attrs['long_name'] == 'profile identifier'):
                # Profile id packs cruise id and profile number together.
                profile_id = var[:]
                cruise_id = profile_id / 10 ** 4
                profile_id = profile_id - cruise_id * 10 ** 4
                df.globals['EXPOCODE'] = str(cruise_id)
                df.globals['STNNBR'] = str(profile_id)
                df.globals['CASTNO'] = str(1)
            elif vname in ignored_qc_flags:
                df.globals['_' + vname] = var[:]
            elif (vname.endswith('whole_profile_flag') or
                    vname.endswith('sd_test')):
                pass
            else:
                log.debug('unhandled variable %s', vname)

        # attach new file to appropriate collection
        if dfc.files:
            if dfc.files[0].globals['EXPOCODE'] != df.globals['EXPOCODE']:
                dfcs.append(dfc)
                dfc = DataFileCollection()
        dfc.append(df)

    with closing(args.output) as out_file:
        # BUG FIX: the original closure did ``next_id += 1`` on an
        # enclosing-scope int, which raises UnboundLocalError in Python 2
        # (no ``nonlocal``). A one-element list makes the counter mutable
        # from inside the closure.
        next_id = [0]

        def get_filename(dfc):
            try:
                return '{0}_ct1.zip'.format(dfc.files[0].globals['EXPOCODE'])
            except IndexError:
                next_id[0] += 1
                return '{0}_ct1.zip'.format(next_id[0])

        zwrite(dfcs, out_file, ctdzipex, get_filename)
def get_ctdex_name(input_file):
    """Return the canonical Exchange filename for a CTD Exchange file.

    Only the header is parsed (header_only=True); the filename is derived
    from the file's global metadata.
    """
    dfile = DataFile()
    ctdex.read(dfile, input_file, header_only=True)
    return ctdex.get_datafile_filename(dfile)
def _single_file(reader, files, output, **kwargs):
    """Read the first file in files with reader and write it as CTD
    Exchange.

    output - sys.stdout or a path; a path is opened for writing and,
    unlike before, closed again (the original leaked the file handle).
    """
    dfile = DataFile()
    reader.read(dfile, files[0], **kwargs)
    if output is sys.stdout:
        ctdex.write(dfile, output)
    else:
        with open(output, 'w') as fobj:
            ctdex.write(dfile, fobj)
class TestDataFile(TestCase):
    """Unit tests for the DataFile model's column handling."""

    def setUp(self):
        # Fresh DataFile with one EXPOCODE column; self.c aliases it.
        self.file = DataFile()
        self.c = self.file.columns['EXPOCODE'] = Column('EXPOCODE')

    def tearDown(self):
        self.file = None

    def test_init(self):
        self.assertEqual(len(self.file.columns), 1)
        self.assertEqual(self.file.footer, None)
        self.assertEqual(self.file.globals, {'stamp': '', 'header': ''})

    def test_expocodes(self):
        self.c.append('A')
        self.assertEqual(['A'], self.file.expocodes())
        self.c.append('B')
        self.assertEqual(['A', 'B'], self.file.expocodes())
        self.c.append('A')
        # Expocodes returns unique expocodes.
        self.assertEqual(
            ['A', 'B'], self.file.expocodes())

    def test_len(self):
        # len() of a DataFile is the row count, 0 with no columns.
        c = self.file.columns['EXPOCODE']
        del self.file.columns['EXPOCODE']
        self.assertEqual(len(self.file), 0)
        self.file.columns['EXPOCODE'] = c
        self.assertEqual(len(self.file), 0)
        self.c.append('A')
        self.assertEqual(len(self.file), 1)
        self.c.append('A')
        self.assertEqual(len(self.file), 2)

    def test_sorted_columns(self):
        self.file.columns['CASTNO'] = Column('CASTNO')
        self.file.columns['STNNBR'] = Column('STNNBR')
        expected = ['EXPOCODE', 'STNNBR', 'CASTNO']
        received = map(lambda c: c.parameter.mnemonic_woce(),
                       self.file.sorted_columns())
        # If lengths are equal and all expected in received, then assume
        # equal
        self.assertEqual(len(expected), len(received))
        self.assertTrue(all([x in received for x in expected]))

    def test_get_property_for_columns(self):
        # This is tested by the following tests.
        pass

    def test_column_headers(self):
        self.assertEqual(['EXPOCODE'], self.file.column_headers())
        self.file.columns['STNNBR'] = Column('STNNBR')
        expected = ['EXPOCODE', 'STNNBR']
        received = self.file.column_headers()
        # If lengths are equal and all expected in received, then assume
        # equal
        self.assertEqual(len(expected), len(received))
        self.assertTrue(all([x in received for x in expected]))

    def test_formats(self):
        self.file.columns['CTDOXY'] = Column('CTDOXY')
        self.file.check_and_replace_parameters()
        # Order of columns may be wrong
        self.assertEqual(['%11s', '%9.4f'], self.file.formats())

    def test_to_dict(self):
        # TODO
        self.file.to_dict()
        pass

    def test_str(self):
        # Smoke test: stringifying must not raise.
        str(self.file)

    def test_create_columns(self):
        parameters = ['CTDOXY']
        units = ['UMOL/KG']
        self.file.create_columns(parameters, units)

    def test_column_append(self):
        self.assertEqual(self.c.values, [])
        # set() past the end pads with None; flags stay empty until a
        # flagged append happens.
        self.c.set(2, 'test')
        self.assertEqual(self.c.values, [None, None, 'test'])
        self.assertEqual(self.c.flags_woce, [])
        self.c.append('test2', 'flag2')
        self.assertEqual(self.c.values, [None, None, 'test', 'test2'])
        self.assertEqual(self.c.flags_woce, [None, None, None, 'flag2'])

    def test_calculate_depths(self):
        # An _ACTUAL_DEPTH column short-circuits to the 'actual' method.
        self.file['_ACTUAL_DEPTH'] = Column('_ACTUAL_DEPTH')
        self.assertEqual(('actual', []), self.file.calculate_depths())
        del self.file['_ACTUAL_DEPTH']
        self.file.globals['LATITUDE'] = 0
        self.file.create_columns(['CTDPRS', 'CTDSAL', 'CTDTMP'])
        self.assertEqual(('unesco1983', []), self.file.calculate_depths())
        self.file['CTDPRS'].values = [1]
        self.file['CTDSAL'].values = [1]
        self.file['CTDTMP'].values = [1]
        self.assertEqual(
            ('sverdrup', [_decimal('1.021723814950101286444879340E-8')]),
            self.file.calculate_depths())

    def test_check_and_replace_parameter_contrived(self):
        """Contrived parameters are not checked."""
        col = Column('_DATETIME')
        col.check_and_replace_parameter(self.file, convert=False)
def setUp(self):
    # Each test gets a fresh, empty DataFile.
    self.file = DataFile()
def setUp(self):
    # Fresh DataFile with a single EXPOCODE column; self.c aliases it.
    self.file = DataFile()
    self.c = self.file.columns['EXPOCODE'] = Column('EXPOCODE')
class TestBottleNetCDF(unittest.TestCase):
    """Tests reading and writing bottle netCDF hydro files."""

    def setUp(self):
        self.infile = open(
            sample_file('nc_hyd', 'i08s_33RR20070204_00001_00001_hy1.nc'),
            'r')

    def tearDown(self):
        self.infile.close()

    def assertAlmostEqualOrNones(self, x, y):
        # None must pair with None; numbers compare approximately.
        if x is None:
            self.assert_(y is None)
        else:
            self.assertAlmostEqual(x, y)

    def test_read(self):
        self.file = DataFile()
        botnc.read(self.file, self.infile)
        nitrite_values = (0.11, None, 0.08, 0.08, 0.08, 0.08, 0.06, 0.03,
                          0.06, 0.04, 0.03, None, 0.03, None, 0.03, None)
        map(lambda x: self.assertAlmostEqualOrNones(*x),
            zip(nitrite_values, self.file.columns['NITRIT'].values))
        freon11_values = (6.063, 6.055, 5.795, 5.619, 5.486, 5.508, 5.487,
                          5.683, 5.422, 5.190, 5.222, None, 5.289, None,
                          5.250, 5.254)
        map(lambda x: self.assertAlmostEqualOrNones(*x),
            zip(freon11_values, self.file.columns['CFC-11'].values))
        freon113_values = (None, ) * 16
        map(lambda x: self.assertAlmostEqualOrNones(*x),
            zip(freon113_values, self.file.columns['CFC113'].values))
        expocodes = ['33RR20070204'] * 16
        self.assertEqual(expocodes, self.file.columns['EXPOCODE'].values)

    def test_read_multiple(self):
        self.file = DataFile()
        botnc.read(self.file, self.infile)
        nitrite_values = (0.11, None, 0.08, 0.08, 0.08, 0.08, 0.06, 0.03,
                          0.06, 0.04, 0.03, None, 0.03, None, 0.03, None)
        map(lambda x: self.assertAlmostEqualOrNones(*x),
            zip(nitrite_values, self.file.columns['NITRIT'].values))
        freon11_values = (6.063, 6.055, 5.795, 5.619, 5.486, 5.508, 5.487,
                          5.683, 5.422, 5.190, 5.222, None, 5.289, None,
                          5.250, 5.254)
        map(lambda x: self.assertAlmostEqualOrNones(*x),
            zip(freon11_values, self.file.columns['CFC-11'].values))
        freon113_values = (None, ) * 16
        map(lambda x: self.assertAlmostEqualOrNones(*x),
            zip(freon113_values, self.file.columns['CFC113'].values))
        expocodes = ['33RR20070204'] * 16
        self.assertEqual(expocodes, self.file.columns['EXPOCODE'].values)

        # Read second file
        infile2 = open(sample_file('nc_hyd', 'p03a_00199_00001_hy1.nc'), 'r')
        botnc.read(self.file, infile2)

        # Make sure all columns have the same length
        length = None
        for c in self.file.columns.values():
            if not length:
                length = len(c.values)
            else:
                self.assertEquals(len(c.values), length)
            if c.is_flagged_woce():
                self.assertEquals(len(c.flags_woce), length)
            if c.is_flagged_igoss():
                self.assertEquals(len(c.flags_igoss), length)

        # Test parameter in first file not in second is filled with None.
        freon113_values += (None, ) * 36
        map(lambda x: self.assertAlmostEqualOrNones(*x),
            zip(freon113_values, self.file.columns['CFC113'].values))

        # Test parameter in both files are filled in correctly.
        freon11_values += (1.437, 1.501, 1.515, 1.525, 1.578, 1.596, 1.602,
                           1.725, 1.650, 1.703, 1.694, 1.437, 1.059, 0.702,
                           0.303, 0.130, 0.040, 0.015, -0.001, 0.002, 0.000,
                           None, None, 0.012, None, 0.006, None, None, None,
                           0.014, None, 0.000, None, 0.014, None, -0.001)
        map(lambda x: self.assertAlmostEqualOrNones(*x),
            zip(freon11_values, self.file.columns['CFC-11'].values))
        infile2.close()

    def test_write(self):
        # Build a minimal single-row file and write it to a temp netCDF.
        self.file = DataFile()
        g = self.file.globals
        self.file['EXPOCODE'] = Column('EXPOCODE')
        self.file['EXPOCODE'].append('TESTEXPO')
        self.file['SECT_ID'] = Column('SECT_ID')
        self.file['SECT_ID'].append('TEST')
        self.file['STNNBR'] = Column('CASTNO')
        self.file['STNNBR'].append(5)
        self.file['CASTNO'] = Column('STNNBR')
        self.file['CASTNO'].append(20)
        self.file['DEPTH'] = Column('DEPTH')
        self.file['DEPTH'].append(-1)
        self.file['LATITUDE'] = Column('LATITUDE')
        self.file['LATITUDE'].append(90)
        self.file['LONGITUDE'] = Column('LONGITUDE')
        self.file['LONGITUDE'].append(180)
        self.file['_DATETIME'] = Column('_DATETIME')
        self.file['_DATETIME'].append(datetime.utcnow())
        self.file['BTLNBR'] = Column('BTLNBR')
        self.file['BTLNBR'].append(5, 9)
        self.file['CTDOXY'] = Column('CTDOXY')
        self.file['CTDOXY'].append(1, 2)
        self.file.check_and_replace_parameters()
        p = self.file['CTDOXY'].parameter
        p.description = 'ctd oxygen'
        p.bound_lower = 0
        p.bound_upper = 200
        botnc.write(self.file, NamedTemporaryFile())
def test_functional_scripts_ctdex(self):
    """Test merging CTD Exchange files."""
    from argparse import Namespace
    from libcchdo.scripts import merge_ctdex_and_ctdex
    # Origin and derivative fixtures differ in CTDSAL flag, CTDOXY value,
    # and the derivative adds a TRANSM column.
    with TemporaryFile() as origin, \
            TemporaryFile() as deriv, \
            NamedTemporaryFile(delete=False) as output:
        origin.write("""\
CTD,20120515ODF
# REPORTED CAST DEPTH IS CTD_DEPTH + DISTANCE_ABOVE_BOTTOM AT MAX PRESSURE
NUMBER_HEADERS = 11
EXPOCODE = 33AT20120419
SECT_ID = A20
STNNBR = 1
CASTNO = 1
DATE = 20120421
TIME = 1552
LATITUDE = 6.8682
LONGITUDE = -53.4793
DEPTH = 66
INSTRUMENT_ID = 796
CTDPRS,CTDPRS_FLAG_W,CTDTMP,CTDTMP_FLAG_W,CTDSAL,CTDSAL_FLAG_W,CTDOXY,CTDOXY_FLAG_W,CTDNOBS,CTDETIME
DBAR,,ITS-90,,PSS-78,,UMOL/KG,,,
0.0,6, 27.7514,6, 31.2862,6, 229.5,6, 1, 629.9
2.0,2, 27.7223,2, 31.3925,2, 229.5,2, 11, 640.0
""")
        origin.flush()
        origin.seek(0)
        deriv.write("""\
CTD,20120515ODF
# REPORTED CAST DEPTH IS CTD_DEPTH + DISTANCE_ABOVE_BOTTOM AT MAX PRESSURE
NUMBER_HEADERS = 11
EXPOCODE = 33AT20120419
SECT_ID = A20
STNNBR = 1
CASTNO = 1
DATE = 20120421
TIME = 1552
LATITUDE = 6.8682
LONGITUDE = -53.4793
DEPTH = 66
INSTRUMENT_ID = 796
CTDPRS,CTDPRS_FLAG_W,CTDTMP,CTDTMP_FLAG_W,CTDSAL,CTDSAL_FLAG_W,CTDOXY,CTDOXY_FLAG_W,TRANSM,TRANSM_FLAG_W,CTDNOBS,CTDETIME
DBAR,,ITS-90,,PSS-78,,UMOL/KG,,0-5VDC,,,
0.0,6, 27.7514,6, 31.2862,2, 222.2,6, 4.3348,1, 1, 629.9
2.0,2, 27.7223,2, 31.3925,2, 229.5,2, 4.3334,1, 11, 640.0
""")
        deriv.flush()
        deriv.seek(0)

        args = Namespace()
        args.origin = origin
        args.derivative = deriv
        args.parameters_to_merge = None
        args.merge_different = True
        args.output = output
        args.guess_key = True
        merge_ctdex_and_ctdex(args)

        # Re-read the merged output and verify the derivative's values
        # were taken.
        with open(output.name) as fff:
            dfile = DataFile()
            ctdex.read(dfile, fff)
            self.assertEqual(dfile['CTDSAL'].flags_woce, [2, 2])
            self.assertEqual(map(str, dfile['TRANSM'].values),
                             ['4.3348', '4.3334'])
            self.assertEqual(dfile['TRANSM'].flags_woce, [1, 1])
        unlink(output.name)
def test_integration_merge_btl(self):
    # Two bottle Exchange fixtures: the derivative updates DELC14 and
    # PH_SWS and also contains an extra cast (station 600) absent from
    # the origin.
    with TemporaryFile() as origin, \
            TemporaryFile() as deriv:
        origin.write("""\
BOTTLE,19700101CCHSIOYYY
# header 1
EXPOCODE,SECT_ID,STNNBR,CASTNO,SAMPNO,BTLNBR,BTLNBR_FLAG_W,DEPTH,TDN,DELC14,DELC14_FLAG_W,PH_SWS,PH_SWS_FLAG_W
,,,,,,,METERS,UMOL/KG,/MILLE,,,
316N145_9, TRNS1, 574, 1, 36, 36,2,1000,5,-999.000,9,11,9
316N145_9, TRNS1, 574, 1, 35, 35,2,1000,5,-999.000,9,22,9
316N145_9, TRNS1, 574, 1, 34, 34,2,1000,5,-999.000,9,33,9
316N145_9, TRNS1, 574, 1, 32, 32,2,1000,5,-999.000,9,44,9
END_DATA
""")
        origin.flush()
        origin.seek(0)
        deriv.write("""\
BOTTLE,19700101CCHSIOYYY
# header 2
EXPOCODE,SECT_ID,STNNBR,CASTNO,SAMPNO,BTLNBR,BTLNBR_FLAG_W,DEPTH,TDN,DELC14,DELC14_FLAG_W,PH_SWS,PH_SWS_FLAG_W
,,,,,,,METERS,UMOL/KG,/MILLE,,,
316N145_9, TRNS1, 574, 1, 36, 36,2,1000,5, 10.000,9,-999.0,9
316N145_9, TRNS1, 574, 1, 35, 35,2,1000,5,-999.000,1,-999.0,9
316N145_9, TRNS1, 574, 1, 34, 34,2,1000,5,-999.000,9,-999.0,9
316N145_9, TRNS1, 600, 1, 1, 1,2,1000,5,-999.000,9,-999.0,9
END_DATA
""")
        deriv.flush()
        deriv.seek(0)

        dfo = DataFile()
        dfd = DataFile()
        btlex.read(dfo, origin)
        btlex.read(dfd, deriv)

        p_different, p_not_in_orig, p_not_in_deriv, p_common = \
            different_columns(dfo, dfd, BOTTLE_KEY_COLS)
        parameters = p_different + p_not_in_orig
        keys = determine_bottle_keys(dfo, dfd)
        self.assertEqual(
            keys, ('EXPOCODE', 'STNNBR', 'CASTNO', 'SAMPNO', 'BTLNBR'))
        parameters = list(OrderedSet(parameters) - OrderedSet(keys))

        # Parameters with underscores in them may be confused when
        # matching flags with them. E.g. PH_SWS_FLAG_W should be matched
        # with PH_SWS not PH.
        dfile = merge_datafiles(dfo, dfd, keys, parameters)
        self.assertEqual(dfile['DELC14'][0], _decimal('10.000'))
        self.assertEqual(dfile['DELC14'].flags_woce[1], 1)

        # Header should be the origin file's header
        self.assertNotIn('header 2', dfile.globals['header'])
        self.assertIn('header 1', dfile.globals['header'])
        # Header should contain the merged parameters
        self.assertIn('Merged parameters: PH_SWS, DELC14, DELC14_FLAG_W',
                      dfile.globals['header'])
        # No double new lines
        self.assertNotIn('\n\n', dfile.globals['header'])
        # new line for header is not included in the writers
        self.assertEqual('\n', dfile.globals['header'][-1])

        # Key columns should not have been converted to floats. This
        # happens for some reason if pandas combine/update have been used.
        self.assertEqual(str(dfile['STNNBR'][0]), '574')
        self.assertEqual(str(dfile['CASTNO'][0]), '1')
        self.assertEqual(str(dfile['SAMPNO'][0]), '36')
        self.assertEqual(str(dfile['BTLNBR'][0]), '36')
        self.assertEqual(str(dfile['PH_SWS'][0]), 'None')

        # Extra keys in derivative file should not be merged in.
        self.assertNotIn(600, dfile['STNNBR'])

        # Make sure warning is printed regarding extra key in deriv file.
        lines = [[
            'Key ', 'does not exist in origin from derivative rows', '600'
        ]]
        self.assertTrue(self.ensure_lines(lines))
def read(dfc, fileobj, cfg):
    """Read generic HRP matlab file.

    dfc - DataFileCollection to append one DataFile per profile to
    fileobj - matlab file readable by load_mat_hrp
    cfg - dict with a "parameter_mapping" of matlab keys to column names
    """
    mat, hrp = load_mat_hrp(fileobj)
    data = hrp_data_as_dict(hrp)

    # Profile coordinates; removed from the dict so that only measured
    # parameters remain.
    coords = zip(data['lon'][0], data['lat'][0])
    del data['lat']
    del data['lon']

    for key in data.keys():
        log.info(u'parameter shape: {0} {1}'.format(key, data[key].shape))

    # Keep only parameters present in the configured mapping, renaming
    # them to their configured names. (Python 2: keys() is a list, so
    # deleting while iterating is safe here.)
    param_map = cfg["parameter_mapping"]
    for param in data.keys():
        if param not in param_map:
            del data[param]
        else:
            new_key = param_map[param]
            if new_key != param:
                data[new_key] = data[param]
                del data[param]

    # One DataFile per profile coordinate.
    for coord in coords:
        dfile = DataFile()
        dfc.append(dfile)
        dfile.globals['LONGITUDE'] = _decimal(coord[0])
        dfile.globals['LATITUDE'] = _decimal(coord[1])
        # create the columns after extraneous keys have been deleted
        dfile.create_columns(data.keys())

    for dep, dfile in enumerate(dfc):
        dfile.globals['STNNBR'] = dep + 1
        # PRESSURE's non-fill span is the reference data range for the
        # cast; other parameters must fit inside it.
        ref_range = ndarray_data_slice(data['PRESSURE'][:, dep])
        for param, pdata in data.items():
            col = dfile[param]
            data_col = pdata[:, dep]
            drange = ndarray_data_slice(data_col)
            if ref_range is None:
                ref_range = drange
                # NOTE(review): ``determiner`` is assigned but never read.
                determiner = param
            elif drange != ref_range:
                if drange[0] == drange[1]:
                    log.info(u'No data for {0}. Skip.'.format(param))
                    continue
                if not is_data_range_inside(drange, ref_range):
                    log.error(u'{0} has data range {1} outside {2}. '
                              'Skip.'.format(param, drange, ref_range))
                    continue
            col.values = map(
                _decimal, list(data_col[ref_range[0]:ref_range[1]]))
            # Act as if all files had QC and assign it to OceanSITES 1.
            # Assuming that someone has already gone through level 0 data
            # and we are receiving level 1 or higher. We can set all flags
            # to 2.
            col.flags_woce = [9 if isnan(val) else 2 for val in col.values]

    # Somehow, HRP matlab data can have nans in the coordinate arrays. We
    # can't recalculate depth from that or make other assumptions so we
    # can only delete them. Iterate reversed so removal is safe.
    for iii, dfile in reversed(list(enumerate(dfc))):
        if (isnan(dfile.globals['LATITUDE']) or
                isnan(dfile.globals['LONGITUDE'])):
            log.warn(u'Unable to determine coordinate for matlab row '
                     '{0}. Discarding.'.format(iii))
            dfc.files.remove(dfile)
def test_merge_datafiles(self):
    """Merge datafiles.

    When merging data files, there are two cases to consider:

    Case 1: Adding new column

        If the derivative file has less records, fill in missing records
        with fill values and missing flags.

    Case 2: Updating column data

        It should also be possible to specifically only merge flags.
        Make sure if only merging flags to not merge the data.

    Parameter units should be updated from the derivative.
    """
    df0 = DataFile()
    df0.create_columns(['CTDPRS', 'NITRAT', 'NITRIT', 'CTDOXY'])
    df0['CTDPRS'].append(1, 2)
    df0['CTDPRS'].append(2, 2)
    df0['NITRAT'].append(10, 2)
    df0['NITRAT'].append(11, 2)
    df0['NITRIT'].append(30, 5)
    df0['NITRIT'].append(31, 6)
    df0['CTDOXY'].append(40, 2)
    df0['CTDOXY'].append(41, 3)
    df1 = DataFile()
    df1.create_columns(['CTDPRS', 'NITRAT', 'CTDSAL', 'CTDOXY'])
    df1['CTDPRS'].append(2, 2)
    df1['CTDPRS'].append(3, 2)
    df1['CTDSAL'].append(20, 2)
    df1['CTDSAL'].append(21, 2)
    df1['NITRAT'].append(12, 4)
    df1['NITRAT'].append(13, 4)
    df1['CTDOXY'].append(40, 2)
    df1['CTDOXY'].append(41, 3)
    df1['CTDOXY'].parameter.units = Unit('UMOL/KG')

    # Case 1 column add
    mdf = merge_datafiles(
        df0, df1, ['CTDPRS'],
        ['NITRAT', 'NITRAT_FLAG_W', 'CTDSAL', 'CTDSAL_FLAG_W', 'CTDOXY'])
    self.assertEqual(mdf['CTDPRS'].values, [1, 2])
    # Make sure missing values and flags are filled in.
    self.assertEqual(mdf['CTDSAL'].values, [None, 20])
    self.assertEqual(mdf['CTDSAL'].flags_woce, [9, 2])
    # Case 2 data update
    self.assertEqual(mdf['NITRAT'].values, [10, 12])
    self.assertEqual(mdf['NITRAT'].flags_woce, [2, 4])
    # Columns in origin should be kept
    self.assertEqual(mdf['NITRIT'].values, [30, 31])
    self.assertEqual(mdf['NITRIT'].flags_woce, [5, 6])
    # Units should be overwritten for merged columns
    self.assertEqual(
        mdf['CTDOXY'].parameter.units, df1['CTDOXY'].parameter.units)
    # Make sure warning is printed regarding unit overwrite.
    # This doubles to make sure derivative columns do not wholesale
    # overwrite the origin column, they must be merged using the row
    # match algo.
    lines = [
        "Changed units for CTDOXY from '' to 'UMOL/KG'",
    ]
    self.assertTrue(self.ensure_lines(lines))
def test_merge_collections(self):
    """When merging collections, map files, then merge mapped files."""
    odfc = DataFileCollection()
    ddfc = DataFileCollection()
    # Both files share EXPOCODE/STNNBR/CASTNO so they map to each other.
    df0 = DataFile()
    df0.globals['EXPOCODE'] = 'a'
    df0.globals['STNNBR'] = 1
    df0.globals['CASTNO'] = 1
    df0.create_columns(['CTDPRS', 'NITRAT', 'NITRIT'])
    df0['CTDPRS'].append(1, 2)
    df0['CTDPRS'].append(2, 2)
    df0['NITRAT'].append(10, 2)
    df0['NITRAT'].append(11, 2)
    df0['NITRIT'].append(10, 2)
    df0['NITRIT'].append(11, 2)
    odfc.append(df0)
    df1 = DataFile()
    df1.globals['EXPOCODE'] = 'a'
    df1.globals['STNNBR'] = 1
    df1.globals['CASTNO'] = 1
    df1.create_columns(['CTDPRS', 'NITRAT', 'NITRIT'])
    df1['CTDPRS'].append(1, 2)
    df1['CTDPRS'].append(3, 2)
    df1['NITRAT'].append(20, 2)
    df1['NITRAT'].append(21, 2)
    df1['NITRIT'].append(10, 2)
    df1['NITRIT'].append(11, 2)
    ddfc.append(df1)

    def merger(origin, deriv):
        return merge_datafiles(
            origin, deriv, ['CTDPRS'], ['NITRAT', 'NITRIT'])

    merged_dfc = merge_collections(odfc, ddfc, merger)
    self.assertEqual(merged_dfc.files[0]['CTDPRS'].values, [1, 2])
    self.assertEqual(merged_dfc.files[0]['NITRAT'].values, [20, 11])
    self.assertEqual(merged_dfc.files[0]['NITRIT'].values, [10, 11])
    lines = [
        # df1 has an different CTDPRS record (3)
        'Key (3,) does not exist in origin from derivative rows',
        # NITRIT columns are the same
        "Instructed to merge parameters that are not different: ['NITRIT']"
    ]
    self.assertTrue(self.ensure_lines(lines))
def test_different_columns(self):
    """Columns between two datafiles differ under a wide variety of cases.

    Case 1: Column values are different
    Case 1 corollary: Flag values are different
    Case 2: Units are different
    Case 3: Column not in original
    Case 4: Column not in derivative
    """
    with TemporaryFile() as origin, TemporaryFile() as deriv:
        origin.write("""\
BOTTLE,19700101CCHSIOYYY
# header 1
EXPOCODE,SECT_ID,STNNBR,CASTNO,SAMPNO,BTLNBR,BTLNBR_FLAG_W,LATITUDE,LONGITUDE,DATE,TIME,DEPTH,NITRAT,NITRAT_FLAG_W,NITRIT,DELC14,DELC14_FLAG_W
,,,,,,,,,,,METERS,UMOL/KG,,UMOL/KG,/MILLE,
316N145_9, TRNS1, 574, 1, 16, 36, 2, 0, 0, 19700101, 0000,1000,3.00,2,10.0,-999.000,9
316N145_9, TRNS1, 574, 1, 15, 35, 2, 0, 0, 19700101, 0000,1000,4.00,2,10.0,-999.000,9
END_DATA
""")
        origin.flush()
        origin.seek(0)
        deriv.write("""\
BOTTLE,19700101CCHSIOYYY
# header 2
EXPOCODE,SECT_ID,STNNBR,CASTNO,SAMPNO,BTLNBR,BTLNBR_FLAG_W,LATITUDE,LONGITUDE,DATE,TIME,DEPTH,TDN,TDN_FLAG_W,NITRIT,DELC14,DELC14_FLAG_W,PH_SWS,PH_SWS_FLAG_W
,,,,,,,,,,,METERS,UMOL/KG,,NMOL/KG,/MILLE,,,
316N145_9, TRNS1, 574, 1, 16, 36, 2, 0, 0, 19700101, 0000,1000,6.00,3,10.0,-999.000,1,-999.0,9
316N145_9, TRNS1, 574, 1, 15, 35, 2, 0, 0, 19700101, 0000,1000,5.00,3,10.0, 10.000,9,-999.0,9
END_DATA
""")
        deriv.flush()
        deriv.seek(0)

        dforigin = DataFile()
        dfderiv = DataFile()
        btlex.read(dforigin, origin)
        btlex.read(dfderiv, deriv)
        self.assertEqual(
            # NITRIT comes after because NMOL/KG is not an expected unit
            # and gets pushed to the end when sorting
            (
                ['DELC14', 'DELC14_FLAG_W', 'NITRIT'],
                # PH_SWS_FLAG_W has underscores inside the parameter
                # name. All parts need to be included
                ['PH_SWS', 'PH_SWS_FLAG_W', 'TDN', 'TDN_FLAG_W'],
                ['NITRAT', 'NITRAT_FLAG_W'],
                [
                    'EXPOCODE', 'SECT_ID', 'STNNBR', 'CASTNO', 'SAMPNO',
                    'BTLNBR', 'BTLNBR_FLAG_W', 'LATITUDE', 'LONGITUDE',
                    'DEPTH', '_DATETIME'
                ]),
            different_columns(dforigin, dfderiv, (
                'EXPOCODE', 'SECT_ID', 'STNNBR', 'CASTNO', 'SAMPNO',
                'BTLNBR',
            )))
        lines = [
            "DELC14 differs at origin row 1:\t(None, Decimal('10.000'))",
            "DELC14_FLAG_W differs at origin row 0:\t(9, 1)",
        ]
        self.assertTrue(self.ensure_lines(lines))

        # Columns are not different if merged results are not different.
        dfo = DataFile()
        dfd = DataFile()
        dfo.create_columns(['CTDPRS', 'CTDOXY'])
        dfo.check_and_replace_parameters()
        dfd.create_columns(['CTDPRS', 'CTDOXY'])
        dfd.check_and_replace_parameters()
        dfo['CTDPRS'].values = [1, 2, 3]
        dfo['CTDOXY'].values = [10, 20, 30]
        dfd['CTDPRS'].values = [3, 2, 1]
        dfd['CTDOXY'].values = [30, 20, 10]
        self.assertEqual(
            ([], [], [], ['CTDPRS', 'CTDOXY']),
            different_columns(dfo, dfd, ('CTDPRS', )))