def test_concatenate_size(self):
    #--------------------------------------------------------------------------------------
    # Test with a concatenate size greater than the length of the virtual dataset
    #--------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=175)

    out = generator.next()

    # assert the result...
    truth1 = out['temperature']['values'] == self.t_result
    truth2 = out['salinity']['values'] == self.s_result
    truth3 = out['pressure']['values'] == self.p_result

    self.assertTrue(truth1.all())
    self.assertTrue(truth2.all())
    self.assertTrue(truth3.all())

    with self.assertRaises(StopIteration):
        out = generator.next()

    #--------------------------------------------------------------------------------------------------------------------------
    # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) == 0
    #--------------------------------------------------------------------------------------------------------------------------
    concatenate_size = 14

    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size)

    self.check_pieces_3_variables_2d(generator, self.slice_tuple, concatenate_size)

    #--------------------------------------------------------------------------------------------------------------------------
    # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) != 0
    #--------------------------------------------------------------------------------------------------------------------------
    bounds = (slice(2, 4), slice(2, 8))  # on the x axis, choose indices 2..3; on the y axis, choose indices 2..7

    #@todo calculate the slice_tuple
    # slice_tuple =
    concatenate_size = 26

    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size,
                             bounds=bounds)

    # assert the result...
    self.check_pieces_3_variables_2d(generator, bounds, concatenate_size)
def test_concatenate_size(self):
    #--------------------------------------------------------------------------------------
    # Test with a concatenate size greater than the length of the virtual dataset
    #--------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=175)

    out = generator.next()

    # assert the result...
    truth1 = out['temperature']['values'] == self.t_result
    truth2 = out['salinity']['values'] == self.s_result
    truth3 = out['pressure']['values'] == self.p_result

    self.assertTrue(truth1.all())
    self.assertTrue(truth2.all())
    self.assertTrue(truth3.all())

    with self.assertRaises(StopIteration):
        out = generator.next()

    #--------------------------------------------------------------------------------------------------------------------------
    # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) == 0
    #--------------------------------------------------------------------------------------------------------------------------
    concatenate_size = 25

    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size)

    self.check_pieces_3_variables_1d(generator, self.sl, concatenate_size)

    #--------------------------------------------------------------------------------------------------------------------------
    # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) != 0
    #--------------------------------------------------------------------------------------------------------------------------
    sl = slice(3, 63)
    concatenate_size = 26

    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size,
                             bounds=sl)

    # assert the result...
    self.check_pieces_3_variables_1d(generator, sl, concatenate_size)
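# Illustrative sketch (not part of the source tests): the calls above assume acquire_data
# behaves as a generator factory, roughly acquire_data(hdf_files, var_names, concatenate_size,
# bounds=None), yielding one dict per chunk of at most concatenate_size records and raising
# StopIteration once the (possibly bounded) dataset is exhausted. A helper like the one below
# drains such a generator; the name drain_chunks is made up for this example.
def drain_chunks(generator):
    chunks = []
    for out in generator:   # each 'out' is one yielded chunk dict
        chunks.append(out)
    return chunks           # the loop ends when the generator raises StopIteration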
def sub_listen(msg, headers):
    assertions(isinstance(msg, StreamGranuleContainer), 'replayed message is not a granule.')

    hdf_string = msg.identifiables[data_stream_id].values
    sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
    assertions(sha1 == msg.identifiables[encoding_id].sha1, 'Checksum failed.')
    assertions(msg.identifiables[element_count_id].value == 1,
               'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)

    output_file = FileSystem.mktemp()
    output_file.write(msg.identifiables[data_stream_id].values)
    output_file_path = output_file.name
    output_file.close()

    output_vectors = acquire_data([output_file_path], fields, 2).next()
    for field in fields:
        comparison = (input_vectors[field]['values'] == output_vectors[field]['values'])
        assertions(comparison.all(),
                   'vector mismatch: %s vs %s' % (input_vectors[field]['values'], output_vectors[field]['values']))

    FileSystem.unlink(output_file_path)
    ar.set(True)
def _get_time_index(self, granule, timeval):
    '''
    @brief Obtains the index at which a time value occurs
    @param granule must be a complete dataset (hdf_string provided)
    @param timeval the vector value
    @return Index value for timeval or closest approx such that timeval is IN the subset
    '''
    assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
    assert granule.identifiables[self.data_stream_id].values, 'hdf_string is not provided.'

    hdf_string = granule.identifiables[self.data_stream_id].values
    file_path = self._get_hdf_from_string(hdf_string)

    #-------------------------------------------------------------------------------------
    # Determine the field_id for the temporal coordinate vector (aka time)
    #-------------------------------------------------------------------------------------
    time_field = self.definition.identifiables[self.time_id].coordinate_ids[0]
    value_path = granule.identifiables[time_field].values_path or self.definition.identifiables[time_field].values_path
    record_count = granule.identifiables[self.element_count_id].value

    #-------------------------------------------------------------------------------------
    # Go through the time vector and get the index that corresponds to timeval.
    # It finds the index i such that
    # time_vector[i] <= timeval < time_vector[i+1]
    #-------------------------------------------------------------------------------------
    var_name = value_path.split('/').pop()
    res = acquire_data([file_path], [var_name], record_count).next()
    time_vector = res[var_name]['values']

    retval = 0
    for i in xrange(len(time_vector)):
        if time_vector[i] == timeval:
            retval = i
            break
        elif i == 0 and time_vector[i] > timeval:
            retval = i
            break
        elif (i + 1) < len(time_vector):  # not last val
            if time_vector[i] < timeval and time_vector[i + 1] > timeval:
                retval = i
                break
        else:  # last val
            retval = i
            break

    FileSystem.unlink(file_path)
    return retval
def _slice(self, granule, slice_):
    '''
    @brief Creates a granule which is a slice of the granule parameter
    @param granule the superset
    @param slice_ The slice values for which to create the granule
    @return Crafted subset granule of the parameter granule.
    '''
    retval = copy.deepcopy(granule)
    fields = self._list_data(self.definition, granule)
    record_count = slice_.stop - slice_.start
    assert record_count > 0, 'slice is malformed'
    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])  # Get the var_names from the pairs
    log.debug('var_names: %s', var_names)
    file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
    codec = HDFEncoder()
    vectors = acquire_data([file_path], var_names, record_count, slice_).next()

    for row, value in vectors.iteritems():
        vp = self._find_vp(pairs, row)
        # Determine the range_id by reverse dictionary lookup
        #@todo: improve this pattern
        for field, path in fields.iteritems():
            if vp == path:
                range_id = field
                break
        bounds_id = retval.identifiables[range_id].bounds_id
        # Recalculate the bounds for this field and update the granule
        range = value['range']
        retval.identifiables[bounds_id].value_pair[0] = float(range[0])
        retval.identifiables[bounds_id].value_pair[1] = float(range[1])
        codec.add_hdf_dataset(vp, value['values'])
        record_count = len(value['values'])
        #----- DEBUGGING ---------
        log.debug('slice- row: %s', row)
        log.debug('slice- value_path: %s', vp)
        log.debug('slice- range_id: %s', range_id)
        log.debug('slice- bounds_id: %s', bounds_id)
        log.debug('slice- limits: %s', value['range'])
        #-------------------------

    retval.identifiables[self.element_count_id].value = record_count
    hdf_string = codec.encoder_close()
    self._patch_granule(retval, hdf_string)
    FileSystem.unlink(file_path)
    return retval
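# Illustrative sketch (assumption inferred from _slice above, not source code): each chunk
# yielded by acquire_data appears to map a variable name to a dict holding the values read
# plus their observed min/max, i.e. {name: {'values': ndarray, 'range': (lo, hi)}}.
# build_chunk is a hypothetical helper showing how such a structure could be assembled
# from plain, non-empty numeric arrays.
import numpy as np

def build_chunk(arrays):
    chunk = {}
    for name, values in arrays.items():
        values = np.asarray(values)
        chunk[name] = {'values': values, 'range': (values.min(), values.max())}
    return chunk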
def test_acquire_data_closes_files_when_exception(self, h5mock):
    # Even when acquire_data is given bad arguments and fails with a TypeError,
    # any HDF file it opened should still be closed.
    self.assertRaises(TypeError, acquire_data(['anything'], [], [], []))
    h5mock.File.close.assert_called_once_with()
def test_bounds(self):
    #---------------------------------------------------------------------------------------------------
    # Test with bad input that is neither a slice nor a tuple...
    #---------------------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity'],
                             concatenate_size=26,
                             bounds='s')

    # Assert an error.
    with self.assertRaises(BadRequest):
        out = generator.next()

    #@todo can we make the error more transparent to the user - easier to correct their mistake?

    #---------------------------------------------------------------------------------------------------
    # Test with a 2-tuple of slices on the 1d dataset
    #---------------------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity'],
                             concatenate_size=26,
                             bounds=(slice(63, 120), slice(63, 120)))

    # Assert an error.
    with self.assertRaises(BadRequest):
        out = generator.next()

    #---------------------------------------------------------------------------------------------------
    # Test with bounds greater than the dataset length
    #---------------------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity'],
                             concatenate_size=200,
                             bounds=slice(0, 200))

    out = generator.next()

    # Assert result is the whole dataset
    truth1 = out['temperature']['values'] == self.t_result
    truth2 = out['salinity']['values'] == self.s_result

    self.assertTrue(truth1.all())
    self.assertTrue(truth2.all())

    # try to get the stop iteration by iterating again
    with self.assertRaises(StopIteration):
        out = generator.next()

    #---------------------------------------------------------------------------------------------------
    # Test with normal bounds slice
    #---------------------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=60,
                             bounds=slice(30, 80))

    out = generator.next()

    # assert result
    truth1 = out['temperature']['values'] == self.t_result[30:80]
    truth2 = out['salinity']['values'] == self.s_result[30:80]

    self.assertTrue(truth1.all())
    self.assertTrue(truth2.all())

    with self.assertRaises(StopIteration):
        out = generator.next()

    #---------------------------------------------------------------------------------------------------
    # Test with no bounds
    #---------------------------------------------------------------------------------------------------
    concatenate_size = 60
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size)

    # assert result
    self.check_pieces_3_variables_1d(generator, self.sl, concatenate_size)

    #---------------------------------------------------------------------------------------------------
    # Test with concatenate larger than bounds slice
    #---------------------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=60,
                             bounds=slice(30, 50))

    out = generator.next()

    # assert result
    truth1 = out['temperature']['values'] == self.t_result[30:50]
    truth2 = out['salinity']['values'] == self.s_result[30:50]
    truth3 = out['pressure']['values'] == self.p_result[30:50]

    self.assertTrue(truth1.all())
    self.assertTrue(truth2.all())
    self.assertTrue(truth3.all())

    with self.assertRaises(StopIteration):
        out = generator.next()

    #---------------------------------------------------------------------------------------------------
    # Test with concatenate smaller than bounds slice
    #---------------------------------------------------------------------------------------------------
    sl = slice(30, 100)
    concatenate_size = 10
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size,
                             bounds=sl)

    # assert result
    self.check_pieces_3_variables_1d(generator, sl, concatenate_size)
def test_var_names(self):
    # Test with no names
    # assert an error?
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=None,
                             concatenate_size=26,
                             bounds=slice(63, 120))

    with self.assertRaises(NotFound):
        out = generator.next()

    #---------------------------------------------------------------------------------------------------
    # Test with all names
    #---------------------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=26,
                             bounds=slice(63, 120))

    out = generator.next()

    # assert result
    truth1 = out['temperature']['values'] == self.t_result[63:89]
    truth2 = out['salinity']['values'] == self.s_result[63:89]
    truth3 = out['pressure']['values'] == self.p_result[63:89]

    self.assertTrue(truth1.all())
    self.assertTrue(truth2.all())
    self.assertTrue(truth3.all())

    #---------------------------------------------------------------------------------------------------
    # Test with some names
    #---------------------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity'],
                             concatenate_size=26,
                             bounds=slice(63, 120))

    out = generator.next()

    # assert result
    truth1 = out['temperature']['values'] == self.t_result[63:89]
    truth2 = out['salinity']['values'] == self.s_result[63:89]

    self.assertTrue(truth1.all())
    self.assertTrue(truth2.all())
    self.assertTrue('pressure' not in out)

    #---------------------------------------------------------------------------------------------------
    # Test with name not in dataset
    #---------------------------------------------------------------------------------------------------
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['biological_quotient'],
                             concatenate_size=26,
                             bounds=slice(63, 120))

    # assert an error
    with self.assertRaises(NotFound):
        out = generator.next()
def test_replay_integration(self):
    '''
    test_replay_integration
    '''
    import numpy as np
    # Keep the import: it's used in the vector comparison below even though pycharm says it's unused.
    cc = self.container
    XP = self.XP
    assertions = self.assertTrue

    ### Every thing below here can be run as a script:
    log.debug('Got it')

    pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
    ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
    dataset_management_service = DatasetManagementServiceClient(node=cc.node)
    data_retriever_service = DataRetrieverServiceClient(node=cc.node)

    datastore_name = 'dm_test_replay_integration'

    producer = Publisher(name=(XP, 'stream producer'))

    ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
        exchange_point_id=XP,
        couch_storage=CouchStorage(datastore_name=datastore_name, datastore_profile='SCIDATA'),
        hdf_storage=HdfStorage(),
        number_of_workers=1
    )

    ingestion_management_service.activate_ingestion_configuration(
        ingestion_configuration_id=ingestion_configuration_id
    )

    definition = SBE37_CDM_stream_definition()
    data_stream_id = definition.data_stream_id
    encoding_id = definition.identifiables[data_stream_id].encoding_id
    element_count_id = definition.identifiables[data_stream_id].element_count_id

    stream_def_id = pubsub_management_service.create_stream_definition(container=definition)
    stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

    dataset_id = dataset_management_service.create_dataset(
        stream_id=stream_id,
        datastore_name=datastore_name,
        view_name='datasets/dataset_by_id'
    )

    ingestion_management_service.create_dataset_configuration(
        dataset_id=dataset_id,
        archive_data=True,
        archive_metadata=True,
        ingestion_configuration_id=ingestion_configuration_id
    )
    definition.stream_resource_id = stream_id

    packet = _create_packet(definition)
    input_file = FileSystem.mktemp()
    input_file.write(packet.identifiables[data_stream_id].values)
    input_file_path = input_file.name
    input_file.close()

    fields = [
        'conductivity',
        'height',
        'latitude',
        'longitude',
        'pressure',
        'temperature',
        'time'
    ]

    input_vectors = acquire_data([input_file_path], fields, 2).next()

    producer.publish(msg=packet, to_name=(XP, '%s.data' % stream_id))

    replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)
    ar = gevent.event.AsyncResult()

    def sub_listen(msg, headers):
        assertions(isinstance(msg, StreamGranuleContainer), 'replayed message is not a granule.')
        hdf_string = msg.identifiables[data_stream_id].values
        sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
        assertions(sha1 == msg.identifiables[encoding_id].sha1, 'Checksum failed.')
        assertions(msg.identifiables[element_count_id].value == 1,
                   'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)
        output_file = FileSystem.mktemp()
        output_file.write(msg.identifiables[data_stream_id].values)
        output_file_path = output_file.name
        output_file.close()
        output_vectors = acquire_data([output_file_path], fields, 2).next()
        for field in fields:
            comparison = (input_vectors[field]['values'] == output_vectors[field]['values'])
            assertions(comparison.all(),
                       'vector mismatch: %s vs %s' % (input_vectors[field]['values'], output_vectors[field]['values']))
        FileSystem.unlink(output_file_path)
        ar.set(True)

    subscriber = Subscriber(name=(XP, 'replay listener'), callback=sub_listen)
    g = gevent.Greenlet(subscriber.listen, binding='%s.data' % replay_stream_id)
    g.start()

    data_retriever_service.start_replay(replay_id)
    ar.get(timeout=10)

    FileSystem.unlink(input_file_path)
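# Illustrative sketch (not part of the source test): sub_listen above verifies each replayed
# granule by comparing an upper-case hex SHA-1 of the HDF payload against the checksum stored
# in the granule's encoding identifiable. Isolated, that check looks like the helper below;
# sha1_matches is a made-up name for this example.
import hashlib

def sha1_matches(hdf_string, expected_sha1):
    return hashlib.sha1(hdf_string).hexdigest().upper() == expected_sha1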
def test_bounds(self):
    #---------------------------------------------------------------------------------------------------
    # Test with a single slice on the 2d dataset
    #---------------------------------------------------------------------------------------------------
    concatenate_size = 26
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size,
                             bounds=slice(2, 4))

    # Assert that it interprets the only slice provided as pertaining to the vertical dimension
    out = generator.next()

    self.check_pieces_3_variables_2d(generator, (slice(2, 4), self.slice_tuple[1]), concatenate_size)

    #---------------------------------------------------------------------------------------------------
    # Test with bounds greater than the dataset length
    #---------------------------------------------------------------------------------------------------
    # bounds = (slice(0,200), slice(0,200))
    # concatenate_size = 20
    #
    # generator = acquire_data(hdf_files = self.fnames,
    #                          var_names = ['temperature', 'salinity', 'pressure'],
    #                          concatenate_size = concatenate_size,
    #                          bounds = bounds
    #                          )
    #
    # out = generator.next()
    #
    # # Assert result is the whole dataset

    #---------------------------------------------------------------------------------------------------
    # Test with normal bounds slice
    #---------------------------------------------------------------------------------------------------
    bounds = (slice(2, 3), slice(2, 5))
    concatenate_size = 60
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size,
                             bounds=bounds)

    out = generator.next()

    # assert result
    self.check_pieces_3_variables_2d(generator, (slice(2, 4), self.slice_tuple[1]), concatenate_size)

    #---------------------------------------------------------------------------------------------------
    # Test with no bounds
    #---------------------------------------------------------------------------------------------------
    concatenate_size = 60
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size)

    # assert result
    self.check_pieces_3_variables_2d(generator, self.slice_tuple, concatenate_size)

    #---------------------------------------------------------------------------------------------------
    # Test with concatenate larger than bounds overall extent
    #---------------------------------------------------------------------------------------------------
    bounds = (slice(2, 5), slice(2, 10))
    concatenate_size = 200
    generator = acquire_data(hdf_files=self.fnames,
                             var_names=['temperature', 'salinity', 'pressure'],
                             concatenate_size=concatenate_size,
                             bounds=bounds)

    out = generator.next()

    # assert result
    self.check_pieces_3_variables_2d(generator, self.slice_tuple, concatenate_size)
def _merge(self, msgs):
    '''
    @brief Merges all the granules and datasets into one large dataset (Union)
    @param msgs raw granules from couch
    @return complete dataset
    @description
         n
    D := U [ msgs_i ]
        i=0
    '''
    granule = None
    file_list = list()
    count = len(msgs)
    used_vals = list()

    #-------------------------------------------------------------------------------------
    # Merge each granule to another granule one by one.
    # After each merge operation keep track of what files belong where on the timeline
    #-------------------------------------------------------------------------------------
    for i in xrange(count):
        if i == 0:
            granule = msgs[0]['granule']
            psc = PointSupplementConstructor(point_definition=self.definition)

            res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=None)
            granule = res['granule']
            file_pair = res['files']
            log.debug('file_pair: %s', file_pair)

            if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                file_list.append(tuple(file_pair[0]))
                used_vals.append(file_pair[0][0])

        else:
            res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=msgs[i]['granule'])

            granule = res['granule']
            file_pair = res['files']
            log.debug('file_pair: %s', file_pair)

            if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                file_list.append(tuple(file_pair[0]))
                used_vals.append(file_pair[0][0])
            if file_pair[1] not in file_list and file_pair[1][0] not in used_vals:
                file_list.append(tuple(file_pair[1]))
                used_vals.append(file_pair[1][0])

    if not granule:
        return
    log.debug('file_list: %s', file_list)

    #-------------------------------------------------------------------------------------
    # Order the lists using a stable sort from python (by the first value in the tuples)
    # Then peel off just the file names
    # Then get the appropriate URL for the file using FileSystem
    #-------------------------------------------------------------------------------------
    file_list.sort()
    file_list = list(i[1] for i in file_list)
    file_list = list([FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i) for i in file_list])

    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])

    record_count = granule.identifiables[self.element_count_id].value
    codec = HDFEncoder()
    log.debug('acquire_data:')
    log.debug('\tfile_list: %s', file_list)
    log.debug('\tfields: %s', var_names)
    log.debug('\trecords: %s', record_count)

    data = acquire_data(file_list, var_names, record_count).next()

    for row, value in data.iteritems():
        value_path = self._find_vp(pairs, row)
        codec.add_hdf_dataset(value_path, nparray=value['values'])
        #-------------------------------------------------------------------------------------
        # Debugging
        #-------------------------------------------------------------------------------------
        log.debug('row: %s', row)
        log.debug('value path: %s', value_path)
        log.debug('value: %s', value['values'])

    hdf_string = codec.encoder_close()
    self._patch_granule(granule, hdf_string)

    return granule
def subset(self, granule, coverages):
    '''
    @param granule
    @return dataset subset based on the fields
    '''
    assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
    field_ids = self.field_ids
    element_count_id = self.element_count_id

    values_path = list()
    domain_ids = list()
    coverage_ids = list()
    coverages = list(coverages)
    log.debug('Coverages include %s of type %s', coverages, type(coverages))

    #-----------------------------------------------------------------------------------------------------------
    # Iterate through the fields IAW stream definition and check for rangesets and coordinate axes
    # - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
    # - If it's a rangeset, make sure that it's part of what the client asked for; if not, discard it
    #-----------------------------------------------------------------------------------------------------------
    for field_id in field_ids:
        range_id = self.definition.identifiables[field_id].range_id

        #-------------------------------------------------------------------------------------
        # Coordinate Axis
        # - Keep track of this in our domains
        # - Add it to the paths we need to grab from the file(s)
        #-------------------------------------------------------------------------------------
        if isinstance(self.definition.identifiables[range_id], CoordinateAxis):
            log.debug('got a domain: %s' % range_id)
            domain_ids.append(field_id)
            if granule.identifiables.has_key(range_id):
                value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                values_path.append(value_path)
            else:
                value_path = self.definition.identifiables[range_id].values_path
                values_path.append(value_path)
            continue

        #-------------------------------------------------------------------------------------
        # Range Set
        # - If it's part of the coverages we want to keep
        #   - Add it to the list of ranges we're tracking
        #   - Add the value path to the paths we're tracking.
        #-------------------------------------------------------------------------------------
        if isinstance(self.definition.identifiables[range_id], RangeSet):
            # If it's a rangeset, a specified coverage and the granule has it, add it to the list
            if field_id in coverages:
                if granule.identifiables.has_key(range_id):
                    log.debug('got a range: %s' % range_id)
                    coverage_ids.append(field_id)
                    if granule.identifiables.has_key(range_id):
                        value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                        values_path.append(value_path)
                    else:
                        value_path = self.definition.identifiables[range_id].values_path
                        values_path.append(value_path)
                    continue

            # ----
            # We need to track the range and bounds because,
            # you guessed it, we need to update the bounds
            # ----
            range_id = self.definition.identifiables[field_id].range_id
            bounds_id = self.definition.identifiables[range_id].bounds_id

            #---
            # Lastly, if the field is there and we don't want it, we need to strip it
            #---
            if not (field_id in coverages):
                log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                log.debug('rebool: %s', bool(field_id in coverages))
                if granule.identifiables.has_key(range_id):
                    log.debug('Removing %s from granule', range_id)
                    del granule.identifiables[range_id]
                if granule.identifiables.has_key(bounds_id):
                    log.debug('Removing %s from granule', bounds_id)
                    del granule.identifiables[bounds_id]

    log.debug('Domains: %s', domain_ids)
    log.debug('Ranges: %s', coverage_ids)
    log.debug('Values_paths: %s', values_path)

    file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
    full_coverage = list(domain_ids + coverage_ids)

    log.debug('Full coverage: %s' % full_coverage)
    log.debug('Calling acquire_data with: %s, %s, %s', [file_path], values_path, granule.identifiables[element_count_id].value)

    codec = HDFEncoder()

    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])

    record_count = granule.identifiables[self.element_count_id].value
    data = acquire_data([file_path], var_names, record_count).next()
    for row, value in data.iteritems():
        vp = self._find_vp(pairs, row)
        codec.add_hdf_dataset(vp, value['values'])

    hdf_string = codec.encoder_close()
    self._patch_granule(granule, hdf_string)

    FileSystem.unlink(file_path)
    return granule