def test_add_hdf_dataset(self):
    """ Test adding a name and an array """
    testencoder = HDFEncoder()
    testencoder.add_hdf_dataset('test_dataset', self.known_array)
    testencoder.encoder_close()
def test_add_hdf_dataset_with_bad_array(self):
    """ Test adding a name and something other than an array """
    testencoder = HDFEncoder()
    with self.assertRaises(AssertionError):
        testencoder.add_hdf_dataset(self.dataset_name, 'bad array')
    testencoder.encoder_close()
def test_add_hdf_dataset_with_bad_name(self):
    """ Test adding a bad name and an array """
    testencoder = HDFEncoder()
    with self.assertRaises(AssertionError):
        self.dataset = testencoder.add_hdf_dataset('bad name', self.known_array)
    testencoder.encoder_close()
def test_encode_known_and_compare(self):
    """ Create an encoder and add some (one) dataset/array """
    hdfencoder = HDFEncoder()
    hdfencoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
    # Serialize to string and compare to a known value
    hdf_string = hdfencoder.encoder_close()
    self.assertEqual(sha1(hdf_string), self.known_hdf_as_sha1)
def _slice(self, granule, slice_):
    '''
    @brief Creates a granule which is a slice of the granule parameter
    @param granule the superset
    @param slice_ The slice values for which to create the granule
    @return Crafted subset granule of the parameter granule.
    '''
    retval = copy.deepcopy(granule)
    fields = self._list_data(self.definition, granule)
    record_count = slice_.stop - slice_.start
    assert record_count > 0, 'slice is malformed'

    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])  # Get the var_names from the pairs
    log.debug('var_names: %s', var_names)

    file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
    codec = HDFEncoder()
    vectors = acquire_data([file_path], var_names, record_count, slice_).next()

    for row, value in vectors.iteritems():
        vp = self._find_vp(pairs, row)
        # Determine the range_id by reverse dictionary lookup
        #@todo: improve this pattern
        for field, path in fields.iteritems():
            if vp == path:
                range_id = field
                break
        bounds_id = retval.identifiables[range_id].bounds_id

        # Recalculate the bounds for this field and update the granule
        range = value['range']
        retval.identifiables[bounds_id].value_pair[0] = float(range[0])
        retval.identifiables[bounds_id].value_pair[1] = float(range[1])

        codec.add_hdf_dataset(vp, value['values'])
        record_count = len(value['values'])

        #----- DEBUGGING ---------
        log.debug('slice- row: %s', row)
        log.debug('slice- value_path: %s', vp)
        log.debug('slice- range_id: %s', range_id)
        log.debug('slice- bounds_id: %s', bounds_id)
        log.debug('slice- limits: %s', value['range'])
        #-------------------------

    retval.identifiables[self.element_count_id].value = record_count
    hdf_string = codec.encoder_close()
    self._patch_granule(retval, hdf_string)
    FileSystem.unlink(file_path)
    return retval
def test_encode_decode(self):
    """ Encode some arrays """
    hdfencoder = HDFEncoder()  # put an array into the encoder
    hdfencoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
    # Get the string out from the encoder
    hdf_string = hdfencoder.encoder_close()

    # Compare the arrays
    hdfdecoder = HDFDecoder(hdf_string)  # put the string in the decoder...
    nparray = hdfdecoder.read_hdf_dataset(self.path_to_dataset)  # get the array out
    self.assertEqual(sha1(nparray.tostring()),
                     sha1(self.known_array.tostring()))  # works for arbitrarily shaped arrays
def test_decode_encode(self):
    """ Try a decode-encode sequence and check that it yields the same string """
    # Decode an existing hdf file and read out an array
    hdfdecoder = HDFDecoder(self.known_hdf_as_string)  # put the known string in the decoder...
    nparray = hdfdecoder.read_hdf_dataset(self.path_to_dataset)  # get the array out

    # Encode the array and get the binary string containing the encoded hdf file
    hdfencoder = HDFEncoder()  # put the array in the encoder...
    hdfencoder.add_hdf_dataset(self.path_to_dataset, nparray)
    hdf_string = hdfencoder.encoder_close()  # get the string out

    # Compare the two strings
    self.assertEqual(sha1(hdf_string), self.known_hdf_as_sha1)
def test_encode_with_filename_and_compare(self):
    """ Create an encoder with a filename and add some (one) dataset/array """
    testfilename = 'test_encode_with_filename_and_compare'
    hdfencoder = HDFEncoder(testfilename)
    hdfencoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
    # Get the string out from the encoder
    hdf_string = hdfencoder.encoder_close()
    self.assertEqual(sha1(hdf_string), self.known_hdf_as_sha1)

    hdfdecoder = HDFDecoder(self.known_hdf_as_string)
    nparray = hdfdecoder.read_hdf_dataset(self.path_to_dataset)
    self.assertEqual(sha1(nparray.tostring()),
                     sha1(self.known_array.tostring()))
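# The tests above rely on fixtures such as self.known_array, self.path_to_dataset,
# self.known_hdf_as_string and self.known_hdf_as_sha1. The setUp below is a
# hypothetical sketch of those fixtures; the array shape and dataset path are
# illustrative assumptions, not taken from the source module. It reuses the same
# sha1() digest helper the assertions above call.
def setUp(self):
    self.known_array = numpy.ones((10, 20))            # assumed reference array
    self.dataset_name = 'mydataset'                     # assumed plain dataset name
    self.path_to_dataset = 'aGroup/aSubGroup/anArray'   # assumed grouped dataset path
    # Encode the known array once so the digest comparisons have a reference value
    encoder = HDFEncoder()
    encoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
    self.known_hdf_as_string = encoder.encoder_close()
    self.known_hdf_as_sha1 = sha1(self.known_hdf_as_string)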
def add_two_datasets_read_compare(self, filename, dataset_name1, dataset_name2):
    array1 = numpy.ones((4, 5))
    array2 = numpy.ones((2, 3))

    # First create the file
    hdfencoder = HDFEncoder(filename)
    hdfencoder.add_hdf_dataset(dataset_name1, array1)
    hdfencoder.add_hdf_dataset(dataset_name2, array2)
    hdfstring = hdfencoder.encoder_close()

    hdfdecoder = HDFDecoder(hdfstring)
    # Read the first dataset
    array_decoded_1 = hdfdecoder.read_hdf_dataset(dataset_name1)
    # Read the second dataset
    array_decoded_2 = hdfdecoder.read_hdf_dataset(dataset_name2)

    self.assertEqual(sha1(array1.tostring()), sha1(array_decoded_1.tostring()))
    self.assertEqual(sha1(array2.tostring()), sha1(array_decoded_2.tostring()))
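# A minimal sketch of how the helper above might be exercised. The test name is
# hypothetical; the file name and grouped dataset paths are illustrative (the paths
# are reused from the example script further below).
def test_add_two_datasets_with_groups_read_compare(self):
    self.add_two_datasets_read_compare(
        'test_add_two_datasets_read_compare',
        'rootgroup/mygroup/data/temperature',
        'rootgroup/mygroup/data/pressure')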
def close_stream_granule(self, timestamp=None):
    import numpy

    encoder = HDFEncoder()

    for coverage_info in self._coordinates.itervalues():
        records = coverage_info['records']
        if not records:
            log.warn('Coverage name "%s" has no values!' % coverage_info['id'])
            continue
        array = numpy.asarray(records)  # Turn the list into an array

        # Add the coverage
        self._granule.identifiables[coverage_info['id']] = coverage_info['obj']

        # Add the range
        range = [float(numpy.nanmin(array)), float(numpy.nanmax(array))]
        self._granule.identifiables[coverage_info['obj'].bounds_id] = QuantityRangeElement(value_pair=range)

        # Add the data
        encoder.add_hdf_dataset(name=coverage_info['values_path'], nparray=array)

    for range_info in self._ranges.itervalues():
        records = range_info['records']
        if not records:
            log.warn('Range name "%s" has no values!' % range_info['id'])
            continue
        array = numpy.asarray(records)  # Turn the list into an array

        # Add the coverage
        self._granule.identifiables[range_info['id']] = range_info['obj']

        # Add the range
        range = [float(numpy.nanmin(array)), float(numpy.nanmax(array))]
        self._granule.identifiables[range_info['obj'].bounds_id] = QuantityRangeElement(value_pair=range)

        # Add the data
        encoder.add_hdf_dataset(name=range_info['values_path'], nparray=array)

    hdf_string = encoder.encoder_close()

    sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
    self._granule.identifiables[self._encoding_id] = Encoding(
        encoding_type=self._encoding.encoding_type,
        compression=None,
        sha1=sha1)

    tstamp = TimeElement(
        definition="http://www.opengis.net/def/property/OGC/0/SamplingTime",
        reference_frame="http://www.opengis.net/def/trs/OGC/0/GPS",
        reference_time='1970-01-01T00:00:00.000Z',
        value=timestamp or get_ion_ts())

    self._granule.identifiables[self._granule.data_stream_id] = DataStream(
        values=hdf_string,
        timestamp=tstamp)

    return self._granule
def subset(self, granule, coverages):
    '''
    @param granule
    @return dataset subset based on the fields
    '''
    assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
    field_ids = self.field_ids
    element_count_id = self.element_count_id

    values_path = list()
    domain_ids = list()
    coverage_ids = list()
    coverages = list(coverages)
    log.debug('Coverages include %s of type %s', coverages, type(coverages))

    #-----------------------------------------------------------------------------------------------------------
    # Iterate through the fields IAW stream definition and check for rangesets and coordinate axes
    # - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
    # - If it's a rangeset, make sure that it's part of what the client asked for; if not, discard it
    #-----------------------------------------------------------------------------------------------------------
    for field_id in field_ids:
        range_id = self.definition.identifiables[field_id].range_id

        #-------------------------------------------------------------------------------------
        # Coordinate Axis
        # - Keep track of this in our domains
        # - Add it to the paths we need to grab from the file(s)
        #-------------------------------------------------------------------------------------
        if isinstance(self.definition.identifiables[range_id], CoordinateAxis):
            log.debug('got a domain: %s' % range_id)
            domain_ids.append(field_id)
            if granule.identifiables.has_key(range_id):
                value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                values_path.append(value_path)
            else:
                value_path = self.definition.identifiables[range_id].values_path
                values_path.append(value_path)
            continue

        #-------------------------------------------------------------------------------------
        # Range Set
        # - If it's part of the coverages we want to keep:
        #   - Add it to the list of ranges we're tracking
        #   - Add the value path to the paths we're tracking.
        #-------------------------------------------------------------------------------------
        if isinstance(self.definition.identifiables[range_id], RangeSet):
            # If it's a rangeset, a specified coverage and the granule has it, add it to the list
            if field_id in coverages:
                if granule.identifiables.has_key(range_id):
                    log.debug('got a range: %s' % range_id)
                    coverage_ids.append(field_id)
                    if granule.identifiables.has_key(range_id):
                        value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                        values_path.append(value_path)
                    else:
                        value_path = self.definition.identifiables[range_id].values_path
                        values_path.append(value_path)
                    continue

            # ----
            # We need to track the range and bounds because,
            # you guessed it, we need to update the bounds
            # ----
            range_id = self.definition.identifiables[field_id].range_id
            bounds_id = self.definition.identifiables[range_id].bounds_id

            #---
            # Lastly, if the field is there and we don't want it, we need to strip it
            #---
            if not (field_id in coverages):
                log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                log.debug('rebool: %s', bool(field_id in coverages))
                if granule.identifiables.has_key(range_id):
                    log.debug('Removing %s from granule', range_id)
                    del granule.identifiables[range_id]
                if granule.identifiables.has_key(bounds_id):
                    log.debug('Removing %s from granule', bounds_id)
                    del granule.identifiables[bounds_id]

    log.debug('Domains: %s', domain_ids)
    log.debug('Ranges: %s', coverage_ids)
    log.debug('Values_paths: %s', values_path)

    file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
    full_coverage = list(domain_ids + coverage_ids)

    log.debug('Full coverage: %s' % full_coverage)
    log.debug('Calling acquire_data with: %s, %s, %s', [file_path], values_path,
              granule.identifiables[element_count_id].value)

    codec = HDFEncoder()

    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])

    record_count = granule.identifiables[self.element_count_id].value
    data = acquire_data([file_path], var_names, record_count).next()
    for row, value in data.iteritems():
        vp = self._find_vp(pairs, row)
        codec.add_hdf_dataset(vp, value['values'])

    hdf_string = codec.encoder_close()
    self._patch_granule(granule, hdf_string)
    FileSystem.unlink(file_path)

    return granule
def _merge(self, msgs):
    '''
    @brief Merges all the granules and datasets into one large dataset (Union)
    @param msgs raw granules from couch
    @return complete dataset
    @description
             n
        D := U [ msgs_i ]
            i=0
    '''
    granule = None
    file_list = list()
    count = len(msgs)
    used_vals = list()

    #-------------------------------------------------------------------------------------
    # Merge each granule to another granule one by one.
    # After each merge operation keep track of what files belong where on the timeline
    #-------------------------------------------------------------------------------------
    for i in xrange(count):
        if i == 0:
            granule = msgs[0]['granule']
            psc = PointSupplementConstructor(point_definition=self.definition)

            res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=None)
            granule = res['granule']
            file_pair = res['files']
            log.debug('file_pair: %s', file_pair)

            if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                file_list.append(tuple(file_pair[0]))
                used_vals.append(file_pair[0][0])
        else:
            res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=msgs[i]['granule'])
            granule = res['granule']
            file_pair = res['files']
            log.debug('file_pair: %s', file_pair)

            if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                file_list.append(tuple(file_pair[0]))
                used_vals.append(file_pair[0][0])
            if file_pair[1] not in file_list and file_pair[1][0] not in used_vals:
                file_list.append(tuple(file_pair[1]))
                used_vals.append(file_pair[1][0])

    if not granule:
        return
    log.debug('file_list: %s', file_list)

    #-------------------------------------------------------------------------------------
    # Order the lists using a stable sort from python (by the first value in the tuples)
    # Then peel off just the file names
    # Then get the appropriate URL for the file using FileSystem
    #-------------------------------------------------------------------------------------
    file_list.sort()
    file_list = list(i[1] for i in file_list)
    file_list = list([FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i) for i in file_list])

    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])

    record_count = granule.identifiables[self.element_count_id].value
    codec = HDFEncoder()
    log.debug('acquire_data:')
    log.debug('\tfile_list: %s', file_list)
    log.debug('\tfields: %s', var_names)
    log.debug('\trecords: %s', record_count)

    data = acquire_data(file_list, var_names, record_count).next()
    for row, value in data.iteritems():
        value_path = self._find_vp(pairs, row)
        codec.add_hdf_dataset(value_path, nparray=value['values'])

        #-------------------------------------------------------------------------------------
        # Debugging
        #-------------------------------------------------------------------------------------
        log.debug('row: %s', row)
        log.debug('value path: %s', value_path)
        log.debug('value: %s', value['values'])

    hdf_string = codec.encoder_close()
    self._patch_granule(granule, hdf_string)

    return granule
import numpy, h5py
from prototype.hdf.hdf_codec import HDFEncoder, HDFDecoder

array1 = numpy.ones((4, 5))
array2 = numpy.ones((2, 3))
array3 = numpy.ones((10, 2))

dataset_name1 = 'rootgroup/mygroup/data/temperature'
dataset_name2 = 'rootgroup/mygroup/data/pressure'
dname = 'aGroup/adataset'

###########################################################

# Create an encoder object
hdfencoder = HDFEncoder()

# Add data as arrays
hdfencoder.add_hdf_dataset(dataset_name1, array1)
hdfencoder.add_hdf_dataset(dataset_name2, array2)
# Convert all the data to a binary string for easy transportation
hdfstring1 = hdfencoder.encoder_close()

# Create another encoder. This time pass the name of the hdf5 file to write
hdfencoder = HDFEncoder('/tmp/testHDFEncoder.hdf5')
hdfencoder.add_hdf_dataset(dataset_name1, array1)
hdfencoder.add_hdf_dataset(dataset_name2, array2)
# Convert all the data to a binary string for easy transportation
hdfstring2 = hdfencoder.encoder_close()

# Create another encoder. This time pass the name of the hdf5 file to write
hdfencoder = HDFEncoder('/tmp/testHDFEncoder.hdf5')
hdfencoder.add_hdf_dataset(dname, array3)
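# A minimal round-trip sketch (not part of the original script): assuming the third
# encoder is closed like the previous two, the returned string can be handed to
# HDFDecoder and the dataset read back for comparison.
hdfstring3 = hdfencoder.encoder_close()

hdfdecoder = HDFDecoder(hdfstring3)
decoded_array3 = hdfdecoder.read_hdf_dataset(dname)
print numpy.array_equal(array3, decoded_array3)  # expected: True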