def test_add_hdf_dataset(self):
    """ Test adding a name and an array """
    testencoder = HDFEncoder()
    testencoder.add_hdf_dataset('test_dataset', self.known_array)
    testencoder.encoder_close()

def test_add_hdf_dataset_with_bad_name(self):
    """ Test adding a bad name and an array """
    testencoder = HDFEncoder()
    with self.assertRaises(AssertionError):
        self.dataset = testencoder.add_hdf_dataset('bad name', self.known_array)
    testencoder.encoder_close()

def test_add_hdf_dataset_with_bad_name(self):
    """ Test adding a bad name and an array """
    testencoder = HDFEncoder()
    with self.assertRaises(HDFEncoderException):
        self.dataset = testencoder.add_hdf_dataset('bad name', self.known_array)
    testencoder.encoder_close()

def test_add_hdf_dataset_with_bad_array(self):
    """ Test adding a name and something other than an array """
    testencoder = HDFEncoder()
    with self.assertRaises(HDFEncoderException):
        testencoder.add_hdf_dataset(self.dataset_name, 'bad array')
    testencoder.encoder_close()

def test_encode_known_and_compare(self):
    """ Create an encoder and add some (one) dataset/array """
    hdfencoder = HDFEncoder()
    hdfencoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
    # Serialize to string and compare to a known value
    hdf_string = hdfencoder.encoder_close()
    self.assertEqual(sha1(hdf_string), self.known_hdf_as_sha1)

def _slice(self, granule, slice_):
    '''
    @brief Creates a granule which is a slice of the granule parameter
    @param granule the superset
    @param slice_ The slice values for which to create the granule
    @return Crafted subset granule of the parameter granule.
    '''
    retval = copy.deepcopy(granule)
    fields = self._list_data(self.definition, granule)
    record_count = slice_.stop - slice_.start
    assert record_count > 0, 'slice is malformed'
    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])  # Get the var_names from the pairs
    log.debug('var_names: %s', var_names)
    file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
    codec = HDFEncoder()
    vectors = acquire_data([file_path], var_names, record_count, slice_).next()

    for row, value in vectors.iteritems():
        vp = self._find_vp(pairs, row)
        # Determine the range_id reverse dictionary lookup
        #@todo: improve this pattern
        for field, path in fields.iteritems():
            if vp == path:
                range_id = field
                break
        bounds_id = retval.identifiables[range_id].bounds_id

        # Recalculate the bounds for this field and update the granule
        range = value['range']
        retval.identifiables[bounds_id].value_pair[0] = float(range[0])
        retval.identifiables[bounds_id].value_pair[1] = float(range[1])
        codec.add_hdf_dataset(vp, value['values'])
        record_count = len(value['values'])

        #----- DEBUGGING ---------
        log.debug('slice- row: %s', row)
        log.debug('slice- value_path: %s', vp)
        log.debug('slice- range_id: %s', range_id)
        log.debug('slice- bounds_id: %s', bounds_id)
        log.debug('slice- limits: %s', value['range'])
        #-------------------------

    retval.identifiables[self.element_count_id].value = record_count
    hdf_string = codec.encoder_close()
    self._patch_granule(retval, hdf_string)
    FileSystem.unlink(file_path)
    return retval

def test_encode_decode(self):
    """ Encode some arrays """
    hdfencoder = HDFEncoder()
    # put array into the encoder
    hdfencoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
    # get the string out from encoder
    hdf_string = hdfencoder.encoder_close()

    # Compare the arrays
    hdfdecoder = HDFDecoder(hdf_string)  # put string in decoder...
    nparray = hdfdecoder.read_hdf_dataset(self.path_to_dataset)  # get array out
    self.assertEqual(sha1(nparray.tostring()), sha1(self.known_array.tostring()))  # works for arbitrarily shaped arrays

def test_decode_encode(self):
    """ Try a decode-encode sequence and compare whether it yields the same string """
    # decode an existing hdf file and read out an array
    hdfdecoder = HDFDecoder(self.known_hdf_as_string)  # put known string in decoder...
    nparray = hdfdecoder.read_hdf_dataset(self.path_to_dataset)  # get array out

    # encode the array and get the binary string containing the encoded hdf file
    hdfencoder = HDFEncoder()  # put the array in the encoder...
    hdfencoder.add_hdf_dataset(self.path_to_dataset, nparray)
    hdf_string = hdfencoder.encoder_close()  # get string out

    # compare the two strings
    self.assertEqual(sha1(hdf_string), self.known_hdf_as_sha1)

def test_encode_with_filename_and_compare(self):
    """ Create an encoder and add some (one) dataset/array """
    testfilename = 'test_encode_with_filename_and_compare'
    hdfencoder = HDFEncoder(testfilename)
    hdfencoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
    # get the string out from encoder
    hdf_string = hdfencoder.encoder_close()
    self.assertEqual(sha1(hdf_string), self.known_hdf_as_sha1)

    hdfdecoder = HDFDecoder(self.known_hdf_as_string)
    nparray = hdfdecoder.read_hdf_dataset(self.path_to_dataset)
    self.assertEqual(sha1(nparray.tostring()), sha1(self.known_array.tostring()))

def close_stream_granule(self):
    import numpy

    encoder = HDFEncoder()

    for coverage_info in self._coordinates.itervalues():
        records = coverage_info['records']
        if not records:
            log.warn('Coverage name "%s" has no values!' % coverage_info['id'])
            continue

        array = numpy.asarray(records)  # Turn the list into an array

        # Add the coverage
        self._granule.identifiables[coverage_info['id']] = coverage_info['obj']

        # Add the range
        range = [float(numpy.nanmin(array)), float(numpy.nanmax(array))]
        self._granule.identifiables[coverage_info['obj'].bounds_id] = QuantityRangeElement(value_pair=range)

        # Add the data
        encoder.add_hdf_dataset(name=coverage_info['values_path'], nparray=array)

    for range_info in self._ranges.itervalues():
        records = range_info['records']
        if not records:
            log.warn('Range name "%s" has no values!' % range_info['id'])
            continue

        array = numpy.asarray(records)  # Turn the list into an array

        # Add the coverage
        self._granule.identifiables[range_info['id']] = range_info['obj']

        # Add the range
        range = [float(numpy.nanmin(array)), float(numpy.nanmax(array))]
        self._granule.identifiables[range_info['obj'].bounds_id] = QuantityRangeElement(value_pair=range)

        # Add the data
        encoder.add_hdf_dataset(name=range_info['values_path'], nparray=array)

    hdf_string = encoder.encoder_close()

    sha1 = hashlib.sha1(hdf_string).hexdigest().upper()

    self._granule.identifiables[self._encoding_id] = Encoding(
        encoding_type='hdf5',
        compression=None,
        sha1=sha1
    )

    self._granule.identifiables[self._granule.data_stream_id] = DataStream(
        values=hdf_string
    )

    return self._granule

def test_add_hdf_dataset_with_bad_array(self):
    """ Test adding a name and something other than an array """
    testencoder = HDFEncoder()
    with self.assertRaises(AssertionError):
        testencoder.add_hdf_dataset(self.dataset_name, 'bad array')
    testencoder.encoder_close()

def add_two_datasets_read_compare(self, filename, dataset_name1, dataset_name2):
    array1 = numpy.ones((4, 5))
    array2 = numpy.ones((2, 3))

    # first create the file
    hdfencoder = HDFEncoder(filename)
    hdfencoder.add_hdf_dataset(dataset_name1, array1)
    hdfencoder.add_hdf_dataset(dataset_name2, array2)
    hdfstring = hdfencoder.encoder_close()

    hdfdecoder = HDFDecoder(hdfstring)
    # Read the first dataset
    array_decoded_1 = hdfdecoder.read_hdf_dataset(dataset_name1)
    # Read the second dataset
    array_decoded_2 = hdfdecoder.read_hdf_dataset(dataset_name2)

    self.assertEqual(sha1(array1.tostring()), sha1(array_decoded_1.tostring()))
    self.assertEqual(sha1(array2.tostring()), sha1(array_decoded_2.tostring()))

def add_two_datasets_read_compare(self, filename, dataset_name1, dataset_name2):
    array1 = numpy.ones((4, 5))
    array2 = numpy.ones((2, 3))

    # first create the file
    hdfencoder = HDFEncoder(filename)
    hdfencoder.add_hdf_dataset(dataset_name1, array1)
    hdfstring = hdfencoder.encoder_close()

    # now open the file and add another branch
    hdfencoder = HDFEncoder(filename)
    hdfencoder.add_hdf_dataset(dataset_name2, array2)
    hdfstring = hdfencoder.encoder_close()

    hdfdecoder = HDFDecoder(hdfstring)
    # Read the first dataset
    array_decoded_1 = hdfdecoder.read_hdf_dataset(dataset_name1)

    hdfdecoder = HDFDecoder(hdfstring)
    # Read the second dataset
    array_decoded_2 = hdfdecoder.read_hdf_dataset(dataset_name2)

    self.assertEqual(array1.tostring(), array_decoded_1.tostring())
    self.assertEqual(array2.tostring(), array_decoded_2.tostring())

def close_stream_granule(self, timestamp=None):
    import numpy

    encoder = HDFEncoder()

    for coverage_info in self._coordinates.itervalues():
        records = coverage_info['records']
        if not records:
            log.warn('Coverage name "%s" has no values!' % coverage_info['id'])
            continue

        array = numpy.asarray(records)  # Turn the list into an array

        # Add the coverage
        self._granule.identifiables[coverage_info['id']] = coverage_info['obj']

        # Add the range
        range = [float(numpy.nanmin(array)), float(numpy.nanmax(array))]
        self._granule.identifiables[coverage_info['obj'].bounds_id] = QuantityRangeElement(value_pair=range)

        # Add the data
        encoder.add_hdf_dataset(name=coverage_info['values_path'], nparray=array)

    for range_info in self._ranges.itervalues():
        records = range_info['records']
        if not records:
            log.warn('Range name "%s" has no values!' % range_info['id'])
            continue

        array = numpy.asarray(records)  # Turn the list into an array

        # Add the coverage
        self._granule.identifiables[range_info['id']] = range_info['obj']

        # Add the range
        range = [float(numpy.nanmin(array)), float(numpy.nanmax(array))]
        self._granule.identifiables[range_info['obj'].bounds_id] = QuantityRangeElement(value_pair=range)

        # Add the data
        encoder.add_hdf_dataset(name=range_info['values_path'], nparray=array)

    hdf_string = encoder.encoder_close()

    sha1 = hashlib.sha1(hdf_string).hexdigest().upper()

    self._granule.identifiables[self._encoding_id] = Encoding(
        encoding_type=self._encoding.encoding_type,
        compression=None,
        sha1=sha1)

    tstamp = TimeElement(
        definition="http://www.opengis.net/def/property/OGC/0/SamplingTime",
        reference_frame="http://www.opengis.net/def/trs/OGC/0/GPS",
        reference_time='1970-01-01T00:00:00.000Z',
        value=timestamp or get_ion_ts())

    self._granule.identifiables[self._granule.data_stream_id] = DataStream(
        values=hdf_string,
        timestamp=tstamp)

    return self._granule

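# close_stream_granule() records the SHA1 of the serialized HDF string in an
# Encoding identifiable next to the DataStream values. A minimal consumer-side
# integrity check could look like the sketch below; 'granule' and 'encoding_id'
# are hypothetical names for illustration, not fixed by the code above.

import hashlib

def verify_granule_payload(granule, encoding_id):
    """ Return True if the granule's HDF payload matches the recorded SHA1 digest. """
    hdf_string = granule.identifiables[granule.data_stream_id].values
    expected_sha1 = granule.identifiables[encoding_id].sha1
    return hashlib.sha1(hdf_string).hexdigest().upper() == expected_sha1
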
def subset(self, granule, coverages):
    '''
    @param granule
    @return dataset subset based on the fields
    '''
    assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
    field_ids = self.field_ids
    element_count_id = self.element_count_id

    values_path = list()
    domain_ids = list()
    coverage_ids = list()
    coverages = list(coverages)
    log.debug('Coverages include %s of type %s', coverages, type(coverages))

    #-----------------------------------------------------------------------------------------------------------
    # Iterate through the fields IAW stream definition and check for rangesets and coordinate axes
    # - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
    # - If it's a rangeset, make sure that it's part of what the client asked for; if not, discard it
    #-----------------------------------------------------------------------------------------------------------
    for field_id in field_ids:
        range_id = self.definition.identifiables[field_id].range_id

        #-------------------------------------------------------------------------------------
        # Coordinate Axis
        # - Keep track of this in our domains
        # - Add it to the paths we need to grab from the file(s)
        #-------------------------------------------------------------------------------------
        if isinstance(self.definition.identifiables[range_id], CoordinateAxis):
            log.debug('got a domain: %s' % range_id)
            domain_ids.append(field_id)
            if granule.identifiables.has_key(range_id):
                value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                values_path.append(value_path)
            else:
                value_path = self.definition.identifiables[range_id].values_path
                values_path.append(value_path)
            continue

        #-------------------------------------------------------------------------------------
        # Range Set
        # - If it's part of the coverages we want to keep
        #   - Add it to the list of ranges we're tracking
        #   - Add the value path to the paths we're tracking.
        #-------------------------------------------------------------------------------------
        if isinstance(self.definition.identifiables[range_id], RangeSet):
            # If it's a rangeset, a specified coverage and the granule has it, add it to the list
            if field_id in coverages:
                if granule.identifiables.has_key(range_id):
                    log.debug('got a range: %s' % range_id)
                    coverage_ids.append(field_id)
                    if granule.identifiables.has_key(range_id):
                        value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                        values_path.append(value_path)
                    else:
                        value_path = self.definition.identifiables[range_id].values_path
                        values_path.append(value_path)
                    continue

            # ----
            # We need to track the range and bounds because,
            # you guessed it, we need to update the bounds
            # ----
            range_id = self.definition.identifiables[field_id].range_id
            bounds_id = self.definition.identifiables[range_id].bounds_id

            #---
            # Lastly, if the field is there and we don't want it, we need to strip it
            #---
            if not (field_id in coverages):
                log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                log.debug('rebool: %s', bool(field_id in coverages))
                if granule.identifiables.has_key(range_id):
                    log.debug('Removing %s from granule', range_id)
                    del granule.identifiables[range_id]
                if granule.identifiables.has_key(bounds_id):
                    log.debug('Removing %s from granule', bounds_id)
                    del granule.identifiables[bounds_id]

    log.debug('Domains: %s', domain_ids)
    log.debug('Ranges: %s', coverage_ids)
    log.debug('Values_paths: %s', values_path)

    file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
    full_coverage = list(domain_ids + coverage_ids)

    log.debug('Full coverage: %s' % full_coverage)
    log.debug('Calling acquire_data with: %s, %s, %s', [file_path], values_path, granule.identifiables[element_count_id].value)

    codec = HDFEncoder()

    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])

    record_count = granule.identifiables[self.element_count_id].value
    data = acquire_data([file_path], var_names, record_count).next()
    for row, value in data.iteritems():
        vp = self._find_vp(pairs, row)
        codec.add_hdf_dataset(vp, value['values'])

    hdf_string = codec.encoder_close()

    self._patch_granule(granule, hdf_string)
    FileSystem.unlink(file_path)

    return granule

def _merge(self, msgs):
    '''
    @brief Merges all the granules and datasets into one large dataset (Union)
    @param msgs raw granules from couch
    @return complete dataset
    @description
         n
    D := U [ msgs_i ]
        i=0
    '''
    granule = None
    file_list = list()
    count = len(msgs)
    used_vals = list()

    #-------------------------------------------------------------------------------------
    # Merge each granule to another granule one by one.
    # After each merge operation keep track of what files belong where on the timeline
    #-------------------------------------------------------------------------------------
    for i in xrange(count):
        if i == 0:
            granule = msgs[0]['granule']
            psc = PointSupplementConstructor(point_definition=self.definition)

            res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=None)
            granule = res['granule']
            file_pair = res['files']
            log.debug('file_pair: %s', file_pair)

            if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                file_list.append(tuple(file_pair[0]))
                used_vals.append(file_pair[0][0])
        else:
            res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=msgs[i]['granule'])

            granule = res['granule']
            file_pair = res['files']
            log.debug('file_pair: %s', file_pair)

            if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                file_list.append(tuple(file_pair[0]))
                used_vals.append(file_pair[0][0])
            if file_pair[1] not in file_list and file_pair[1][0] not in used_vals:
                file_list.append(tuple(file_pair[1]))
                used_vals.append(file_pair[1][0])

    if not granule:
        return
    log.debug('file_list: %s', file_list)

    #-------------------------------------------------------------------------------------
    # Order the lists using a stable sort from python (by the first value in the tuples)
    # Then peel off just the file names
    # Then get the appropriate URL for the file using FileSystem
    #-------------------------------------------------------------------------------------
    file_list.sort()
    file_list = list(i[1] for i in file_list)
    file_list = list([FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i) for i in file_list])

    pairs = self._pair_up(granule)
    var_names = list([i[0] for i in pairs])

    record_count = granule.identifiables[self.element_count_id].value
    codec = HDFEncoder()
    log.debug('acquire_data:')
    log.debug('\tfile_list: %s', file_list)
    log.debug('\tfields: %s', var_names)
    log.debug('\trecords: %s', record_count)

    data = acquire_data(file_list, var_names, record_count).next()
    for row, value in data.iteritems():
        value_path = self._find_vp(pairs, row)
        codec.add_hdf_dataset(value_path, nparray=value['values'])

        #-------------------------------------------------------------------------------------
        # Debugging
        #-------------------------------------------------------------------------------------
        log.debug('row: %s', row)
        log.debug('value path: %s', value_path)
        log.debug('value: %s', value['values'])

    hdf_string = codec.encoder_close()
    self._patch_granule(granule, hdf_string)

    return granule

def _encode_supplement(self):
    """ Method used to encode the point dataset supplement """

    def listify(input):
        if hasattr(input, '__iter__'):
            return input
        else:
            return [input, ]

    # build the hdf and return the ion-object...
    hdf_string = ''
    try:
        import numpy
        encoder = HDFEncoder()

        # Need to search through the coordinate_axes dictionary to find out what the values_path
        # will be for the coordinate axes.
        # This assumes the coordinate axis names as described below. Will probably need to be
        # changed to accommodate other labels.
        for key, coordinate_axis in self._coordinate_axes.iteritems():

            if self._times is not None and coordinate_axis.axis.lower() == 'time':
                time_range = [min(self._times), max(self._times)]
                self._packet_container.identifiables[key + '_bounds'].value_pair = time_range

                times = listify(self._times)
                encoder.add_hdf_dataset(coordinate_axis.values_path, numpy.asanyarray(times))

            if self._longitudes is not None and coordinate_axis.axis.lower() == 'longitude':
                lons_range = [min(self._longitudes), max(self._longitudes)]
                self._packet_container.identifiables[key + '_bounds'].value_pair = lons_range

                lons = listify(self._longitudes)
                encoder.add_hdf_dataset(coordinate_axis.values_path, numpy.asanyarray(lons))

            if self._latitudes is not None and coordinate_axis.axis.lower() == 'latitude':
                lats_range = [min(self._latitudes), max(self._latitudes)]
                self._packet_container.identifiables[key + '_bounds'].value_pair = lats_range

                lats = listify(self._latitudes)
                encoder.add_hdf_dataset(coordinate_axis.values_path, numpy.asanyarray(lats))

        # Loop through ranges, one for each coverage. Range objects contain the values_path variable,
        # so use that to add values to the hdf.
        for key, range in self._ranges.iteritems():
            if key in self._values:
                v = self._values[key]
                encoder.add_hdf_dataset(range.values_path, numpy.asanyarray(v))

        hdf_string = encoder.encoder_close()

        sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
        self._packet_container.identifiables['stream_encoding'] = Encoding(
            encoding_type='hdf5',
            compression=None,
            sha1=sha1
        )

        return hdf_string

    except:
        log.exception('HDF encoder failed. Please make sure you have it properly installed!')

import numpy, h5py

from prototype.hdf.hdf_codec import HDFEncoder, HDFDecoder

array1 = numpy.ones((4, 5))
array2 = numpy.ones((2, 3))
array3 = numpy.ones((10, 2))

dataset_name1 = 'rootgroup/mygroup/data/temperature'
dataset_name2 = 'rootgroup/mygroup/data/pressure'
dname = 'aGroup/adataset'

###########################################################

# Create an encoder object
hdfencoder = HDFEncoder()

# Add data as an array
hdfencoder.add_hdf_dataset(dataset_name1, array1)
hdfencoder.add_hdf_dataset(dataset_name2, array2)

# Convert all the data to a binary string for easy transportation
hdfstring1 = hdfencoder.encoder_close()

# Create another encoder. This time pass on name of hdf5 file to write
hdfencoder = HDFEncoder('/tmp/testHDFEncoder.hdf5')
hdfencoder.add_hdf_dataset(dataset_name1, array1)
hdfencoder.add_hdf_dataset(dataset_name2, array2)
# Convert all the data to a binary string for easy transportation
hdfstring2 = hdfencoder.encoder_close()

# Create another encoder. This time pass on name of hdf5 file to write
hdfencoder = HDFEncoder('/tmp/testHDFEncoder.hdf5')
hdfencoder.add_hdf_dataset(dname, array3)
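
# The walkthrough above imports HDFDecoder but never exercises it, and the third
# encoder is left open. A minimal continuation (a sketch, reusing the variables
# above; hdfstring3, temperature and pressure are illustrative names) could close
# that encoder and decode the first string back into arrays:

hdfstring3 = hdfencoder.encoder_close()

# Decode one of the strings produced above and read the datasets back out
hdfdecoder = HDFDecoder(hdfstring1)
temperature = hdfdecoder.read_hdf_dataset(dataset_name1)
pressure = hdfdecoder.read_hdf_dataset(dataset_name2)

print temperature.shape, pressure.shape  # expected: (4, 5) (2, 3)
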
def ctd_stream_packet(stream_id=None, c=None, t=None, p=None, lat=None, lon=None, time=None, create_hdf=True):
    """
    This is a simple interface for creating a packet of ctd data for a given stream defined by the method above.
    The string names of content are tightly coupled to the method above.
    To send actual data you must have hdf5, numpy and h5py installed.

    @brief build a demo ctd data packet as an ion object. All value arguments are optional, but any argument
    provided should have the same length.

    @param stream_id should be the same as the stream_id for the definition - the stream resource ID
    @param c is a list, tuple or ndarray of conductivity values
    @param t is a list, tuple or ndarray of temperature values
    @param p is a list, tuple or ndarray of pressure values
    @param lat is a list, tuple or ndarray of latitude values
    @param lon is a list, tuple or ndarray of longitude values
    @param time is a list, tuple or ndarray of time values
    """
    length = False

    def listify(input):
        if hasattr(input, '__iter__'):
            return input
        else:
            return [input, ]

    c_range = []
    if c is not None:
        c = listify(c)
        c_range = [min(c), max(c)]
        if length:
            assert length == len(c), 'Conductivity input is the wrong length'
        else:
            length = len(c)

    t_range = []
    if t is not None:
        t = listify(t)
        t_range = [min(t), max(t)]
        if length:
            assert length == len(t), 'Temperature input is the wrong length'
        else:
            length = len(t)

    p_range = []
    if p is not None:
        p = listify(p)
        p_range = [min(p), max(p)]
        if length:
            assert length == len(p), 'Pressure input is the wrong length'
        else:
            length = len(p)

    lat_range = []
    if lat is not None:
        lat = listify(lat)
        lat_range = [min(lat), max(lat)]
        if length:
            assert length == len(lat), 'Latitude input is the wrong length'
        else:
            length = len(lat)

    lon_range = []
    if lon is not None:
        lon = listify(lon)
        lon_range = [min(lon), max(lon)]
        if length:
            assert length == len(lon), 'Longitude input is the wrong length'
        else:
            length = len(lon)

    time_range = []
    if time is not None:
        time = listify(time)
        time_range = [min(time), max(time)]
        if length:
            assert length == len(time), 'Time input is the wrong length'
        else:
            length = len(time)

    hdf_string = ''
    if create_hdf:
        try:
            # Use inline import to put off making numpy a requirement
            import numpy as np

            encoder = HDFEncoder()
            if t is not None:
                encoder.add_hdf_dataset('fields/temp_data', np.asanyarray(t))

            if c is not None:
                encoder.add_hdf_dataset('fields/cndr_data', np.asanyarray(c))

            if p is not None:
                encoder.add_hdf_dataset('fields/pressure_data', np.asanyarray(p))

            if lat is not None:
                encoder.add_hdf_dataset('coordinates/latitude', np.asanyarray(lat))

            if lon is not None:
                encoder.add_hdf_dataset('coordinates/longitude', np.asanyarray(lon))

            if time is not None:
                encoder.add_hdf_dataset('coordinates/time', np.asanyarray(time))

            hdf_string = encoder.encoder_close()
        except:
            log.exception('HDF encoder failed. Please make sure you have it properly installed!')

    # build a hdf file here

    # data stream id is the identifier for the DataStream object - the root of the data structure
    ctd_container = StreamGranuleContainer(
        stream_resource_id=stream_id,
        data_stream_id='ctd_data'
    )

    ctd_container.identifiables['ctd_data'] = DataStream(
        id=stream_id,
        values=hdf_string  # put the hdf file here as bytes!
    )

    sha1 = hashlib.sha1(hdf_string).hexdigest().upper() if hdf_string else ''

    ctd_container.identifiables['stream_encoding'] = Encoding(
        encoding_type='hdf5',
        compression=None,
        sha1=sha1,
    )

    ctd_container.identifiables['record_count'] = CountElement(
        value=length or -1,
    )

    # Time
    if time is not None:
        ctd_container.identifiables['time'] = CoordinateAxis(
            bounds_id='time_bounds'
        )

        ctd_container.identifiables['time_bounds'] = QuantityRangeElement(
            value_pair=time_range
        )

    # Latitude
    if lat is not None:
        ctd_container.identifiables['latitude'] = CoordinateAxis(
            bounds_id='latitude_bounds'
        )

        ctd_container.identifiables['latitude_bounds'] = QuantityRangeElement(
            value_pair=lat_range
        )

    # Longitude
    if lon is not None:
        ctd_container.identifiables['longitude'] = CoordinateAxis(
            bounds_id='longitude_bounds'
        )

        ctd_container.identifiables['longitude_bounds'] = QuantityRangeElement(
            value_pair=lon_range
        )

    # Pressure
    if p is not None:
        ctd_container.identifiables['pressure_data'] = CoordinateAxis(
            bounds_id='pressure_bounds'
        )

        ctd_container.identifiables['pressure_bounds'] = QuantityRangeElement(
            value_pair=p_range
        )

    # Temperature
    if t is not None:
        ctd_container.identifiables['temp_data'] = RangeSet(
            bounds_id=['temp_bounds']
        )

        ctd_container.identifiables['temp_bounds'] = QuantityRangeElement(
            value_pair=t_range
        )

    # Conductivity
    if c is not None:
        ctd_container.identifiables['cndr_data'] = RangeSet(
            bounds_id='cndr_bounds'
        )

        ctd_container.identifiables['cndr_bounds'] = QuantityRangeElement(
            value_pair=c_range
        )

    return ctd_container
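
# All value arguments to ctd_stream_packet() are optional, but any that are
# supplied must share a length. A hypothetical call (the stream id and the
# sample values below are made up for illustration) might look like:

packet = ctd_stream_packet(
    stream_id='<stream resource id>',
    c=[3.5, 3.6],
    t=[10.1, 10.3],
    p=[15.0, 15.2],
    lat=[45.0, 45.0],
    lon=[-125.0, -125.0],
    time=[3600, 3601],
    create_hdf=True)

# The returned StreamGranuleContainer carries the HDF5 payload in
# packet.identifiables['ctd_data'].values and its SHA1 digest in
# packet.identifiables['stream_encoding'].sha1
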