Example #1
    def test_add_hdf_dataset(self):
        """
        Test adding a name and an array
        """

        testencoder = HDFEncoder()
        testencoder.add_hdf_dataset('test_dataset', self.known_array)
        testencoder.encoder_close()
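
This test relies on fixtures such as self.known_array and self.dataset_name that are defined outside the excerpt. A minimal sketch of a setUp that could provide them (the names and values here are illustrative assumptions, not the project's actual fixture):

    def setUp(self):
        # Illustrative fixture only (assumes numpy is imported at module level):
        # a small known array plus the dataset paths the tests refer to
        self.known_array = numpy.ones((10, 20))
        self.dataset_name = 'testgroup/testdataset'
        self.path_to_dataset = 'testgroup/testdataset'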
Example #2
    def test_add_hdf_dataset_with_bad_array(self):
        """
        Test adding a name and something other than an array
        """

        testencoder = HDFEncoder()
        with self.assertRaises(AssertionError):
            testencoder.add_hdf_dataset(self.dataset_name, 'bad array')
        testencoder.encoder_close()
Example #3
    def test_add_hdf_dataset_with_bad_name(self):
        """
        Test adding a bad name and an array
        """

        testencoder = HDFEncoder()
        with self.assertRaises(AssertionError):
            self.dataset = testencoder.add_hdf_dataset('bad name', self.known_array)
        testencoder.encoder_close()
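
The two tests above expect an AssertionError both for a non-array value and for an invalid name, which suggests add_hdf_dataset guards its inputs with assertions. A rough, hypothetical sketch of such guards (not the actual implementation):

    def add_hdf_dataset(self, name, nparray):
        # Hypothetical guard clauses consistent with the AssertionErrors expected above;
        # the real method goes on to write the array into the backing HDF5 file.
        assert isinstance(nparray, numpy.ndarray), 'nparray is not a numpy array'
        assert isinstance(name, basestring) and ' ' not in name, 'name is not a valid dataset path'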
Example #4
    def test_encode_known_and_compare(self):
        """
        Create an encoder, add a single dataset/array, and compare the result to a known value
        """

        hdfencoder = HDFEncoder()
        hdfencoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
        # Serialize to a string and compare to a known value
        hdf_string = hdfencoder.encoder_close()

        self.assertEqual(sha1(hdf_string), self.known_hdf_as_sha1)
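
The bare sha1(...) call in the assertion is presumably a small test helper around hashlib. A sketch of such a helper, consistent with the hashlib usage in Example #10 below (an assumption, not the project's actual code):

import hashlib

def sha1(buf):
    # Hex digest of the SHA1 of a byte buffer, matching the convention used in Example #10
    return hashlib.sha1(buf).hexdigest().upper()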
Example #5
    def _slice(self, granule, slice_):
        '''
        @brief Creates a granule which is a slice of the granule parameter
        @param granule the superset
        @param slice_ The slice values for which to create the granule
        @return Crafted subset granule of the parameter granule.
        '''
        retval = copy.deepcopy(granule)
        fields = self._list_data(self.definition, granule)
        record_count = slice_.stop - slice_.start
        assert record_count > 0, 'slice is malformed'
        pairs = self._pair_up(granule)
        var_names = list([i[0]
                          for i in pairs])  # Get the var_names from the pairs
        log.debug('var_names: %s', var_names)
        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        codec = HDFEncoder()
        vectors = acquire_data([file_path], var_names, record_count,
                               slice_).next()

        for row, value in vectors.iteritems():
            vp = self._find_vp(pairs, row)
            # Determine the range_id via a reverse dictionary lookup
            #@todo: improve this pattern
            for field, path in fields.iteritems():
                if vp == path:
                    range_id = field
                    break
            bounds_id = retval.identifiables[range_id].bounds_id
            # Recalculate the bounds for this field and update the granule
            range = value['range']
            retval.identifiables[bounds_id].value_pair[0] = float(range[0])
            retval.identifiables[bounds_id].value_pair[1] = float(range[1])
            codec.add_hdf_dataset(vp, value['values'])
            record_count = len(value['values'])
            #----- DEBUGGING ---------
            log.debug('slice- row: %s', row)
            log.debug('slice- value_path: %s', vp)
            log.debug('slice- range_id: %s', range_id)
            log.debug('slice- bounds_id: %s', bounds_id)
            log.debug('slice- limits: %s', value['range'])
            #-------------------------

        retval.identifiables[self.element_count_id].value = record_count
        hdf_string = codec.encoder_close()
        self._patch_granule(retval, hdf_string)
        FileSystem.unlink(file_path)
        return retval
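
A hypothetical call site for the method above, assuming replay_process is an instance of the class this method belongs to and granule holds at least ten records:

# Hypothetical usage: craft a sub-granule containing only records 0 through 9
sliced_granule = replay_process._slice(granule, slice(0, 10))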
Example #6
    def test_encode_decode(self):
        """
        Encode some arrays
        """

        hdfencoder = HDFEncoder() # put array into the encoder
        hdfencoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
        # get the string out from encoder
        hdf_string = hdfencoder.encoder_close()

        # Compare the arrays
        hdfdecoder = HDFDecoder(hdf_string)  # put string in decoder...
        nparray = hdfdecoder.read_hdf_dataset(self.path_to_dataset) # get array out

        self.assertEqual(sha1(nparray.tostring()), sha1(self.known_array.tostring()) ) # works for arbitrarily shaped arrays
Example #7
    def test_decode_encode(self):
        """
        Try a decode-encode sequence and check that it yields the same string
        """

        # decode an existing hdf file and read out an array
        hdfdecoder = HDFDecoder(self.known_hdf_as_string) # put known string in decoder...
        nparray = hdfdecoder.read_hdf_dataset(self.path_to_dataset) # get array out

        # encode the array and get the binary string containing the encoded hdf file
        hdfencoder = HDFEncoder() # put the array in the encoder...
        hdfencoder.add_hdf_dataset(self.path_to_dataset, nparray)
        hdf_string = hdfencoder.encoder_close() # get string out

        # compare the two strings
        self.assertEqual(sha1(hdf_string),self.known_hdf_as_sha1)
Example #8
    def test_encode_with_filename_and_compare(self):
        """
        Create an encoder backed by a file, add a dataset/array, and compare the result to known values
        """
        testfilename = 'test_encode_with_filename_and_compare'

        hdfencoder = HDFEncoder(testfilename)
        hdfencoder.add_hdf_dataset(self.path_to_dataset, self.known_array)
        # get the string out from encoder
        hdf_string = hdfencoder.encoder_close()

        self.assertEqual(sha1(hdf_string),self.known_hdf_as_sha1)

        hdfdecoder = HDFDecoder(self.known_hdf_as_string)
        nparray = hdfdecoder.read_hdf_dataset(self.path_to_dataset)

        self.assertEqual(sha1(nparray.tostring()), sha1(self.known_array.tostring()) )
Example #9
    def add_two_datasets_read_compare(self, filename, dataset_name1, dataset_name2):
        array1 = numpy.ones((4,5))
        array2 = numpy.ones((2,3))

        # first create the file
        hdfencoder = HDFEncoder(filename)
        hdfencoder.add_hdf_dataset(dataset_name1, array1)

        hdfencoder.add_hdf_dataset(dataset_name2, array2)
        hdfstring = hdfencoder.encoder_close()

        hdfdecoder = HDFDecoder(hdfstring)
        # Read the first dataset
        array_decoded_1 =  hdfdecoder.read_hdf_dataset(dataset_name1)

        # Read the second dataset
        array_decoded_2 = hdfdecoder.read_hdf_dataset(dataset_name2)

        self.assertEqual(sha1(array1.tostring()), sha1(array_decoded_1.tostring()) )
        self.assertEqual(sha1(array2.tostring()), sha1(array_decoded_2.tostring()) )
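
add_two_datasets_read_compare is a helper rather than a test itself; a hypothetical test that exercises it might look like this (the file name and dataset paths are illustrative):

    def test_add_two_datasets_read_compare(self):
        # Hypothetical caller: a scratch file name plus two dataset paths
        self.add_two_datasets_read_compare('test_two_datasets',
                                           'group/dataset_one',
                                           'group/dataset_two')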
Example #10
    def close_stream_granule(self, timestamp=None):

        import numpy

        encoder = HDFEncoder()

        for coverage_info in self._coordinates.itervalues():

            records = coverage_info['records']  # Get the list of records for this coverage
            if not records:
                log.warn('Coverage name "%s" has no values!' %
                         coverage_info['id'])
                continue

            array = numpy.asarray(records)  # Turn the list into an array

            # Add the coverage
            self._granule.identifiables[
                coverage_info['id']] = coverage_info['obj']

            # Add the range
            range = [float(numpy.nanmin(array)), float(numpy.nanmax(array))]
            self._granule.identifiables[
                coverage_info['obj'].bounds_id] = QuantityRangeElement(
                    value_pair=range)

            # Add the data
            encoder.add_hdf_dataset(name=coverage_info['values_path'],
                                    nparray=array)

        for range_info in self._ranges.itervalues():

            records = range_info['records']  # Get the list of records for this range
            if not records:
                log.warn('Range name "%s" has no values!' % range_info['id'])
                continue

            array = numpy.asarray(records)  # Turn the list into an array

            # Add the coverage
            self._granule.identifiables[range_info['id']] = range_info['obj']

            # Add the range
            range = [float(numpy.nanmin(array)), float(numpy.nanmax(array))]
            self._granule.identifiables[
                range_info['obj'].bounds_id] = QuantityRangeElement(
                    value_pair=range)

            # Add the data
            encoder.add_hdf_dataset(name=range_info['values_path'],
                                    nparray=array)

        hdf_string = encoder.encoder_close()

        sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
        self._granule.identifiables[self._encoding_id] = Encoding(
            encoding_type=self._encoding.encoding_type,
            compression=None,
            sha1=sha1)

        tstamp = TimeElement(
            definition="http://www.opengis.net/def/property/OGC/0/SamplingTime",
            reference_frame="http://www.opengis.net/def/trs/OGC/0/GPS",
            reference_time='1970-01-01T00:00:00.000Z',
            value=timestamp or get_ion_ts())

        self._granule.identifiables[self._granule.data_stream_id] = DataStream(
            values=hdf_string, timestamp=tstamp)

        return self._granule
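
On the receiving side, the DataStream payload can be handed straight back to the decoder. A minimal sketch, assuming constructor is the object whose close_stream_granule is shown above and that 'fields/temperature' is one of its values_path entries (both names are assumptions):

# Hypothetical verification of the encoded payload
granule = constructor.close_stream_granule()
hdf_string = granule.identifiables[granule.data_stream_id].values
decoder = HDFDecoder(hdf_string)
temperature_array = decoder.read_hdf_dataset('fields/temperature')  # values_path is an assumption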
Example #11
    def subset(self, granule, coverages):
        '''
        @param granule
        @return dataset subset based on the fields
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        field_ids = self.field_ids
        element_count_id = self.element_count_id

        values_path = list()
        domain_ids = list()
        coverage_ids = list()
        coverages = list(coverages)
        log.debug('Coverages include %s of type %s', coverages,
                  type(coverages))
        #-----------------------------------------------------------------------------------------------------------
        # Iterate through the fields IAW the stream definition and check for rangesets and coordinate axes
        #  - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
        #  - If it's a rangeset, make sure it's part of what the client asked for; if not, discard it
        #-----------------------------------------------------------------------------------------------------------

        for field_id in field_ids:

            range_id = self.definition.identifiables[field_id].range_id

            #-------------------------------------------------------------------------------------
            # Coordinate Axis
            # - Keep track of this in our domains
            # - Add it to the paths we need to grab from the file(s)
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id],
                          CoordinateAxis):
                log.debug('got a domain: %s' % range_id)
                domain_ids.append(field_id)
                if granule.identifiables.has_key(range_id):
                    value_path = granule.identifiables[
                        range_id].values_path or self.definition.identifiables[
                            range_id].values_path
                    values_path.append(value_path)
                else:
                    value_path = self.definition.identifiables[
                        range_id].values_path
                    values_path.append(value_path)
                continue

            #-------------------------------------------------------------------------------------
            # Range Set
            # - If it's part of the coverages we want to keep
            #   - Add it to the list of ranges we're tracking
            #   - Add the value path to the paths we're tracking.
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id], RangeSet):
                # If it's a rangeset that is one of the specified coverages and the granule has it, add it to the list
                if field_id in coverages:
                    if granule.identifiables.has_key(range_id):
                        log.debug('got a range: %s' % range_id)
                        coverage_ids.append(field_id)
                        if granule.identifiables.has_key(range_id):
                            value_path = granule.identifiables[
                                range_id].values_path or self.definition.identifiables[
                                    range_id].values_path
                            values_path.append(value_path)
                        else:
                            value_path = self.definition.identifiables[
                                range_id].values_path
                            values_path.append(value_path)
                        continue

                # ----
                # We need to track the range and bounds because,
                # you guessed it, we need to update the bounds
                # ----

                range_id = self.definition.identifiables[field_id].range_id
                bounds_id = self.definition.identifiables[range_id].bounds_id

                #---
                # Lastly, if the field is there and we don't want it, we need to strip it
                #---

                if not (field_id in coverages):
                    log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                    log.debug('rebool: %s', bool(field_id in coverages))
                    if granule.identifiables.has_key(range_id):
                        log.debug('Removing %s from granule', range_id)
                        del granule.identifiables[range_id]
                    if granule.identifiables.has_key(bounds_id):
                        log.debug('Removing %s from granule', bounds_id)
                        del granule.identifiables[bounds_id]

        log.debug('Domains: %s', domain_ids)
        log.debug('Ranges: %s', coverage_ids)
        log.debug('Values_paths: %s', values_path)

        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        full_coverage = list(domain_ids + coverage_ids)

        log.debug('Full coverage: %s' % full_coverage)
        log.debug('Calling acquire_data with: %s, %s, %s', [file_path],
                  values_path, granule.identifiables[element_count_id].value)

        codec = HDFEncoder()

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        data = acquire_data([file_path], var_names, record_count).next()
        for row, value in data.iteritems():
            vp = self._find_vp(pairs, row)
            codec.add_hdf_dataset(vp, value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)

        FileSystem.unlink(file_path)

        return granule
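
A hypothetical invocation of the method above, asking for a single rangeset by its field id while the coordinate axes are kept automatically (the instance name and field id are illustrative):

# Hypothetical usage: keep only the 'temperature' rangeset plus all coordinate axes
subset_granule = replay_process.subset(granule, ['temperature'])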
Example #12
    def _merge(self, msgs):
        '''
        @brief Merges all the granules and datasets into one large dataset (Union)
        @param msgs raw granules from couch
        @return complete dataset
        @description
             n
        D := U [ msgs_i ]
            i=0
        '''
        granule = None
        file_list = list()
        count = len(msgs)
        used_vals = list()

        #-------------------------------------------------------------------------------------
        # Merge each granule to another granule one by one.
        # After each merge operation keep track of what files belong where on the timeline
        #-------------------------------------------------------------------------------------

        for i in xrange(count):
            if i == 0:
                granule = msgs[0]['granule']
                psc = PointSupplementConstructor(
                    point_definition=self.definition)

                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=None)
                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])

            else:
                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=msgs[i]['granule'])

                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])
                if file_pair[1] not in file_list and file_pair[1][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[1]))
                    used_vals.append(file_pair[1][0])

        if not granule:
            return
        log.debug('file_list: %s', file_list)
        #-------------------------------------------------------------------------------------
        # Order the list using Python's stable sort (by the first value in the tuples)
        # Then peel off just the file names
        # Then get the appropriate URL for the file using FileSystem
        #-------------------------------------------------------------------------------------
        file_list.sort()
        file_list = list(i[1] for i in file_list)
        file_list = list([
            FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i)
            for i in file_list
        ])

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        codec = HDFEncoder()
        log.debug('acquire_data:')
        log.debug('\tfile_list: %s', file_list)
        log.debug('\tfields: %s', var_names)
        log.debug('\trecords: %s', record_count)

        data = acquire_data(file_list, var_names, record_count).next()

        for row, value in data.iteritems():
            value_path = self._find_vp(pairs, row)
            codec.add_hdf_dataset(value_path, nparray=value['values'])
            #-------------------------------------------------------------------------------------
            # Debugging
            #-------------------------------------------------------------------------------------
            log.debug('row: %s', row)
            log.debug('value path: %s', value_path)
            log.debug('value: %s', value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)
        return granule
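
_merge expects the raw rows from couch wrapped as dictionaries with a 'granule' key, as the indexing above shows. A hypothetical call (raw_granules and the instance name are assumptions):

# Hypothetical usage: union a batch of granules fetched from the datastore
msgs = [{'granule': g} for g in raw_granules]
merged_granule = replay_process._merge(msgs)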
Example #13
import numpy, h5py

from prototype.hdf.hdf_codec import HDFEncoder, HDFDecoder

array1 = numpy.ones((4, 5))
array2 = numpy.ones((2, 3))
array3 = numpy.ones((10, 2))
dataset_name1 = 'rootgroup/mygroup/data/temperature'
dataset_name2 = 'rootgroup/mygroup/data/pressure'
dname = 'aGroup/adataset'

###########################################################

# Create an encoder object
hdfencoder = HDFEncoder()
# Add data as an array
hdfencoder.add_hdf_dataset(dataset_name1, array1)
hdfencoder.add_hdf_dataset(dataset_name2, array2)
# Convert all the data to a binary string for easy transportation
hdfstring1 = hdfencoder.encoder_close()

# Create another encoder. This time pass on name of hdf5 file to write
hdfencoder = HDFEncoder('/tmp/testHDFEncoder.hdf5')
hdfencoder.add_hdf_dataset(dataset_name1, array1)
hdfencoder.add_hdf_dataset(dataset_name2, array2)
# Convert all the data to a binary string for easy transportation
hdfstring2 = hdfencoder.encoder_close()

# Create another encoder. This time pass on name of hdf5 file to write
hdfencoder = HDFEncoder('/tmp/testHDFEncoder.hdf5')
hdfencoder.add_hdf_dataset(dname, array3)
# Convert all the data to a binary string for easy transportation
hdfstring3 = hdfencoder.encoder_close()
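
For completeness, a round trip back through the decoder reusing the first encoded string, following the same API shown in the test cases above (a minimal sketch):

# Decode the first binary string and read the arrays back out
hdfdecoder = HDFDecoder(hdfstring1)
decoded1 = hdfdecoder.read_hdf_dataset(dataset_name1)
decoded2 = hdfdecoder.read_hdf_dataset(dataset_name2)
print decoded1.shape, decoded2.shape   # expected: (4, 5) (2, 3)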