def test_concatenate_size(self):

        #--------------------------------------------------------------------------------------
        # Test with a concatenate size greater than the length of the virtual dataset
        #--------------------------------------------------------------------------------------

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=175)

        out = generator.next()

        # assert the result...
        truth1 = out['temperature']['values'] == self.t_result
        truth2 = out['salinity']['values'] == self.s_result
        truth3 = out['pressure']['values'] == self.p_result

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())
        self.assertTrue(truth3.all())

        with self.assertRaises(StopIteration):
            out = generator.next()

        #--------------------------------------------------------------------------------------------------------------------------
        # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) == 0
        #--------------------------------------------------------------------------------------------------------------------------

        concatenate_size = 14

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size)

        self.check_pieces_3_variables_2d(generator, self.slice_tuple,
                                         concatenate_size)

        #--------------------------------------------------------------------------------------------------------------------------
        # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) != 0
        #--------------------------------------------------------------------------------------------------------------------------

        bounds = (
            slice(2, 4), slice(2, 8)
        )  # on the x axis, choose indices 2..3; on the y axis, choose indices 2..7
        #@todo calculate the slice_tuple
        # slice_tuple =
        concatenate_size = 26

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size,
            bounds=bounds)

        # assert the result...
        self.check_pieces_3_variables_2d(generator, bounds, concatenate_size)
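
# Hedged aside (illustrative only, not part of the test class): the behaviour
# exercised above is that acquire_data yields ceil(length / concatenate_size)
# pieces, the last one possibly short. The helper below reproduces just that
# arithmetic so the three cases (size greater than the length, size dividing
# the length evenly, size not dividing it) can be sanity-checked without any
# HDF files. chunk_lengths and the numbers in the asserts are assumptions for
# illustration, not values from the dataset under test.
def chunk_lengths(length, concatenate_size):
    sizes = []
    start = 0
    while start < length:
        stop = min(start + concatenate_size, length)
        sizes.append(stop - start)
        start = stop
    return sizes

assert chunk_lengths(100, 175) == [100]            # concatenate_size > length: one piece
assert chunk_lengths(98, 14) == [14] * 7           # mod(length, concatenate_size) == 0
assert chunk_lengths(100, 26) == [26, 26, 26, 22]  # mod(length, concatenate_size) != 0
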
    def test_concatenate_size(self):

        #--------------------------------------------------------------------------------------
        # Test with a concatenate size greater than the length of the virtual dataset
        #--------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names = ['temperature', 'salinity', 'pressure'],
            concatenate_size = 175
        )

        out = generator.next()

        # assert the result...
        truth1 = out['temperature']['values'] == self.t_result
        truth2 = out['salinity']['values'] == self.s_result
        truth3 = out['pressure']['values'] == self.p_result

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())
        self.assertTrue(truth3.all())

        with self.assertRaises(StopIteration):
            out = generator.next()

        #--------------------------------------------------------------------------------------------------------------------------
        # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) == 0
        #--------------------------------------------------------------------------------------------------------------------------

        concatenate_size = 14

        generator = acquire_data(hdf_files = self.fnames,
            var_names = ['temperature', 'salinity', 'pressure'],
            concatenate_size = concatenate_size
        )

        self.check_pieces_3_variables_2d(generator, self.slice_tuple, concatenate_size)

        #--------------------------------------------------------------------------------------------------------------------------
        # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) != 0
        #--------------------------------------------------------------------------------------------------------------------------

        bounds = (slice(2,4), slice(2,8)) # on the x axis, choose indices 2..3; on the y axis, choose indices 2..7
        #@todo calculate the slice_tuple
        # slice_tuple =
        concatenate_size = 26

        generator = acquire_data(hdf_files = self.fnames,
            var_names = ['temperature', 'salinity', 'pressure'],
            concatenate_size = concatenate_size,
            bounds=bounds
        )

        # assert the result...
        self.check_pieces_3_variables_2d(generator, bounds, concatenate_size)
    def test_concatenate_size(self):

        #--------------------------------------------------------------------------------------
        # Test with a concatenate size greater than the length of the virtual dataset
        #--------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names = ['temperature', 'salinity', 'pressure'],
            concatenate_size = 175
        )

        out = generator.next()
        # assert the result...
        truth1 = out['temperature']['values'] == self.t_result
        truth2 = out['salinity']['values'] == self.s_result
        truth3 = out['pressure']['values'] == self.p_result

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())
        self.assertTrue(truth3.all())

        with self.assertRaises(StopIteration):
            out = generator.next()

        #--------------------------------------------------------------------------------------------------------------------------
        # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) == 0
        #--------------------------------------------------------------------------------------------------------------------------

        concatenate_size = 25

        generator = acquire_data(hdf_files = self.fnames,
            var_names = ['temperature', 'salinity', 'pressure'],
            concatenate_size = concatenate_size
        )

        self.check_pieces_3_variables_1d(generator, self.sl, concatenate_size)

        #--------------------------------------------------------------------------------------------------------------------------
        # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) != 0
        #--------------------------------------------------------------------------------------------------------------------------

        sl = slice(3,63)
        concatenate_size = 26

        generator = acquire_data(hdf_files = self.fnames,
            var_names = ['temperature', 'salinity', 'pressure'],
            concatenate_size = concatenate_size,
            bounds=(sl)
        )

        # assert the result...
        self.check_pieces_3_variables_1d(generator, sl, concatenate_size)
    def test_concatenate_size(self):

        #--------------------------------------------------------------------------------------
        # Test with a concatenate size greater than the length of the virtual dataset
        #--------------------------------------------------------------------------------------

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=175)

        out = generator.next()
        # assert the result...
        truth1 = out['temperature']['values'] == self.t_result
        truth2 = out['salinity']['values'] == self.s_result
        truth3 = out['pressure']['values'] == self.p_result

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())
        self.assertTrue(truth3.all())

        with self.assertRaises(StopIteration):
            out = generator.next()

        #--------------------------------------------------------------------------------------------------------------------------
        # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) == 0
        #--------------------------------------------------------------------------------------------------------------------------

        concatenate_size = 25

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size)

        self.check_pieces_3_variables_1d(generator, self.sl, concatenate_size)

        #--------------------------------------------------------------------------------------------------------------------------
        # Test with a concatenate size less than the length of the virtual dataset such that mod(length, concatenate_size) != 0
        #--------------------------------------------------------------------------------------------------------------------------

        sl = slice(3, 63)
        concatenate_size = 26

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size,
            bounds=(sl))

        # assert the result...
        self.check_pieces_3_variables_1d(generator, sl, concatenate_size)
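
    # Hypothetical sketch of what the check_pieces_3_variables_1d helper used
    # above might look like (the real helper is not shown in this listing): it
    # walks the generator piece by piece and compares each yielded chunk with
    # the matching slice of the expected vectors, assuming self.t_result,
    # self.s_result and self.p_result hold the full expected arrays.
    def check_pieces_3_variables_1d_sketch(self, generator, sl, concatenate_size):
        start = sl.start
        while start < sl.stop:
            stop = min(start + concatenate_size, sl.stop)
            out = generator.next()
            self.assertTrue((out['temperature']['values'] == self.t_result[start:stop]).all())
            self.assertTrue((out['salinity']['values'] == self.s_result[start:stop]).all())
            self.assertTrue((out['pressure']['values'] == self.p_result[start:stop]).all())
            start = stop
        with self.assertRaises(StopIteration):
            generator.next()
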
Example #6
    def _get_time_index(self, granule, timeval):
        '''
        @brief Obtains the index at which a time value is located
        @param granule must be a complete dataset (hdf_string provided)
        @param timeval the time value to look up
        @return Index of timeval, or the closest approximation such that timeval falls within the subset
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        assert granule.identifiables[
            self.data_stream_id].values, 'hdf_string is not provided.'

        hdf_string = granule.identifiables[self.data_stream_id].values
        file_path = self._get_hdf_from_string(hdf_string)

        #-------------------------------------------------------------------------------------
        # Determine the field_id for the temporal coordinate vector (aka time)
        #-------------------------------------------------------------------------------------

        time_field = self.definition.identifiables[
            self.time_id].coordinate_ids[0]
        value_path = granule.identifiables[
            time_field].values_path or self.definition.identifiables[
                time_field].values_path
        record_count = granule.identifiables[self.element_count_id].value

        #-------------------------------------------------------------------------------------
        # Go through the time vector and find the index that corresponds to timeval.
        # It finds the index i such that
        # time_vector[i] <= timeval < time_vector[i+1]
        #-------------------------------------------------------------------------------------

        var_name = value_path.split('/').pop()
        res = acquire_data([file_path], [var_name], record_count).next()
        time_vector = res[var_name]['values']
        retval = 0
        for i in xrange(len(time_vector)):
            if time_vector[i] == timeval:
                retval = i
                break
            elif i == 0 and time_vector[i] > timeval:
                retval = i
                break
            elif (i + 1) < len(time_vector):  # not last val
                if time_vector[i] < timeval and time_vector[i + 1] > timeval:
                    retval = i
                    break
            else:  # last val
                retval = i
                break
        FileSystem.unlink(file_path)
        return retval
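
# Hedged aside: the linear scan above amounts to a right-bisect into the time
# vector (assumed monotonically increasing), clipped to the valid index range.
# A minimal, self-contained numpy sketch of the same lookup under that
# monotonicity assumption; time_index is an illustrative name, not an API of
# the replay process.
import numpy as np

def time_index(time_vector, timeval):
    # index i such that time_vector[i] <= timeval < time_vector[i+1], clipped
    i = int(np.searchsorted(time_vector, timeval, side='right')) - 1
    return max(0, min(i, len(time_vector) - 1))

assert time_index(np.array([0, 10, 20, 30]), 15) == 1
assert time_index(np.array([0, 10, 20, 30]), -5) == 0
assert time_index(np.array([0, 10, 20, 30]), 99) == 3
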
Example #7
    def _slice(self, granule, slice_):
        '''
        @brief Creates a granule which is a slice of the granule parameter
        @param granule the superset
        @param slice_ The slice values for which to create the granule
        @return Crafted subset granule of the parameter granule.
        '''
        retval = copy.deepcopy(granule)
        fields = self._list_data(self.definition, granule)
        record_count = slice_.stop - slice_.start
        assert record_count > 0, 'slice is malformed'
        pairs = self._pair_up(granule)
        var_names = list([i[0]
                          for i in pairs])  # Get the var_names from the pairs
        log.debug('var_names: %s', var_names)
        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        codec = HDFEncoder()
        vectors = acquire_data([file_path], var_names, record_count,
                               slice_).next()

        for row, value in vectors.iteritems():
            vp = self._find_vp(pairs, row)
            # Determine the range_id reverse dictionary lookup
            #@todo: improve this pattern
            for field, path in fields.iteritems():
                if vp == path:
                    range_id = field
                    break
            bounds_id = retval.identifiables[range_id].bounds_id
            # Recalculate the bounds for this field and update the granule
            range = value['range']
            retval.identifiables[bounds_id].value_pair[0] = float(range[0])
            retval.identifiables[bounds_id].value_pair[1] = float(range[1])
            codec.add_hdf_dataset(vp, value['values'])
            record_count = len(value['values'])
            #----- DEBUGGING ---------
            log.debug('slice- row: %s', row)
            log.debug('slice- value_path: %s', vp)
            log.debug('slice- range_id: %s', range_id)
            log.debug('slice- bounds_id: %s', bounds_id)
            log.debug('slice- limits: %s', value['range'])
            #-------------------------

        retval.identifiables[self.element_count_id].value = record_count
        hdf_string = codec.encoder_close()
        self._patch_granule(retval, hdf_string)
        FileSystem.unlink(file_path)
        return retval
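
# Hedged aside on the bounds recalculation in the loop above: each field's
# value_pair is reset to the range reported by acquire_data for the sliced
# values, which appears to be the (min, max) of the values that were kept. A
# self-contained equivalent of that bookkeeping (recalculated_bounds is an
# illustrative name, not part of the replay process):
import numpy as np

def recalculated_bounds(values, slice_):
    piece = np.asarray(values)[slice_]
    return float(piece.min()), float(piece.max())

assert recalculated_bounds([4, 8, 15, 16, 23, 42], slice(1, 4)) == (8.0, 16.0)
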
Example #8
    def _get_time_index(self, granule, timeval):
        '''
        @brief Obtains the index at which a time value is located
        @param granule must be a complete dataset (hdf_string provided)
        @param timeval the time value to look up
        @return Index of timeval, or the closest approximation such that timeval falls within the subset
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        assert granule.identifiables[self.data_stream_id].values, 'hdf_string is not provided.'

        hdf_string = granule.identifiables[self.data_stream_id].values
        file_path = self._get_hdf_from_string(hdf_string)

        #-------------------------------------------------------------------------------------
        # Determine the field_id for the temporal coordinate vector (aka time)
        #-------------------------------------------------------------------------------------

        time_field = self.definition.identifiables[self.time_id].coordinate_ids[0]
        value_path = granule.identifiables[time_field].values_path or self.definition.identifiables[time_field].values_path
        record_count = granule.identifiables[self.element_count_id].value

        #-------------------------------------------------------------------------------------
        # Go through the time vector and find the index that corresponds to timeval.
        # It finds the index i such that
        # time_vector[i] <= timeval < time_vector[i+1]
        #-------------------------------------------------------------------------------------


        var_name = value_path.split('/').pop()
        res = acquire_data([file_path], [var_name], record_count).next()
        time_vector = res[var_name]['values']
        retval = 0
        for i in xrange(len(time_vector)):
            if time_vector[i] == timeval:
                retval = i
                break
            elif i==0 and time_vector[i] > timeval:
                retval = i
                break
            elif (i+1) < len(time_vector): # not last val
                if time_vector[i] < timeval and time_vector[i+1] > timeval:
                    retval = i
                    break
            else: # last val
                retval = i
                break
        FileSystem.unlink(file_path)
        return retval
Example #9
    def _slice(self,granule,slice_):
        '''
        @brief Creates a granule which is a slice of the granule parameter
        @param granule the superset
        @param slice_ The slice values for which to create the granule
        @return Crafted subset granule of the parameter granule.
        '''
        retval = copy.deepcopy(granule)
        fields = self._list_data(self.definition,granule)
        record_count = slice_.stop - slice_.start
        assert record_count > 0, 'slice is malformed'
        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs]) # Get the var_names from the pairs
        log.debug('var_names: %s',var_names)
        file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
        codec = HDFEncoder()
        vectors = acquire_data([file_path],var_names,record_count,slice_ ).next()

        for row, value in vectors.iteritems():
            vp = self._find_vp(pairs, row)
            # Determine the range_id reverse dictionary lookup
            #@todo: improve this pattern
            for field,path in fields.iteritems():
                if vp==path:
                    range_id = field
                    break
            bounds_id = retval.identifiables[range_id].bounds_id
            # Recalculate the bounds for this field and update the granule
            range = value['range']
            retval.identifiables[bounds_id].value_pair[0] = float(range[0])
            retval.identifiables[bounds_id].value_pair[1] = float(range[1])
            codec.add_hdf_dataset(vp, value['values'])
            record_count = len(value['values'])
            #----- DEBUGGING ---------
            log.debug('slice- row: %s', row)
            log.debug('slice- value_path: %s', vp)
            log.debug('slice- range_id: %s', range_id)
            log.debug('slice- bounds_id: %s', bounds_id)
            log.debug('slice- limits: %s', value['range'])
            #-------------------------


        retval.identifiables[self.element_count_id].value = record_count
        hdf_string = codec.encoder_close()
        self._patch_granule(retval, hdf_string)
        FileSystem.unlink(file_path)
        return retval
    def test_acquire_data_closes_files_when_exception(self, h5mock):
        self.assertRaises(TypeError, acquire_data(['anything'], [], [], []))
        h5mock.File.close.assert_called_once_with()
    def test_bounds(self):

        #---------------------------------------------------------------------------------------------------
        # Test with bad input not a slice and not a tuple...
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['temperature', 'salinity'],
            concatenate_size = 26,
            bounds = 's'
        )

        # Assert an error.

        with self.assertRaises(BadRequest):
            out = generator.next()

        #@todo can we make the error more transparent to the user - easier to correct their mistake?

        #---------------------------------------------------------------------------------------------------
        # Test with 2 tuple of slices on the 1d dataset
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['temperature', 'salinity'],
            concatenate_size = 26,
            bounds = (slice(63,120), slice(63,120))
        )

        # Assert an error.

        with self.assertRaises(BadRequest):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with bounds greater than the dataset length
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['temperature', 'salinity'],
            concatenate_size = 200,
            bounds = (slice(0,200))
        )

        out = generator.next()

        # Assert result is the whole dataset

        truth1 = out['temperature']['values'] == self.t_result
        truth2 = out['salinity']['values'] == self.s_result

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())

        # try to get the stop iteration by iterating again

        with self.assertRaises(StopIteration):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with normal bounds slice
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['temperature', 'salinity', 'pressure'],
            concatenate_size = 60,
            bounds = (slice(30,80))
        )
        out = generator.next()

        # assert result

        truth1 = out['temperature']['values'] == self.t_result[30:80]
        truth2 = out['salinity']['values'] == self.s_result[30:80]

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())

        with self.assertRaises(StopIteration):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with no bounds
        #---------------------------------------------------------------------------------------------------

        concatenate_size = 60

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['temperature', 'salinity', 'pressure'],
            concatenate_size = concatenate_size
        )
        # assert result

        self.check_pieces_3_variables_1d(generator, self.sl, concatenate_size)

        #---------------------------------------------------------------------------------------------------
        # Test with concatenate larger than bounds slice
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['temperature', 'salinity', 'pressure'],
            concatenate_size = 60,
            bounds = (slice(30,50))
        )
        out = generator.next()

        # assert result

        truth1 = out['temperature']['values'] == self.t_result[30:50]
        truth2 = out['salinity']['values'] == self.s_result[30:50]
        truth3 = out['pressure']['values'] == self.p_result[30:50]

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())
        self.assertTrue(truth3.all())

        with self.assertRaises(StopIteration):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with concatenate smaller than bounds slice
        #---------------------------------------------------------------------------------------------------

        sl = slice(30,100)
        concatenate_size = 10

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['temperature', 'salinity', 'pressure'],
            concatenate_size = concatenate_size,
            bounds = (sl)
        )

        # assert result

        self.check_pieces_3_variables_1d(generator, sl, concatenate_size)
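
# Hedged sketch of the bounds validation the test above exercises: acquire_data
# appears to reject anything that is not a slice or a tuple of slices, and a
# tuple with more slices than the dataset has dimensions. A standalone
# validator with the same shape of checks; validate_bounds and ValueError stand
# in for the real function and its BadRequest exception.
def validate_bounds(bounds, ndims):
    if isinstance(bounds, slice):
        bounds = (bounds,)
    if not (isinstance(bounds, tuple) and all(isinstance(b, slice) for b in bounds)):
        raise ValueError('bounds must be a slice or a tuple of slices')
    if len(bounds) > ndims:
        raise ValueError('more bounds than the dataset has dimensions')
    return bounds

assert validate_bounds(slice(30, 80), 1) == (slice(30, 80),)
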
    def test_var_names(self):

        # Test with no names
        # assert an error?

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  None,
            concatenate_size = 26,
            bounds = (slice(63,120))
        )

        with self.assertRaises(NotFound):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with all names
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['temperature', 'salinity', 'pressure'],
            concatenate_size = 26,
            bounds = (slice(63,120))
        )
        out = generator.next()

        # assert result

        truth1 = out['temperature']['values'] == self.t_result[63:89]
        truth2 = out['salinity']['values'] == self.s_result[63:89]
        truth3 = out['pressure']['values'] == self.p_result[63:89]

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())
        self.assertTrue(truth3.all())

        #---------------------------------------------------------------------------------------------------
        # Test with some names
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['temperature', 'salinity'],
            concatenate_size = 26,
            bounds = (slice(63,120))
        )
        out = generator.next()

        # assert result

        truth1 = out['temperature']['values'] == self.t_result[63:89]
        truth2 = out['salinity']['values'] == self.s_result[63:89]

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())

        self.assertTrue('pressure' not in out)


        #---------------------------------------------------------------------------------------------------
        # Test with name not in dataset
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files = self.fnames,
            var_names =  ['biological_quotient'],
            concatenate_size = 26,
            bounds = (slice(63,120))
        )

        # assert an error

        with self.assertRaises(NotFound):
            out = generator.next()
    def test_acquire_data_closes_files_when_exception(self, h5mock):
        self.assertRaises(TypeError, acquire_data(['anything'], [], [], []))
        h5mock.File.close.assert_called_once_with()
    def test_replay_integration(self):
        '''
        test_replay_integration
        '''
        import numpy as np
        # Keep the import; it's used in the vector comparison below even though PyCharm says it's unused.

        cc = self.container
        XP = self.XP
        assertions = self.assertTrue

        ### Everything below here can be run as a script:
        log.debug('Got it')

        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)

        datastore_name = 'dm_test_replay_integration'

        producer = Publisher(name=(XP,'stream producer'))

        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=XP,
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            hdf_storage=HdfStorage(),
            number_of_workers=1
        )

        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id
        )

        definition = SBE37_CDM_stream_definition()
        data_stream_id = definition.data_stream_id
        encoding_id = definition.identifiables[data_stream_id].encoding_id
        element_count_id = definition.identifiables[data_stream_id].element_count_id

        stream_def_id = pubsub_management_service.create_stream_definition(
            container=definition
        )
        stream_id = pubsub_management_service.create_stream(
            stream_definition_id=stream_def_id
        )

        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/dataset_by_id'
        )
        ingestion_management_service.create_dataset_configuration(
            dataset_id=dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id = ingestion_configuration_id
        )
        definition.stream_resource_id = stream_id

        packet = _create_packet(definition)
        input_file = FileSystem.mktemp()
        input_file.write(packet.identifiables[data_stream_id].values)
        input_file_path = input_file.name
        input_file.close()

        fields=[
            'conductivity',
            'height',
            'latitude',
            'longitude',
            'pressure',
            'temperature',
            'time'
        ]

        input_vectors = acquire_data([input_file_path],fields , 2).next()

        producer.publish(msg=packet, to_name=(XP,'%s.data' % stream_id))

        replay_id, replay_stream_id = data_retriever_service.define_replay(dataset_id)
        ar = gevent.event.AsyncResult()
        def sub_listen(msg, headers):

            assertions(isinstance(msg,StreamGranuleContainer),'replayed message is not a granule.')
            hdf_string = msg.identifiables[data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            assertions(sha1 == msg.identifiables[encoding_id].sha1,'Checksum failed.')
            assertions(msg.identifiables[element_count_id].value==1, 'record replay count is incorrect %d.' % msg.identifiables[element_count_id].value)
            output_file = FileSystem.mktemp()
            output_file.write(msg.identifiables[data_stream_id].values)
            output_file_path = output_file.name
            output_file.close()
            output_vectors = acquire_data([output_file_path],fields,2).next()
            for field in fields:
                comparison = (input_vectors[field]['values']==output_vectors[field]['values'])
                assertions(comparison.all(), 'vector mismatch: %s vs %s' %
                                             (input_vectors[field]['values'],output_vectors[field]['values']))
            FileSystem.unlink(output_file_path)
            ar.set(True)

        subscriber = Subscriber(name=(XP,'replay listener'),callback=sub_listen)

        g = gevent.Greenlet(subscriber.listen, binding='%s.data' % replay_stream_id)
        g.start()

        data_retriever_service.start_replay(replay_id)

        ar.get(timeout=10)

        FileSystem.unlink(input_file_path)
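
# Hedged aside: the sub_listen callback above verifies a replayed packet by
# re-hashing the hdf_string and comparing against the sha1 carried in the
# encoding identifiable. Reduced to its essentials (sha1_matches is an
# illustrative helper, not part of the service under test):
import hashlib

def sha1_matches(hdf_string, expected_sha1):
    return hashlib.sha1(hdf_string).hexdigest().upper() == expected_sha1.upper()

assert sha1_matches(b'payload', hashlib.sha1(b'payload').hexdigest())
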
    def test_bounds(self):

        #---------------------------------------------------------------------------------------------------
        # Test with 1 tuple of slices on the 1d dataset
        #---------------------------------------------------------------------------------------------------

        concatenate_size = 26

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size,
            bounds=(slice(2, 4)))

        # Assert that it interprets the only slice provided as pertaining to the vertical dimension

        out = generator.next()

        self.check_pieces_3_variables_2d(generator,
                                         (slice(2, 4), self.slice_tuple[1]),
                                         concatenate_size)

        #---------------------------------------------------------------------------------------------------
        # Test with bounds greater than the dataset length
        #---------------------------------------------------------------------------------------------------

        #        bounds = (slice(0,200), slice(0,200))
        #        concatenate_size = 20
        #
        #        generator = acquire_data(hdf_files = self.fnames,
        #            var_names =  ['temperature', 'salinity', 'pressure'],
        #            concatenate_size = concatenate_size,
        #            bounds = bounds
        #        )
        #
        #        out = generator.next()
        #
        #        # Assert result is the whole dataset

        #---------------------------------------------------------------------------------------------------
        # Test with normal bounds slice
        #---------------------------------------------------------------------------------------------------

        bounds = (slice(2, 3), slice(2, 5))
        concatenate_size = 60

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size,
            bounds=bounds)
        out = generator.next()

        # assert result

        self.check_pieces_3_variables_2d(generator,
                                         (slice(2, 4), self.slice_tuple[1]),
                                         concatenate_size)

        #---------------------------------------------------------------------------------------------------
        # Test with no bounds
        #---------------------------------------------------------------------------------------------------

        concatenate_size = 60

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size)
        # assert result

        self.check_pieces_3_variables_2d(generator, self.slice_tuple,
                                         concatenate_size)

        #---------------------------------------------------------------------------------------------------
        # Test with concatenate larger than bounds overall extent
        #---------------------------------------------------------------------------------------------------

        bounds = (slice(2, 5), slice(2, 10))
        concatenate_size = 200
        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size,
            bounds=bounds)
        out = generator.next()

        # assert result

        self.check_pieces_3_variables_2d(generator, self.slice_tuple,
                                         concatenate_size)
Example #17
    def _merge(self, msgs):
        '''
        @brief Merges all the granules and datasets into one large dataset (Union)
        @param msgs raw granules from couch
        @return complete dataset
        @description
             n
        D := U [ msgs_i ]
            i=0
        '''
        granule = None
        file_list = list()
        count = len(msgs)
        used_vals = list()

        #-------------------------------------------------------------------------------------
        # Merge each granule to another granule one by one.
        # After each merge operation keep track of what files belong where on the timeline
        #-------------------------------------------------------------------------------------

        for i in xrange(count):
            if i == 0:
                granule = msgs[0]['granule']
                psc = PointSupplementConstructor(
                    point_definition=self.definition)

                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=None)
                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])

            else:
                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=msgs[i]['granule'])

                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])
                if file_pair[1] not in file_list and file_pair[1][0] not in used_vals:
                    file_list.append(tuple(file_pair[1]))
                    used_vals.append(file_pair[1][0])

        if not granule:
            return
        log.debug('file_list: %s', file_list)
        #-------------------------------------------------------------------------------------
        # Order the list using a stable sort from Python (by the first value in the tuples)
        # Then peel off just the file names
        # Then get the appropriate URL for the file using FileSystem
        #-------------------------------------------------------------------------------------
        file_list.sort()
        file_list = list(i[1] for i in file_list)
        file_list = list([
            FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i)
            for i in file_list
        ])

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        codec = HDFEncoder()
        log.debug('acquire_data:')
        log.debug('\tfile_list: %s', file_list)
        log.debug('\tfields: %s', var_names)
        log.debug('\trecords: %s', record_count)

        data = acquire_data(file_list, var_names, record_count).next()

        for row, value in data.iteritems():
            value_path = self._find_vp(pairs, row)
            codec.add_hdf_dataset(value_path, nparray=value['values'])
            #-------------------------------------------------------------------------------------
            # Debugging
            #-------------------------------------------------------------------------------------
            log.debug('row: %s', row)
            log.debug('value path: %s', value_path)
            log.debug('value: %s', value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)
        return granule
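
# Hedged aside on the file bookkeeping above: each file_pair entry appears to
# be a (sort_value, file_name) tuple; the merge keeps the first file seen for
# each sort value, orders the survivors by that value, and then peels off just
# the names. A self-contained version of that dedup-and-sort step
# (order_files and the sample names are illustrative):
def order_files(file_pairs):
    seen, kept = set(), []
    for sort_val, fname in file_pairs:
        if sort_val not in seen:
            seen.add(sort_val)
            kept.append((sort_val, fname))
    kept.sort()
    return [fname for _, fname in kept]

assert order_files([(2, 'b.hdf5'), (1, 'a.hdf5'), (2, 'dup.hdf5')]) == ['a.hdf5', 'b.hdf5']
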
    def test_var_names(self):

        # Test with no names
        # assert an error?

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=None,
                                 concatenate_size=26,
                                 bounds=(slice(63, 120)))

        with self.assertRaises(NotFound):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with all names
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=26,
            bounds=(slice(63, 120)))
        out = generator.next()

        # assert result

        truth1 = out['temperature']['values'] == self.t_result[63:89]
        truth2 = out['salinity']['values'] == self.s_result[63:89]
        truth3 = out['pressure']['values'] == self.p_result[63:89]

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())
        self.assertTrue(truth3.all())

        #---------------------------------------------------------------------------------------------------
        # Test with some names
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=['temperature', 'salinity'],
                                 concatenate_size=26,
                                 bounds=(slice(63, 120)))
        out = generator.next()

        # assert result

        truth1 = out['temperature']['values'] == self.t_result[63:89]
        truth2 = out['salinity']['values'] == self.s_result[63:89]

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())

        self.assertTrue('pressure' not in out)

        #---------------------------------------------------------------------------------------------------
        # Test with name not in dataset
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=['biological_quotient'],
                                 concatenate_size=26,
                                 bounds=(slice(63, 120)))

        # assert an error

        with self.assertRaises(NotFound):
            out = generator.next()
    def test_replay_integration(self):
        '''
        test_replay_integration
        '''
        import numpy as np
        # Keep the import; it's used in the vector comparison below even though PyCharm says it's unused.

        cc = self.container
        XP = self.XP
        assertions = self.assertTrue

        ### Everything below here can be run as a script:
        log.debug('Got it')

        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(
            node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(
            node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)

        datastore_name = 'dm_test_replay_integration'

        producer = Publisher(name=(XP, 'stream producer'))

        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id=XP,
            couch_storage=CouchStorage(datastore_name=datastore_name,
                                       datastore_profile='SCIDATA'),
            hdf_storage=HdfStorage(),
            number_of_workers=1)

        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        definition = SBE37_CDM_stream_definition()
        data_stream_id = definition.data_stream_id
        encoding_id = definition.identifiables[data_stream_id].encoding_id
        element_count_id = definition.identifiables[
            data_stream_id].element_count_id

        stream_def_id = pubsub_management_service.create_stream_definition(
            container=definition)
        stream_id = pubsub_management_service.create_stream(
            stream_definition_id=stream_def_id)

        dataset_id = dataset_management_service.create_dataset(
            stream_id=stream_id,
            datastore_name=datastore_name,
            view_name='datasets/dataset_by_id')
        ingestion_management_service.create_dataset_configuration(
            dataset_id=dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id=ingestion_configuration_id)
        definition.stream_resource_id = stream_id

        packet = _create_packet(definition)
        input_file = FileSystem.mktemp()
        input_file.write(packet.identifiables[data_stream_id].values)
        input_file_path = input_file.name
        input_file.close()

        fields = [
            'conductivity', 'height', 'latitude', 'longitude', 'pressure',
            'temperature', 'time'
        ]

        input_vectors = acquire_data([input_file_path], fields, 2).next()

        producer.publish(msg=packet, to_name=(XP, '%s.data' % stream_id))

        replay_id, replay_stream_id = data_retriever_service.define_replay(
            dataset_id)
        ar = gevent.event.AsyncResult()

        def sub_listen(msg, headers):

            assertions(isinstance(msg, StreamGranuleContainer),
                       'replayed message is not a granule.')
            hdf_string = msg.identifiables[data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            assertions(sha1 == msg.identifiables[encoding_id].sha1,
                       'Checksum failed.')
            assertions(
                msg.identifiables[element_count_id].value == 1,
                'record replay count is incorrect %d.' %
                msg.identifiables[element_count_id].value)
            output_file = FileSystem.mktemp()
            output_file.write(msg.identifiables[data_stream_id].values)
            output_file_path = output_file.name
            output_file.close()
            output_vectors = acquire_data([output_file_path], fields, 2).next()
            for field in fields:
                comparison = (input_vectors[field]['values'] ==
                              output_vectors[field]['values'])
                assertions(
                    comparison.all(), 'vector mismatch: %s vs %s' %
                    (input_vectors[field]['values'],
                     output_vectors[field]['values']))
            FileSystem.unlink(output_file_path)
            ar.set(True)

        subscriber = Subscriber(name=(XP, 'replay listener'),
                                callback=sub_listen)

        g = gevent.Greenlet(subscriber.listen,
                            binding='%s.data' % replay_stream_id)
        g.start()

        data_retriever_service.start_replay(replay_id)

        ar.get(timeout=10)

        FileSystem.unlink(input_file_path)
Example #20
    def subset(self,granule,coverages):
        '''
        @param granule the granule to subset
        @param coverages the field coverages (range sets) to retain
        @return dataset subset based on the fields
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        field_ids = self.field_ids
        element_count_id = self.element_count_id


        values_path = list()
        domain_ids = list()
        coverage_ids = list()
        coverages = list(coverages)
        log.debug('Coverages include %s of type %s', coverages, type(coverages))
        #-----------------------------------------------------------------------------------------------------------
        # Iterate through the fields IAW the stream definition and check for range sets and coordinate axes
        #  - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
        #  - If it's a range set, make sure that it's part of what the client asked for; if not, discard it
        #-----------------------------------------------------------------------------------------------------------


        for field_id in field_ids:

            range_id = self.definition.identifiables[field_id].range_id

            #-------------------------------------------------------------------------------------
            # Coordinate Axis
            # - Keep track of this in our domains
            # - Add it to the paths we need to grab from the file(s)
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id], CoordinateAxis):
                log.debug('got a domain: %s' % range_id)
                domain_ids.append(field_id)
                if granule.identifiables.has_key(range_id):
                    value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                    values_path.append(value_path)
                else:
                    value_path = self.definition.identifiables[range_id].values_path
                    values_path.append(value_path)
                continue

            #-------------------------------------------------------------------------------------
            # Range Set
            # - If it's part of the coverages we want to keep
            #   - Add it to the list of ranges we're tracking
            #   - Add the value path to the paths we're tracking.
            #-------------------------------------------------------------------------------------


            if isinstance(self.definition.identifiables[range_id], RangeSet):
                # If it's a range set that is a specified coverage and the granule has it, add it to the list
                if  field_id in coverages:
                    if granule.identifiables.has_key(range_id):
                        log.debug('got a range: %s' % range_id)
                        coverage_ids.append(field_id)
                        if granule.identifiables.has_key(range_id):
                            value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                            values_path.append(value_path)
                        else:
                            value_path = self.definition.identifiables[range_id].values_path
                            values_path.append(value_path)
                        continue

                # ----
                # We need to track the range and bounds because,
                # you guessed it, we need to update the bounds
                # ----

                range_id = self.definition.identifiables[field_id].range_id
                bounds_id = self.definition.identifiables[range_id].bounds_id


                #---
                # Lastly, if the field is there and we don't want it, we need to strip it
                #---

                if not (field_id in coverages):
                    log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                    log.debug('rebool: %s', bool(field_id in coverages))
                    if granule.identifiables.has_key(range_id):
                        log.debug('Removing %s from granule', range_id)
                        del granule.identifiables[range_id]
                    if granule.identifiables.has_key(bounds_id):
                        log.debug('Removing %s from granule', bounds_id)
                        del granule.identifiables[bounds_id]

        log.debug('Domains: %s', domain_ids)
        log.debug('Ranges: %s', coverage_ids)
        log.debug('Values_paths: %s', values_path)

        file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
        full_coverage = list(domain_ids + coverage_ids)

        log.debug('Full coverage: %s' % full_coverage)
        log.debug('Calling acquire_data with: %s, %s, %s', [file_path],values_path,granule.identifiables[element_count_id].value)

        codec = HDFEncoder()

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        data = acquire_data([file_path], var_names, record_count).next()
        for row,value in data.iteritems():
            vp = self._find_vp(pairs, row)
            codec.add_hdf_dataset(vp, value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule,hdf_string)

        FileSystem.unlink(file_path)

        return granule
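
# Hedged sketch of the field-selection rule implemented above: coordinate axes
# are always kept because they define the domain, while range sets are kept
# only when the caller listed them in coverages. Illustrated on plain
# dictionaries (select_fields and the sample ids are illustrative, not the
# service API):
def select_fields(field_kinds, coverages):
    # field_kinds maps field_id -> 'axis' or 'range'
    keep = []
    for field_id, kind in field_kinds.items():
        if kind == 'axis' or field_id in coverages:
            keep.append(field_id)
    return keep

assert sorted(select_fields({'time': 'axis', 'temp': 'range', 'salinity': 'range'},
                            ['temp'])) == ['temp', 'time']
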
    def test_bounds(self):

        #---------------------------------------------------------------------------------------------------
        # Test with bad input not a slice and not a tuple...
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=['temperature', 'salinity'],
                                 concatenate_size=26,
                                 bounds='s')

        # Assert an error.

        with self.assertRaises(BadRequest):
            out = generator.next()

        #@todo can we make the error more transparent to the user - easier to correct their mistake?

        #---------------------------------------------------------------------------------------------------
        # Test with 2 tuple of slices on the 1d dataset
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=['temperature', 'salinity'],
                                 concatenate_size=26,
                                 bounds=(slice(63, 120), slice(63, 120)))

        # Assert an error.

        with self.assertRaises(BadRequest):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with bounds greater than the dataset length
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=['temperature', 'salinity'],
                                 concatenate_size=200,
                                 bounds=(slice(0, 200)))

        out = generator.next()

        # Assert result is the whole dataset

        truth1 = out['temperature']['values'] == self.t_result
        truth2 = out['salinity']['values'] == self.s_result

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())

        # try to get the stop iteration by iterating again

        with self.assertRaises(StopIteration):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with normal bounds slice
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=60,
            bounds=(slice(30, 80)))
        out = generator.next()

        # assert result

        truth1 = out['temperature']['values'] == self.t_result[30:80]
        truth2 = out['salinity']['values'] == self.s_result[30:80]

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())

        with self.assertRaises(StopIteration):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with no bounds
        #---------------------------------------------------------------------------------------------------

        concatenate_size = 60

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size)
        # assert result

        self.check_pieces_3_variables_1d(generator, self.sl, concatenate_size)

        #---------------------------------------------------------------------------------------------------
        # Test with concatenate_size larger than the bounds slice
        #---------------------------------------------------------------------------------------------------

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=60,
            bounds=(slice(30, 50)))
        out = generator.next()

        # assert result

        truth1 = out['temperature']['values'] == self.t_result[30:50]
        truth2 = out['salinity']['values'] == self.s_result[30:50]
        truth3 = out['pressure']['values'] == self.p_result[30:50]

        self.assertTrue(truth1.all())
        self.assertTrue(truth2.all())
        self.assertTrue(truth3.all())

        with self.assertRaises(StopIteration):
            out = generator.next()

        #---------------------------------------------------------------------------------------------------
        # Test with concatenate_size smaller than the bounds slice
        #---------------------------------------------------------------------------------------------------

        sl = slice(30, 100)
        concatenate_size = 10

        generator = acquire_data(
            hdf_files=self.fnames,
            var_names=['temperature', 'salinity', 'pressure'],
            concatenate_size=concatenate_size,
            bounds=(sl))

        # assert result

        self.check_pieces_3_variables_1d(generator, sl, concatenate_size)
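
        #---------------------------------------------------------------------------------------------------
        # A minimal usage sketch (not part of the test above) of draining the acquire_data generator
        # when bounds is a single slice; the file name and variable list here are placeholders, not
        # fixtures from this test class.
        #
        #     generator = acquire_data(hdf_files=['sample.hdf5'],   # hypothetical file
        #                              var_names=['temperature'],
        #                              concatenate_size=10,
        #                              bounds=slice(30, 100))
        #     for chunk in generator:                               # at most 10 records per chunk
        #         values = chunk['temperature']['values']
        #---------------------------------------------------------------------------------------------------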
Example #22
    def subset(self, granule, coverages):
        '''
        @param granule
        @return dataset subset based on the fields
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        field_ids = self.field_ids
        element_count_id = self.element_count_id

        values_path = list()
        domain_ids = list()
        coverage_ids = list()
        coverages = list(coverages)
        log.debug('Coverages include %s of type %s', coverages,
                  type(coverages))
        #-----------------------------------------------------------------------------------------------------------
        # Iterate through the fields IAW the stream definition and check for rangesets and coordinate axes
        #  - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
        #  - If it's a rangeset, make sure that it's part of what the client asked for; if not, discard it
        #-----------------------------------------------------------------------------------------------------------

        for field_id in field_ids:

            range_id = self.definition.identifiables[field_id].range_id

            #-------------------------------------------------------------------------------------
            # Coordinate Axis
            # - Keep track of this in our domains
            # - Add it to the paths we need to grab from the file(s)
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id],
                          CoordinateAxis):
                log.debug('got a domain: %s' % range_id)
                domain_ids.append(field_id)
                if granule.identifiables.has_key(range_id):
                    value_path = granule.identifiables[
                        range_id].values_path or self.definition.identifiables[
                            range_id].values_path
                    values_path.append(value_path)
                else:
                    value_path = self.definition.identifiables[
                        range_id].values_path
                    values_path.append(value_path)
                continue

            #-------------------------------------------------------------------------------------
            # Range Set
            # - If it's part of the coverages we want to keep
            #   - Add it to the list of ranges we're tracking
            #   - Add the value path to the paths we're tracking.
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id], RangeSet):
                # If it's a rangeset that the client requested as a coverage and the granule has it, add it to the list
                if field_id in coverages:
                    if granule.identifiables.has_key(range_id):
                        log.debug('got a range: %s' % range_id)
                        coverage_ids.append(field_id)
                        if granule.identifiables.has_key(range_id):
                            value_path = granule.identifiables[
                                range_id].values_path or self.definition.identifiables[
                                    range_id].values_path
                            values_path.append(value_path)
                        else:
                            value_path = self.definition.identifiables[
                                range_id].values_path
                            values_path.append(value_path)
                        continue

                # ----
                # We need to track the range and bounds because,
                # you guessed it, we need to update the bounds
                # ----

                range_id = self.definition.identifiables[field_id].range_id
                bounds_id = self.definition.identifiables[range_id].bounds_id

                #---
                # Lastly, if the field is there and we don't want it, we need to strip it
                #---

                if not (field_id in coverages):
                    log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                    log.debug('rebool: %s', bool(field_id in coverages))
                    if granule.identifiables.has_key(range_id):
                        log.debug('Removing %s from granule', range_id)
                        del granule.identifiables[range_id]
                    if granule.identifiables.has_key(bounds_id):
                        log.debug('Removing %s from granule', bounds_id)
                        del granule.identifiables[bounds_id]

        log.debug('Domains: %s', domain_ids)
        log.debug('Ranges: %s', coverage_ids)
        log.debug('Values_paths: %s', values_path)

        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        full_coverage = list(domain_ids + coverage_ids)

        log.debug('Full coverage: %s' % full_coverage)
        log.debug('Calling acquire_data with: %s, %s, %s', [file_path],
                  values_path, granule.identifiables[element_count_id].value)

        codec = HDFEncoder()

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        data = acquire_data([file_path], var_names, record_count).next()
        for row, value in data.iteritems():
            vp = self._find_vp(pairs, row)
            codec.add_hdf_dataset(vp, value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)

        FileSystem.unlink(file_path)

        return granule
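
    #-----------------------------------------------------------------------------------------------------------
    # A minimal usage sketch (hypothetical names, not from this module): for an instance `replay` that
    # provides subset(), trimming a granule down to one requested coverage might look like:
    #
    #     trimmed = replay.subset(granule, coverages=['temperature'])
    #
    # The trimmed granule keeps every coordinate axis (the domain) plus the 'temperature' rangeset;
    # every other rangeset and its bounds are removed from trimmed.identifiables.
    #-----------------------------------------------------------------------------------------------------------
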
    def test_bounds(self):


        #---------------------------------------------------------------------------------------------------
        # Test with a single slice as bounds on the 2d dataset
        #---------------------------------------------------------------------------------------------------

        concatenate_size = 26

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=['temperature', 'salinity', 'pressure'],
                                 concatenate_size=concatenate_size,
                                 bounds=slice(2, 4))

        # Assert that it interprets the only slice provided as pertaining to the vertical dimension


        out = generator.next()


        self.check_pieces_3_variables_2d(generator, (slice(2,4), self.slice_tuple[1]), concatenate_size)
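
        # A hedged reading of the check above: when only one slice is supplied for a 2d dataset, it is
        # applied to the first (vertical) axis and the second axis keeps its full extent, i.e. the
        # effective bounds are roughly:
        #
        #     effective_bounds = (slice(2, 4), self.slice_tuple[1])   # second slice assumed to span the full y axis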


        #---------------------------------------------------------------------------------------------------
        # Test with bounds greater than the dataset length
        #---------------------------------------------------------------------------------------------------

#        bounds = (slice(0,200), slice(0,200))
#        concatenate_size = 20
#
#        generator = acquire_data(hdf_files = self.fnames,
#            var_names =  ['temperature', 'salinity', 'pressure'],
#            concatenate_size = concatenate_size,
#            bounds = bounds
#        )
#
#        out = generator.next()
#
#        # Assert result is the whole dataset

        #---------------------------------------------------------------------------------------------------
        # Test with normal bounds slice
        #---------------------------------------------------------------------------------------------------

        bounds = (slice(2, 3), slice(2, 5))
        concatenate_size = 60

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=['temperature', 'salinity', 'pressure'],
                                 concatenate_size=concatenate_size,
                                 bounds=bounds)
        out = generator.next()

        # assert result

        self.check_pieces_3_variables_2d(generator, (slice(2,4), self.slice_tuple[1]), concatenate_size)

        #---------------------------------------------------------------------------------------------------
        # Test with no bounds
        #---------------------------------------------------------------------------------------------------

        concatenate_size = 60

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=['temperature', 'salinity', 'pressure'],
                                 concatenate_size=concatenate_size)
        # assert result

        self.check_pieces_3_variables_2d(generator, self.slice_tuple, concatenate_size)

        #---------------------------------------------------------------------------------------------------
        # Test with concatenate_size larger than the overall extent of the bounds
        #---------------------------------------------------------------------------------------------------

        bounds = (slice(2, 5), slice(2, 10))
        concatenate_size = 200

        generator = acquire_data(hdf_files=self.fnames,
                                 var_names=['temperature', 'salinity', 'pressure'],
                                 concatenate_size=concatenate_size,
                                 bounds=bounds)
        out = generator.next()

        # assert result

        self.check_pieces_3_variables_2d(generator, self.slice_tuple, concatenate_size)
Example #24
    def _merge(self, msgs):
        '''
        @brief Merges all the granules and datasets into one large dataset (Union)
        @param msgs raw granules from couch
        @return complete dataset
        @description D := U_{i=0}^{n} msgs_i  (the union of all granules in msgs)
        '''
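        # Concretely (illustrative only): with msgs = [m0, m1, m2], the loop below first normalizes m0
        # via merge_granule(granule1=m0, granule2=None), then folds in the rest one at a time,
        # i.e. ((m0 U m1) U m2), accumulating the running result in `granule`.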
        granule = None
        file_list = list()
        count = len(msgs)
        used_vals = list()

        #-------------------------------------------------------------------------------------
        # Merge each granule to another granule one by one.
        # After each merge operation keep track of what files belong where on the timeline
        #-------------------------------------------------------------------------------------


        for i in xrange(count):
            if i==0:
                granule = msgs[0]['granule']
                psc = PointSupplementConstructor(point_definition=self.definition)

                res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=None)
                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append( tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])


            else:
                res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=msgs[i]['granule'])

                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append( tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])
                if file_pair[1] not in file_list and file_pair[1][0] not in used_vals:
                    file_list.append(tuple(file_pair[1]))
                    used_vals.append(file_pair[1][0])

        if not granule:
            return
        log.debug('file_list: %s', file_list)
        #-------------------------------------------------------------------------------------
        # Order the list using Python's stable sort (by the first value in the tuples),
        # then peel off just the file names,
        # then get the appropriate URL for each file using FileSystem
        #-------------------------------------------------------------------------------------
        file_list.sort()
        file_list = list(i[1] for i in file_list)
        file_list = list([FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i) for i in file_list])
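        # For illustration (values made up): [(2, 'b.hdf5'), (0, 'a.hdf5')] sorts to
        # [(0, 'a.hdf5'), (2, 'b.hdf5')], peels down to ['a.hdf5', 'b.hdf5'], and each name is then
        # mapped to its cache URL via FileSystem.get_hierarchical_url(FS.CACHE, name).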

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        codec = HDFEncoder()
        log.debug('acquire_data:')
        log.debug('\tfile_list: %s', file_list)
        log.debug('\tfields: %s', var_names)
        log.debug('\trecords: %s', record_count)

        data = acquire_data(file_list, var_names, record_count).next()

        for row,value in data.iteritems():
            value_path = self._find_vp(pairs,row)
            codec.add_hdf_dataset(value_path,nparray=value['values'])
            #-------------------------------------------------------------------------------------
            # Debugging
            #-------------------------------------------------------------------------------------
            log.debug('row: %s', row)
            log.debug('value path: %s', value_path)
            log.debug('value: %s', value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule,hdf_string)
        return granule
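
    #-------------------------------------------------------------------------------------
    # A minimal usage sketch (hypothetical, not from this module): _merge expects the raw couch
    # results as a list of dicts, each carrying a granule under the 'granule' key.
    #
    #     msgs = [{'granule': g} for g in granules_from_couch]   # granules_from_couch is a placeholder
    #     merged = self._merge(msgs)                             # returns None when msgs is empty
    #-------------------------------------------------------------------------------------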